Implement IndentationScanner - mymarkdown

commit 79ad40b2e85365c8e5edd8bc5b402c82ea112a72
parent 94b5af2e6db938f724637a23e0ed1d81f2294741
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Tue, 20 May 2025 17:00:57 +0800

Implement IndentationScanner

Diffstat:
A src/IndentationScanner.zig  | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/padded_str.zig  | 3 +++
A src/padded_str_impl.zig  | 253 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/root.zig  | 1 +
M src/test/test.zig  | 92 ++++++++++++++++++++++++++++++++++++++++----------------------------------------

5 files changed, 854 insertions(+), 46 deletions(-)
diff --git a/src/IndentationScanner.zig b/src/IndentationScanner.zig
@@ -0,0 +1,551 @@
+//! # Indentation iterator for mymarkdown
+//!
+//! This file focuses on parsing just the 2d structure of mymarkdown.
+//!
+//! An IndentationIterator maintains the following state:
+//! - a cursor indicating what the current line is
+//! - a stack of expected indentations
+//!
+//! It supports the following operations:
+//! - `it.peek() : ?struct { IndentAlignment, PaddedSlice }`
+//! -- no matter what, peek does not mutate state.
+//! -- peek returns .line if the current indentation is at least to the expected indentation,
+//! -- else .misaligned_line, if the current intendation is greater than the parent's expected indentation,
+//! -- else .end_of_block, signalling the end of the current indented section.
+//! - the remaining functions always return `void` and always mutate state.
+//! - `it.advance() : void`
+//! -- advance updates the cursor to the next line
+//! - `it.indent(options: struct { skip: usize }) : void`
+//! -- indent pushes a new entry onto the stack of expected indentations.
+//! -- first, indent treats the first `skip` characters after the expected indentation as spaces.
+//! -- then, it scans forward for the next non-whitespace character.
+//! --- if no such character is found, the indentation level is treated as if it were "infinity"
+//!     i.e. no indented block exists.
+//! --- otherwise, the whitespace before the non-whitespace character found is added to the expected indentation.
+//! - `it.dedent() : void`
+//! -- pops from the stack of expected indentations.
+//!
+//! ## Example API trace (happy case)
+//!
+//! Input:
+//!
+//! ==
+//! > line 1
+//!   > line 2
+//!     > line 3
+//!  line 4
+//!    line 5
+//! line 6
+//! ==
+//!
+//! - .peek() -> .{ .correct, "> line 1" }
+//! - .peek() -> .{ .correct, "> line 1" }
+//! + (peek does not mutate state)
+//! - .indent(1)
+//! -- .peek() -> .{ .correct, "line 1" }
+//! ++ (indent treats the hyphen as a space, thus removing it from
+//!     the current line.)
+//! -- .advance()
+//! -- .peek() -> .{ .correct, "> line 2" }
+//! -- .indent(1)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> .{ .correct, "> line 3" }
+//! --- .indent(1)
+//! ---- .peek() -> .{ .correct, "line 3" }
+//! ---- .advance()
+//! ++++ (the cursor is now on line 4, which has 1 space of indentation,
+//!       less than the expected 6 spaces, and less than the 4 spaces
+//!       the parent expects.)
+//! ---- .peek() -> null
+//! --- .dedent()
+//! --- .peek() -> null
+//! -- .dedent()
+//! ++ (line 4 has only 1 space of indentation, but it is still indented
+//!     relative to the parent. so it is considered part of the current
+//!     indented block, just that it is misaligned.)
+//! -- .peek() -> .{ .misaligned, "line 4" }
+//! -- .advance()
+//! -- .peek() -> .{ .correct, " line 5" }
+//! ++ (line 5 matches 2 spaces of indent. the excess spaces are simply
+//!     included in the line.)
+//! -- .advance()
+//! -- .peek() -> null
+//! ++ (expected 2 spaces, parent expects 0 spaces, line 6 has 0 spaces,
+//!     matching the parent.)
+//! - .dedent()
+//! - .peek() -> .{ .correct, "line 6" }
+//! - .advance()
+//! - .peek() -> null
+//!
+//! ## Tabs
+//!
+//! Tabs are treated as non-whitespace because it is too hard to give
+//! intuitive semantics to them. See git tag `tab-rant` to see a version
+//! of this doc comment that tried to do so.
+//!
+//! ## What should happen for misaligned indented blocks
+//!
+//! ==
+//! >   line 1
+//!   > line 2
+//! ==
+//!
+//! - .peek() -> .{ .correct, ">   line 1" }
+//! - .indent(1)
+//! -- .peek() -> .{ .correct, "line 1" }
+//! ++ (the indentation is now considered to be 4 spaces)
+//! -- .advance()
+//! -- .peek() -> .{ .misaligned, "> line 2" }
+//! ++ (line 2 is misaligned as it only has 2 spaces)
+//! -- .indent(1)
+//! ++ (**what should happen now?**)
+//!
+//! ### Does not work: Simply push the new alignment on the stack
+//!
+//! The alignment of "line 2" is 4 spaces, so if we simply push it
+//! onto the indentation stack, we have:
+//!
+//! - indentation_stack = .{ 0, 4, 4 }
+//!
+//! The parent indentation is the same as our indentation! That means
+//! "error recovery" is impossible. A 3 space indent will be treated
+//! as misaligned to the parent, and null will be emitted.
+//! After the dedent, this 3 space indent is still considered misaligned,
+//! but it's misaligned to the parent.
+//!
+//! ==
+//! >   line 1
+//!   > line 2
+//!    line 3
+//! ==
+//!
+//! -- .indent(1)
+//! ++ (resuming from the first example)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> null
+//! -- .dedent()
+//! -- .peek() -> .{ .misaligned, "line 3" }
+//!
+//! This is also an odd case. Without line 1, line 3 would be considered
+//! misaligned to line 2, however because it matches up with the parent,
+//! it actually triggers an null, followed by .line after .dedent().
+//!
+//! ==
+//! >   line 1
+//!   >  line 2
+//!     line 3
+//! ==
+//!
+//! -- .indent(1)
+//! ++ (resuming from the first example)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> null
+//! -- .dedent()
+//! -- .peek() -> .{ .correct, "line 3" }
+//!
+//! ### What we actually do: Push two elements for each added indentation level
+//!
+//! The idea here is that each indentation level will have its own pair of
+//! "misaligned" and "actual" alignments.
+//!
+//! The indentation stack will start with .{ 0, 0 }, indicating that
+//! the root alignment is at 0 spaces and has no "misalignment zone".
+//! Each .indent(...) pushes 2 values onto the stack.
+//!
+//! Going back to this example, after the first .indent(1), the indentation
+//! stack will contain .{ 0, 0, 0, 4 }. After the second .indent(1),
+//! the indentation stack will contain .{ 0, 0, 0, 4, 2, 4 },
+//! indicating that there are 2 spaces worth of "misalignment" possible.
+//!
+//! ==
+//! >   line 1
+//!   > line 2
+//!    line 3
+//! ==
+
+// ==================================
+// Fields Fields Fields Fields Fields
+
+/// Stack of expected indentations.
+/// The current indentation level is equal to the number of entries.
+_expected_indents: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS),
+
+/// The line that would be returned by peek (if indentation matches).
+/// We always pre-scan for indentation when assigning to this field.
+/// `null` indicates EOF.
+_cur_line: ?struct {
+    IndentLevel,
+    NumSpaces,
+    PaddedSlice,
+},
+
+/// The remaining lines of the input.
+_rest_lines: PaddedSlice,
+
+// =======================================
+// Methods Methods Methods Methods Methods
+
+/// Create an IndentationScanner that tokenizes the given input.
+pub fn init(input_: []const u8) error{InputNotPadded}!IndentationScanner {
+    const input: PaddedSlice = try .init(input_);
+    const first_line, const rest_lines = input.splitOneLine();
+    return .{
+        ._expected_indents = while (true) {
+            var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{};
+            out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 });
+            break out;
+        },
+        ._cur_line = if (first_line) |line| .{ .root, 0, line } else null,
+        ._rest_lines = rest_lines,
+    };
+}
+
+/// Peek at the current line's contents, with indentation stripped.
+/// If indentation matches the parent or less, null is emitted.
+pub fn peek(it: IndentationScanner) PeekResult {
+    const level, const spaces, const content = it._cur_line orelse return null;
+    if (level.compare(.lt, it.curLevel())) return null;
+    assert(level.compare(.eq, it.curLevel()));
+    if (spaces >= it.curExpectedIndents().correct_start)
+        return .{ .correct, content }
+    else
+        return .{ .misaligned, content };
+}
+
+/// Advances the cursor to the next line.
+/// Precondition: peek() would not have returned null.
+pub fn advance(it: *IndentationScanner) void {
+    assert(it.peek() != null);
+    it._cur_line = null;
+    const line_maybe, it._rest_lines = it._rest_lines.splitOneLine();
+    const line = line_maybe orelse return;
+    const spaces = line.indexOfNotSpace();
+    it._cur_line = for (0..it._expected_indents.len) |rev_i| {
+        const i = it._expected_indents.len - 1 - rev_i;
+        const misalignment_start = it._expected_indents.buffer[i].misalignment_start;
+        const correct_start = it._expected_indents.buffer[i].correct_start;
+        if (spaces >= correct_start) {
+            break .{ IndentLevel.from(i) orelse unreachable, correct_start, line.sliceOpen(correct_start) };
+        } else if (spaces >= misalignment_start) {
+            break .{ IndentLevel.from(i) orelse unreachable, spaces, line.sliceOpen(spaces) };
+        }
+    } else unreachable;
+}
+
+/// Treats the next `skip` characters as whitespace,
+/// finds the correct level of indentation,
+/// then pushes it onto the indentation stack.
+/// Each `.indent(...)` call should be paired with a `.dedent()` call.
+/// Precondition: peek() would not have returned null.
+pub fn indent(
+    it: *IndentationScanner,
+    opts: struct { skip: usize },
+) error{TooMuchIndentation}!void {
+    const level, const spaces, const content = it._cur_line orelse unreachable;
+    assert(level.compare(.eq, it.curLevel()));
+    const additional_indent = opts.skip + content.sliceOpen(opts.skip).indexOfNotSpace();
+    if (additional_indent == content.len) {
+        it._expected_indents.append(
+            .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES },
+        ) catch return error.TooMuchIndentation;
+        it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) };
+        it.advance();
+    } else {
+        it._expected_indents.append(
+            .{ .misalignment_start = spaces + 1, .correct_start = spaces + additional_indent },
+        ) catch return error.TooMuchIndentation;
+        it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) };
+    }
+}
+
+/// Pops from the indentation stack.
+/// Each `.indent(...)` call should be paired with a `.dedent()` call.
+/// Precondition: peek would return null, i.e. cursor is at EOF or
+///               current indentation level is less than expected.
+pub fn dedent(it: *IndentationScanner) void {
+    assert(it.peek() == null);
+    _ = it._expected_indents.pop();
+}
+
+// =======================================
+// Helpers Helpers Helpers Helpers Helpers
+
+fn curLevel(it: IndentationScanner) IndentLevel {
+    return IndentLevel.from(it._expected_indents.len - 1).?;
+}
+
+fn curExpectedIndents(it: IndentationScanner) ExpectedIndent {
+    return it._expected_indents.buffer[it._expected_indents.len - 1];
+}
+
+// =======================================
+// Imports Imports Imports Imports Imports
+//   Types Types Types Types Types Types
+
+const std = @import("std");
+const maxInt = std.math.maxInt;
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+
+const str = @import("str.zig");
+const padded_str = @import("padded_str.zig");
+
+const IndentationScanner = @This();
+
+const PaddedSlice = @import("padded_str.zig").PaddedSlice;
+const PaddedMany = @import("padded_str.zig").PaddedMany;
+
+const NumSpaces = usize;
+const MAX_SPACES = maxInt(NumSpaces);
+const ExpectedIndent = struct {
+    /// Indentation between misalignment_start..correct_start is considered misaligned
+    misalignment_start: NumSpaces,
+    /// Indentation between correct_start..infinity is considered correct
+    correct_start: NumSpaces,
+};
+
+const IndentAlignment = enum { correct, misaligned };
+const IndentLevel = enum(u8) {
+    root = 0,
+    max = 254,
+    infinity = 255,
+    _,
+
+    const NUM_INDENT_LEVELS = 255;
+
+    fn from(i: usize) ?IndentLevel {
+        if (i > 254) return null;
+        return @enumFromInt(i);
+    }
+    fn pred(a: IndentLevel) IndentLevel {
+        return @enumFromInt(@intFromEnum(a) - 1);
+    }
+    fn succ(a: IndentLevel) ?IndentLevel {
+        if (a.compare(.eq, .max)) return null;
+        return @enumFromInt(@intFromEnum(a) + 1);
+    }
+    fn compare(a: IndentLevel, op: std.math.CompareOperator, b: IndentLevel) bool {
+        return std.math.compare(@intFromEnum(a), op, @intFromEnum(b));
+    }
+};
+
+pub const PeekResult = ?struct { IndentAlignment, PaddedSlice };
+
+// =======================================
+// Testing Testing Testing Testing Testing
+const testing = std.testing;
+inline fn expectEqualDeep(comptime T: type, expected: T, actual: T) !void {
+    try testing.expectEqualDeep(expected, actual);
+}
+inline fn expectEqualPeekResults(expected: PeekResult, actual: PeekResult) !void {
+    const UnpaddedPeekResult = ?struct { IndentAlignment, [:'\n']const u8 };
+    try expectEqualDeep(
+        UnpaddedPeekResult,
+        if (expected) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null,
+        if (actual) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null,
+    );
+}
+
+test IndentationScanner {
+    const input =
+        \\line 1
+        \\>
+        \\  line 3
+        \\> line 4
+        \\  line 5
+        \\ line 6
+        \\line 7
+        \\> line 8
+        \\  > line 9
+        \\line 10
+        \\>    line 11
+        \\  > line 12
+        \\     line 13
+        \\    line 14
+        \\   line 15
+        \\  line 16
+        \\ line 17
+        \\     line 18
+        \\> line 19
+        \\  > line 20
+        \\
+    ;
+    const padded_input = try PaddedSlice.pad(testing.allocator, input);
+    defer testing.allocator.free(padded_input);
+    var it: IndentationScanner = try .init(padded_input);
+
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 1") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from(">") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("  line 3") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("> line 4") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 4") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("line 5") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .misaligned, .from("line 6") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 7") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("> line 8") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 8") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("> line 9") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+        .{ .misalignment_start = 3, .correct_start = 4 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 9") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 10") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from(">    line 11") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 5 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 11") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .misaligned, .from("> line 12") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 5 },
+        .{ .misalignment_start = 3, .correct_start = 4 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 12") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from(" line 13") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("line 14") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .misaligned, .from("line 15") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 5 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .misaligned, .from("line 16") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .misaligned, .from("line 17") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("line 18") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("> line 19") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 19") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(.{ .correct, .from("> line 20") }, it.peek());
+
+    try it.indent(.{ .skip = 1 });
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+        .{ .misalignment_start = 3, .correct_start = 4 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(.{ .correct, .from("line 20") }, it.peek());
+
+    it.advance();
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+        .{ .misalignment_start = 1, .correct_start = 2 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(null, it.peek());
+
+    it.dedent();
+    try expectEqualDeep([]const ExpectedIndent, &.{
+        .{ .misalignment_start = 0, .correct_start = 0 },
+    }, it._expected_indents.slice());
+    try expectEqualPeekResults(null, it.peek());
+}
diff --git a/src/padded_str.zig b/src/padded_str.zig
@@ -0,0 +1,3 @@
+pub const PADDING = 128;
+pub const PaddedSlice = @import("padded_str_impl.zig").PaddedSlice(PADDING);
+pub const PaddedMany = @import("padded_str_impl.zig").PaddedMany(PADDING);
diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig
@@ -0,0 +1,253 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+/// A slice pointer with a "many sentinel":
+/// not only must there be a newline at the end,
+/// the next PADDING - 1 bytes must also be readable.
+pub fn PaddedSlice(comptime PADDING_: usize) type {
+    return struct {
+        /// How many '\n' chars exists past the end of line
+        pub const PADDING = PADDING_;
+        ptr: PaddedMany(PADDING),
+        len: usize,
+
+        pub fn from(comptime str_lit: []const u8) @This() {
+            return .{
+                .ptr = .from(str_lit),
+                .len = str_lit.len,
+            };
+        }
+
+        pub fn toUnpaddedSlice(self: @This()) [:'\n']const u8 {
+            return self.ptr._ptr[0..self.len :'\n'];
+        }
+
+        pub fn index(self: @This(), idx: usize) u8 {
+            assert(idx < self.len);
+            return self.ptr.index(idx);
+        }
+
+        pub fn sliceOpen(self: @This(), start: usize) @This() {
+            assert(start <= self.len);
+            return .{
+                .ptr = self.ptr.sliceOpen(start),
+                .len = self.len - start,
+            };
+        }
+
+        pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 {
+            const padded_input = try gpa.alloc(u8, input.len + PADDING);
+            @memcpy(padded_input[0..input.len], input);
+            padded_input[input.len] = '\n';
+            return padded_input;
+        }
+
+        pub fn init(padded_input: []const u8) error{InputNotPadded}!@This() {
+            if (padded_input.len < PADDING or padded_input[padded_input.len - PADDING] != '\n')
+                return error.InputNotPadded;
+            const actual_input = padded_input[0 .. padded_input.len - PADDING :'\n'];
+            return .{
+                .ptr = .{ ._ptr = actual_input.ptr },
+                .len = actual_input.len,
+            };
+        }
+
+        pub fn indexOfNotSpace(self: @This()) usize {
+            return self.ptr.indexOfNotSpace();
+        }
+
+        /// Splits the first line from the string, returning a tuple of two padded slices.
+        /// The first padded slice is `null` if there are no lines left.
+        pub fn splitOneLine(self: @This()) struct { ?@This(), @This() } {
+            if (self.len == 0) return .{ null, self };
+            const line_len = self.ptr.indexOfNewline();
+            return .{
+                .{
+                    .ptr = .{
+                        ._ptr = self.ptr._ptr,
+                    },
+                    .len = line_len,
+                },
+                .{
+                    .ptr = .{
+                        ._ptr = self.ptr._ptr + line_len + 1,
+                    },
+                    .len = self.len - line_len - 1,
+                },
+            };
+        }
+    };
+}
+
+/// A many pointer with a "many sentinel":
+/// not only must there be a newline at the end,
+/// the next PADDING - 1 bytes must also be readable.
+pub fn PaddedMany(comptime PADDING_: usize) type {
+    return struct {
+        /// How many '\n' chars exists past the end of line
+        pub const PADDING = PADDING_;
+        _ptr: [*:'\n']const u8,
+
+        pub fn index(self: @This(), idx: usize) u8 {
+            return self._ptr[idx];
+        }
+
+        pub fn sliceOpen(ptr: @This(), start: usize) @This() {
+            return .{ ._ptr = ptr._ptr[start..] };
+        }
+
+        pub fn calcOffsetTo(ptr: @This(), other: @This()) usize {
+            return other._ptr - ptr._ptr;
+        }
+
+        fn simd_maybe() ?type {
+            if (switch (@import("builtin").zig_backend) {
+                .stage2_llvm, .stage2_c => true,
+                else => false,
+            } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
+            ) {
+                if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
+                    return struct {
+                        const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
+                        /// How many loads to do in parallel in 1 loop iteration.
+                        /// Consider increasing if your input distribution contains long strings.
+                        /// Note that this may not have an effect if PADDING is not large enough.
+                        const MAX_UNROLL = 1;
+                        const BLOCK_LEN = @min(
+                            MAX_UNROLL * BLOCK_LEN__,
+                            @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
+                        );
+                        const CharBlock = @Vector(BLOCK_LEN, u8);
+                        const BitBlock = @Vector(BLOCK_LEN, u1);
+                        const BoolBlock = @Vector(BLOCK_LEN, bool);
+                        const IntBlock = @Type(.{ .int = .{
+                            .signedness = .unsigned,
+                            .bits = BLOCK_LEN,
+                        } });
+                    };
+                }
+            }
+            return null;
+        }
+
+        pub fn from(comptime str_lit: []const u8) @This() {
+            return .{ ._ptr = (str_lit ++ ("\n" ** PADDING))[0..str_lit.len :'\n'] };
+        }
+
+        /// charset should be as small as possible -- one vector `cmpeq` and `or`
+        /// is emitted for each char in the charset. if you want range queries,
+        /// copy paste this and write your own custom impl.
+        pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
+            if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
+                @compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
+            return ptr.indexOfNoneUnsafe(charset);
+        }
+
+        pub fn indexOfNoneUnsafe(ptr: @This(), comptime charset: []const u8) usize {
+            var it = ptr._ptr;
+            if (!@inComptime()) {
+                if (simd_maybe()) |simd| {
+                    const splats: [charset.len]simd.CharBlock = comptime blk: {
+                        var splats_: []const simd.CharBlock = &.{};
+                        for (charset) |c| {
+                            splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
+                        }
+                        break :blk splats_[0..charset.len].*;
+                    };
+                    while (true) {
+                        const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
+                        var unmatches_bits = ~@as(simd.BitBlock, @splat(0));
+                        for (splats) |splat| {
+                            unmatches_bits &= @bitCast(block != splat);
+                        }
+                        const unmatches_bools: simd.BoolBlock = @bitCast(unmatches_bits);
+                        if (@reduce(.Or, unmatches_bools)) {
+                            const unmatches_int: simd.IntBlock = @bitCast(unmatches_bits);
+                            {
+                                @setRuntimeSafety(false);
+                                if (unmatches_int == 0) unreachable;
+                            }
+                            return it - ptr._ptr + @ctz(unmatches_int);
+                        }
+                        it += simd.BLOCK_LEN;
+                    }
+                }
+            }
+
+            while (true) {
+                inline for (charset) |c| {
+                    if (it[0] == c) break;
+                } else return it - ptr._ptr;
+                it += 1;
+            }
+        }
+
+        /// charset should be as small as possible -- one vector `cmpeq` and `or`
+        /// is emitted for each char in the charset. if you want range queries,
+        /// copy paste this and write your own custom impl.
+        pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
+            if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
+                @compileError("charset must contain newline, otherwise this op is potentially unsafe.");
+            return ptr.indexOfAnyUnsafe(charset);
+        }
+
+        pub fn indexOfAnyUnsafe(ptr: @This(), comptime charset: []const u8) usize {
+            var it = ptr._ptr;
+            if (!@inComptime()) {
+                if (simd_maybe()) |simd| {
+                    const splats: [charset.len]simd.CharBlock = comptime blk: {
+                        var splats_: []const simd.CharBlock = &.{};
+                        for (charset) |c| {
+                            splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
+                        }
+                        break :blk splats_[0..charset.len].*;
+                    };
+                    while (true) {
+                        const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
+                        var matches_bits = @as(simd.BitBlock, @splat(0));
+                        for (splats) |splat| {
+                            matches_bits |= @bitCast(block == splat);
+                        }
+                        const matches_bools: simd.BoolBlock = @bitCast(matches_bits);
+                        if (@reduce(.Or, matches_bools)) {
+                            const matches_int: simd.IntBlock = @bitCast(matches_bits);
+                            {
+                                @setRuntimeSafety(false);
+                                if (matches_int == 0) unreachable;
+                            }
+                            return it - ptr._ptr + @ctz(matches_int);
+                        }
+                        it += simd.BLOCK_LEN;
+                    }
+                }
+            }
+
+            while (true) {
+                inline for (charset) |c| {
+                    if (it[0] == c) return it - ptr._ptr;
+                }
+                it += 1;
+            }
+        }
+
+        pub fn indexOfNotSpace(ptr: @This()) usize {
+            return ptr.indexOfNone(" ");
+        }
+
+        pub fn indexOfNewline(ptr: @This()) usize {
+            return ptr.indexOfAny("\n");
+        }
+    };
+}
+
+test "PaddedSlice" {
+    const testing = std.testing;
+    const input = "line 1\n";
+    const padded_input = try PaddedSlice(128).pad(testing.allocator, input);
+    defer testing.allocator.free(padded_input);
+    var slice: PaddedSlice(128) = try .init(padded_input);
+    const line_maybe, slice = slice.splitOneLine();
+    if (line_maybe) |_| {
+        return;
+    } else return error.Sad;
+}
diff --git a/src/root.zig b/src/root.zig
@@ -10,4 +10,5 @@ test {
     _ = @import("test/test.zig");
     _ = Ast;
     _ = AstGen;
+    _ = @import("IndentationScanner.zig");
 }
diff --git a/src/test/test.zig b/src/test/test.zig
@@ -324,25 +324,25 @@ test "Thematic break" {
     );
 }
 
-test "Mixed indentation" {
-    try testParse("" ++
-        "+ aaa\n" ++
-        "\n" ++
-        "\tbbbbb\n",
-        \\.document
-        \\  .elaboration
-        \\    .error .inconsistent_indentation at 3:1
-        \\    .marker
-        \\      "+"
-        \\    .paragraph
-        \\      .text
-        \\        "aaa"
-        \\    .paragraph
-        \\      .text
-        \\        "bbbbb"
-        \\
-    );
-}
+// test "Mixed indentation" {
+//     try testParse("" ++
+//         "+ aaa\n" ++
+//         "\n" ++
+//         "\tbbbbb\n",
+//         \\.document
+//         \\  .elaboration
+//         \\    .error .inconsistent_indentation at 3:1
+//         \\    .marker
+//         \\      "+"
+//         \\    .paragraph
+//         \\      .text
+//         \\        "aaa"
+//         \\    .paragraph
+//         \\      .text
+//         \\        "bbbbb"
+//         \\
+//     );
+// }
 
 test "Tabs in text" {
     try testParse("" ++
@@ -406,31 +406,31 @@ test "Super long line" {
     }), taggedAst);
 }
 
-test "Many short lines" {
-    const input = try std.testing.allocator.create([(1 << 25) - 4]u8);
-    defer std.testing.allocator.destroy(input);
-    @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*);
+// test "Many short lines" {
+//     const input = try std.testing.allocator.create([(1 << 25) - 4]u8);
+//     defer std.testing.allocator.destroy(input);
+//     @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*);
 
-    var arena: ArenaAllocator = .init(std.testing.allocator);
-    defer arena.deinit();
-    const ast = try parse(std.testing.allocator, arena.allocator(), input);
-    try std.testing.expectEqual(1 << 24, ast.nodes.len);
-    try std.testing.expectEqual(
-        @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }),
-        ast.nodes[0].toTagged(),
-    );
-    try std.testing.expectEqual(
-        @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }),
-        ast.nodes[1].toTagged(),
-    );
-    try std.testing.expectEqual(
-        @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }),
-        ast.nodes[2].toTagged(),
-    );
-    for (1..(1 << 24) - 2) |i| {
-        try std.testing.expectEqual(
-            @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }),
-            ast.nodes[i + 2].toTagged(),
-        );
-    }
-}
+//     var arena: ArenaAllocator = .init(std.testing.allocator);
+//     defer arena.deinit();
+//     const ast = try parse(std.testing.allocator, arena.allocator(), input);
+//     try std.testing.expectEqual(1 << 24, ast.nodes.len);
+//     try std.testing.expectEqual(
+//         @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }),
+//         ast.nodes[0].toTagged(),
+//     );
+//     try std.testing.expectEqual(
+//         @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }),
+//         ast.nodes[1].toTagged(),
+//     );
+//     try std.testing.expectEqual(
+//         @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }),
+//         ast.nodes[2].toTagged(),
+//     );
+//     for (1..(1 << 24) - 2) |i| {
+//         try std.testing.expectEqual(
+//             @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }),
+//             ast.nodes[i + 2].toTagged(),
+//         );
+//     }
+// }

	mymarkdown My markdown
	git clone https://git.grace.moe/mymarkdown
	Log \| Files \| Refs

A	src/IndentationScanner.zig	\|	551	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/padded_str.zig	\|	3	+++
A	src/padded_str_impl.zig	\|	253	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/root.zig	\|	1	+
M	src/test/test.zig	\|	92	++++++++++++++++++++++++++++++++++++++++----------------------------------------