commit 79ad40b2e85365c8e5edd8bc5b402c82ea112a72
parent 94b5af2e6db938f724637a23e0ed1d81f2294741
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Tue, 20 May 2025 17:00:57 +0800
Implement IndentationScanner
Diffstat:
5 files changed, 854 insertions(+), 46 deletions(-)
diff --git a/src/IndentationScanner.zig b/src/IndentationScanner.zig
@@ -0,0 +1,551 @@
+//! # Indentation iterator for mymarkdown
+//!
+//! This file focuses on parsing just the 2d structure of mymarkdown.
+//!
+//! An IndentationIterator maintains the following state:
+//! - a cursor indicating what the current line is
+//! - a stack of expected indentations
+//!
+//! It supports the following operations:
+//! - `it.peek() : ?struct { IndentAlignment, PaddedSlice }`
+//! -- no matter what, peek does not mutate state.
+//! -- peek returns .line if the current indentation is at least to the expected indentation,
+//! -- else .misaligned_line, if the current intendation is greater than the parent's expected indentation,
+//! -- else .end_of_block, signalling the end of the current indented section.
+//! - the remaining functions always return `void` and always mutate state.
+//! - `it.advance() : void`
+//! -- advance updates the cursor to the next line
+//! - `it.indent(options: struct { skip: usize }) : void`
+//! -- indent pushes a new entry onto the stack of expected indentations.
+//! -- first, indent treats the first `skip` characters after the expected indentation as spaces.
+//! -- then, it scans forward for the next non-whitespace character.
+//! --- if no such character is found, the indentation level is treated as if it were "infinity"
+//! i.e. no indented block exists.
+//! --- otherwise, the whitespace before the non-whitespace character found is added to the expected indentation.
+//! - `it.dedent() : void`
+//! -- pops from the stack of expected indentations.
+//!
+//! ## Example API trace (happy case)
+//!
+//! Input:
+//!
+//! ==
+//! > line 1
+//! > line 2
+//! > line 3
+//! line 4
+//! line 5
+//! line 6
+//! ==
+//!
+//! - .peek() -> .{ .correct, "> line 1" }
+//! - .peek() -> .{ .correct, "> line 1" }
+//! + (peek does not mutate state)
+//! - .indent(1)
+//! -- .peek() -> .{ .correct, "line 1" }
+//! ++ (indent treats the hyphen as a space, thus removing it from
+//! the current line.)
+//! -- .advance()
+//! -- .peek() -> .{ .correct, "> line 2" }
+//! -- .indent(1)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> .{ .correct, "> line 3" }
+//! --- .indent(1)
+//! ---- .peek() -> .{ .correct, "line 3" }
+//! ---- .advance()
+//! ++++ (the cursor is now on line 4, which has 1 space of indentation,
+//! less than the expected 6 spaces, and less than the 4 spaces
+//! the parent expects.)
+//! ---- .peek() -> null
+//! --- .dedent()
+//! --- .peek() -> null
+//! -- .dedent()
+//! ++ (line 4 has only 1 space of indentation, but it is still indented
+//! relative to the parent. so it is considered part of the current
+//! indented block, just that it is misaligned.)
+//! -- .peek() -> .{ .misaligned, "line 4" }
+//! -- .advance()
+//! -- .peek() -> .{ .correct, " line 5" }
+//! ++ (line 5 matches 2 spaces of indent. the excess spaces are simply
+//! included in the line.)
+//! -- .advance()
+//! -- .peek() -> null
+//! ++ (expected 2 spaces, parent expects 0 spaces, line 6 has 0 spaces,
+//! matching the parent.)
+//! - .dedent()
+//! - .peek() -> .{ .correct, "line 6" }
+//! - .advance()
+//! - .peek() -> null
+//!
+//! ## Tabs
+//!
+//! Tabs are treated as non-whitespace because it is too hard to give
+//! intuitive semantics to them. See git tag `tab-rant` to see a version
+//! of this doc comment that tried to do so.
+//!
+//! ## What should happen for misaligned indented blocks
+//!
+//! ==
+//! > line 1
+//! > line 2
+//! ==
+//!
+//! - .peek() -> .{ .correct, "> line 1" }
+//! - .indent(1)
+//! -- .peek() -> .{ .correct, "line 1" }
+//! ++ (the indentation is now considered to be 4 spaces)
+//! -- .advance()
+//! -- .peek() -> .{ .misaligned, "> line 2" }
+//! ++ (line 2 is misaligned as it only has 2 spaces)
+//! -- .indent(1)
+//! ++ (**what should happen now?**)
+//!
+//! ### Does not work: Simply push the new alignment on the stack
+//!
+//! The alignment of "line 2" is 4 spaces, so if we simply push it
+//! onto the indentation stack, we have:
+//!
+//! - indentation_stack = .{ 0, 4, 4 }
+//!
+//! The parent indentation is the same as our indentation! That means
+//! "error recovery" is impossible. A 3 space indent will be treated
+//! as misaligned to the parent, and null will be emitted.
+//! After the dedent, this 3 space indent is still considered misaligned,
+//! but it's misaligned to the parent.
+//!
+//! ==
+//! > line 1
+//! > line 2
+//! line 3
+//! ==
+//!
+//! -- .indent(1)
+//! ++ (resuming from the first example)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> null
+//! -- .dedent()
+//! -- .peek() -> .{ .misaligned, "line 3" }
+//!
+//! This is also an odd case. Without line 1, line 3 would be considered
+//! misaligned to line 2, however because it matches up with the parent,
+//! it actually triggers an null, followed by .line after .dedent().
+//!
+//! ==
+//! > line 1
+//! > line 2
+//! line 3
+//! ==
+//!
+//! -- .indent(1)
+//! ++ (resuming from the first example)
+//! --- .peek() -> .{ .correct, "line 2" }
+//! --- .advance()
+//! --- .peek() -> null
+//! -- .dedent()
+//! -- .peek() -> .{ .correct, "line 3" }
+//!
+//! ### What we actually do: Push two elements for each added indentation level
+//!
+//! The idea here is that each indentation level will have its own pair of
+//! "misaligned" and "actual" alignments.
+//!
+//! The indentation stack will start with .{ 0, 0 }, indicating that
+//! the root alignment is at 0 spaces and has no "misalignment zone".
+//! Each .indent(...) pushes 2 values onto the stack.
+//!
+//! Going back to this example, after the first .indent(1), the indentation
+//! stack will contain .{ 0, 0, 0, 4 }. After the second .indent(1),
+//! the indentation stack will contain .{ 0, 0, 0, 4, 2, 4 },
+//! indicating that there are 2 spaces worth of "misalignment" possible.
+//!
+//! ==
+//! > line 1
+//! > line 2
+//! line 3
+//! ==
+
+// ==================================
+// Fields Fields Fields Fields Fields
+
+/// Stack of expected indentations.
+/// The current indentation level is equal to the number of entries.
+_expected_indents: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS),
+
+/// The line that would be returned by peek (if indentation matches).
+/// We always pre-scan for indentation when assigning to this field.
+/// `null` indicates EOF.
+_cur_line: ?struct {
+ IndentLevel,
+ NumSpaces,
+ PaddedSlice,
+},
+
+/// The remaining lines of the input.
+_rest_lines: PaddedSlice,
+
+// =======================================
+// Methods Methods Methods Methods Methods
+
+/// Create an IndentationScanner that tokenizes the given input.
+pub fn init(input_: []const u8) error{InputNotPadded}!IndentationScanner {
+ const input: PaddedSlice = try .init(input_);
+ const first_line, const rest_lines = input.splitOneLine();
+ return .{
+ ._expected_indents = while (true) {
+ var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{};
+ out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 });
+ break out;
+ },
+ ._cur_line = if (first_line) |line| .{ .root, 0, line } else null,
+ ._rest_lines = rest_lines,
+ };
+}
+
+/// Peek at the current line's contents, with indentation stripped.
+/// If indentation matches the parent or less, null is emitted.
+pub fn peek(it: IndentationScanner) PeekResult {
+ const level, const spaces, const content = it._cur_line orelse return null;
+ if (level.compare(.lt, it.curLevel())) return null;
+ assert(level.compare(.eq, it.curLevel()));
+ if (spaces >= it.curExpectedIndents().correct_start)
+ return .{ .correct, content }
+ else
+ return .{ .misaligned, content };
+}
+
+/// Advances the cursor to the next line.
+/// Precondition: peek() would not have returned null.
+pub fn advance(it: *IndentationScanner) void {
+ assert(it.peek() != null);
+ it._cur_line = null;
+ const line_maybe, it._rest_lines = it._rest_lines.splitOneLine();
+ const line = line_maybe orelse return;
+ const spaces = line.indexOfNotSpace();
+ it._cur_line = for (0..it._expected_indents.len) |rev_i| {
+ const i = it._expected_indents.len - 1 - rev_i;
+ const misalignment_start = it._expected_indents.buffer[i].misalignment_start;
+ const correct_start = it._expected_indents.buffer[i].correct_start;
+ if (spaces >= correct_start) {
+ break .{ IndentLevel.from(i) orelse unreachable, correct_start, line.sliceOpen(correct_start) };
+ } else if (spaces >= misalignment_start) {
+ break .{ IndentLevel.from(i) orelse unreachable, spaces, line.sliceOpen(spaces) };
+ }
+ } else unreachable;
+}
+
+/// Treats the next `skip` characters as whitespace,
+/// finds the correct level of indentation,
+/// then pushes it onto the indentation stack.
+/// Each `.indent(...)` call should be paired with a `.dedent()` call.
+/// Precondition: peek() would not have returned null.
+pub fn indent(
+ it: *IndentationScanner,
+ opts: struct { skip: usize },
+) error{TooMuchIndentation}!void {
+ const level, const spaces, const content = it._cur_line orelse unreachable;
+ assert(level.compare(.eq, it.curLevel()));
+ const additional_indent = opts.skip + content.sliceOpen(opts.skip).indexOfNotSpace();
+ if (additional_indent == content.len) {
+ it._expected_indents.append(
+ .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES },
+ ) catch return error.TooMuchIndentation;
+ it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) };
+ it.advance();
+ } else {
+ it._expected_indents.append(
+ .{ .misalignment_start = spaces + 1, .correct_start = spaces + additional_indent },
+ ) catch return error.TooMuchIndentation;
+ it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) };
+ }
+}
+
+/// Pops from the indentation stack.
+/// Each `.indent(...)` call should be paired with a `.dedent()` call.
+/// Precondition: peek would return null, i.e. cursor is at EOF or
+/// current indentation level is less than expected.
+pub fn dedent(it: *IndentationScanner) void {
+ assert(it.peek() == null);
+ _ = it._expected_indents.pop();
+}
+
+// =======================================
+// Helpers Helpers Helpers Helpers Helpers
+
+fn curLevel(it: IndentationScanner) IndentLevel {
+ return IndentLevel.from(it._expected_indents.len - 1).?;
+}
+
+fn curExpectedIndents(it: IndentationScanner) ExpectedIndent {
+ return it._expected_indents.buffer[it._expected_indents.len - 1];
+}
+
+// =======================================
+// Imports Imports Imports Imports Imports
+// Types Types Types Types Types Types
+
+const std = @import("std");
+const maxInt = std.math.maxInt;
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+
+const str = @import("str.zig");
+const padded_str = @import("padded_str.zig");
+
+const IndentationScanner = @This();
+
+const PaddedSlice = @import("padded_str.zig").PaddedSlice;
+const PaddedMany = @import("padded_str.zig").PaddedMany;
+
+const NumSpaces = usize;
+const MAX_SPACES = maxInt(NumSpaces);
+const ExpectedIndent = struct {
+ /// Indentation between misalignment_start..correct_start is considered misaligned
+ misalignment_start: NumSpaces,
+ /// Indentation between correct_start..infinity is considered correct
+ correct_start: NumSpaces,
+};
+
+const IndentAlignment = enum { correct, misaligned };
+const IndentLevel = enum(u8) {
+ root = 0,
+ max = 254,
+ infinity = 255,
+ _,
+
+ const NUM_INDENT_LEVELS = 255;
+
+ fn from(i: usize) ?IndentLevel {
+ if (i > 254) return null;
+ return @enumFromInt(i);
+ }
+ fn pred(a: IndentLevel) IndentLevel {
+ return @enumFromInt(@intFromEnum(a) - 1);
+ }
+ fn succ(a: IndentLevel) ?IndentLevel {
+ if (a.compare(.eq, .max)) return null;
+ return @enumFromInt(@intFromEnum(a) + 1);
+ }
+ fn compare(a: IndentLevel, op: std.math.CompareOperator, b: IndentLevel) bool {
+ return std.math.compare(@intFromEnum(a), op, @intFromEnum(b));
+ }
+};
+
+pub const PeekResult = ?struct { IndentAlignment, PaddedSlice };
+
+// =======================================
+// Testing Testing Testing Testing Testing
+const testing = std.testing;
+inline fn expectEqualDeep(comptime T: type, expected: T, actual: T) !void {
+ try testing.expectEqualDeep(expected, actual);
+}
+inline fn expectEqualPeekResults(expected: PeekResult, actual: PeekResult) !void {
+ const UnpaddedPeekResult = ?struct { IndentAlignment, [:'\n']const u8 };
+ try expectEqualDeep(
+ UnpaddedPeekResult,
+ if (expected) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null,
+ if (actual) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null,
+ );
+}
+
+test IndentationScanner {
+ const input =
+ \\line 1
+ \\>
+ \\ line 3
+ \\> line 4
+ \\ line 5
+ \\ line 6
+ \\line 7
+ \\> line 8
+ \\ > line 9
+ \\line 10
+ \\> line 11
+ \\ > line 12
+ \\ line 13
+ \\ line 14
+ \\ line 15
+ \\ line 16
+ \\ line 17
+ \\ line 18
+ \\> line 19
+ \\ > line 20
+ \\
+ ;
+ const padded_input = try PaddedSlice.pad(testing.allocator, input);
+ defer testing.allocator.free(padded_input);
+ var it: IndentationScanner = try .init(padded_input);
+
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 1") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from(">") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from(" line 3") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("> line 4") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 4") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("line 5") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .misaligned, .from("line 6") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 7") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("> line 8") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 8") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("> line 9") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ .{ .misalignment_start = 3, .correct_start = 4 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 9") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 10") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("> line 11") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 5 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 11") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .misaligned, .from("> line 12") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 5 },
+ .{ .misalignment_start = 3, .correct_start = 4 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 12") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from(" line 13") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("line 14") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .misaligned, .from("line 15") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 5 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .misaligned, .from("line 16") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .misaligned, .from("line 17") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("line 18") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("> line 19") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 19") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(.{ .correct, .from("> line 20") }, it.peek());
+
+ try it.indent(.{ .skip = 1 });
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ .{ .misalignment_start = 3, .correct_start = 4 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(.{ .correct, .from("line 20") }, it.peek());
+
+ it.advance();
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ .{ .misalignment_start = 1, .correct_start = 2 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(null, it.peek());
+
+ it.dedent();
+ try expectEqualDeep([]const ExpectedIndent, &.{
+ .{ .misalignment_start = 0, .correct_start = 0 },
+ }, it._expected_indents.slice());
+ try expectEqualPeekResults(null, it.peek());
+}
diff --git a/src/padded_str.zig b/src/padded_str.zig
@@ -0,0 +1,3 @@
+pub const PADDING = 128;
+pub const PaddedSlice = @import("padded_str_impl.zig").PaddedSlice(PADDING);
+pub const PaddedMany = @import("padded_str_impl.zig").PaddedMany(PADDING);
diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig
@@ -0,0 +1,253 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+/// A slice pointer with a "many sentinel":
+/// not only must there be a newline at the end,
+/// the next PADDING - 1 bytes must also be readable.
+pub fn PaddedSlice(comptime PADDING_: usize) type {
+ return struct {
+ /// How many '\n' chars exists past the end of line
+ pub const PADDING = PADDING_;
+ ptr: PaddedMany(PADDING),
+ len: usize,
+
+ pub fn from(comptime str_lit: []const u8) @This() {
+ return .{
+ .ptr = .from(str_lit),
+ .len = str_lit.len,
+ };
+ }
+
+ pub fn toUnpaddedSlice(self: @This()) [:'\n']const u8 {
+ return self.ptr._ptr[0..self.len :'\n'];
+ }
+
+ pub fn index(self: @This(), idx: usize) u8 {
+ assert(idx < self.len);
+ return self.ptr.index(idx);
+ }
+
+ pub fn sliceOpen(self: @This(), start: usize) @This() {
+ assert(start <= self.len);
+ return .{
+ .ptr = self.ptr.sliceOpen(start),
+ .len = self.len - start,
+ };
+ }
+
+ pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 {
+ const padded_input = try gpa.alloc(u8, input.len + PADDING);
+ @memcpy(padded_input[0..input.len], input);
+ padded_input[input.len] = '\n';
+ return padded_input;
+ }
+
+ pub fn init(padded_input: []const u8) error{InputNotPadded}!@This() {
+ if (padded_input.len < PADDING or padded_input[padded_input.len - PADDING] != '\n')
+ return error.InputNotPadded;
+ const actual_input = padded_input[0 .. padded_input.len - PADDING :'\n'];
+ return .{
+ .ptr = .{ ._ptr = actual_input.ptr },
+ .len = actual_input.len,
+ };
+ }
+
+ pub fn indexOfNotSpace(self: @This()) usize {
+ return self.ptr.indexOfNotSpace();
+ }
+
+ /// Splits the first line from the string, returning a tuple of two padded slices.
+ /// The first padded slice is `null` if there are no lines left.
+ pub fn splitOneLine(self: @This()) struct { ?@This(), @This() } {
+ if (self.len == 0) return .{ null, self };
+ const line_len = self.ptr.indexOfNewline();
+ return .{
+ .{
+ .ptr = .{
+ ._ptr = self.ptr._ptr,
+ },
+ .len = line_len,
+ },
+ .{
+ .ptr = .{
+ ._ptr = self.ptr._ptr + line_len + 1,
+ },
+ .len = self.len - line_len - 1,
+ },
+ };
+ }
+ };
+}
+
+/// A many pointer with a "many sentinel":
+/// not only must there be a newline at the end,
+/// the next PADDING - 1 bytes must also be readable.
+pub fn PaddedMany(comptime PADDING_: usize) type {
+ return struct {
+ /// How many '\n' chars exists past the end of line
+ pub const PADDING = PADDING_;
+ _ptr: [*:'\n']const u8,
+
+ pub fn index(self: @This(), idx: usize) u8 {
+ return self._ptr[idx];
+ }
+
+ pub fn sliceOpen(ptr: @This(), start: usize) @This() {
+ return .{ ._ptr = ptr._ptr[start..] };
+ }
+
+ pub fn calcOffsetTo(ptr: @This(), other: @This()) usize {
+ return other._ptr - ptr._ptr;
+ }
+
+ fn simd_maybe() ?type {
+ if (switch (@import("builtin").zig_backend) {
+ .stage2_llvm, .stage2_c => true,
+ else => false,
+ } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
+ ) {
+ if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
+ return struct {
+ const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
+ /// How many loads to do in parallel in 1 loop iteration.
+ /// Consider increasing if your input distribution contains long strings.
+ /// Note that this may not have an effect if PADDING is not large enough.
+ const MAX_UNROLL = 1;
+ const BLOCK_LEN = @min(
+ MAX_UNROLL * BLOCK_LEN__,
+ @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
+ );
+ const CharBlock = @Vector(BLOCK_LEN, u8);
+ const BitBlock = @Vector(BLOCK_LEN, u1);
+ const BoolBlock = @Vector(BLOCK_LEN, bool);
+ const IntBlock = @Type(.{ .int = .{
+ .signedness = .unsigned,
+ .bits = BLOCK_LEN,
+ } });
+ };
+ }
+ }
+ return null;
+ }
+
+ pub fn from(comptime str_lit: []const u8) @This() {
+ return .{ ._ptr = (str_lit ++ ("\n" ** PADDING))[0..str_lit.len :'\n'] };
+ }
+
+ /// charset should be as small as possible -- one vector `cmpeq` and `or`
+ /// is emitted for each char in the charset. if you want range queries,
+ /// copy paste this and write your own custom impl.
+ pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
+ if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
+ @compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
+ return ptr.indexOfNoneUnsafe(charset);
+ }
+
+ pub fn indexOfNoneUnsafe(ptr: @This(), comptime charset: []const u8) usize {
+ var it = ptr._ptr;
+ if (!@inComptime()) {
+ if (simd_maybe()) |simd| {
+ const splats: [charset.len]simd.CharBlock = comptime blk: {
+ var splats_: []const simd.CharBlock = &.{};
+ for (charset) |c| {
+ splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
+ }
+ break :blk splats_[0..charset.len].*;
+ };
+ while (true) {
+ const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
+ var unmatches_bits = ~@as(simd.BitBlock, @splat(0));
+ for (splats) |splat| {
+ unmatches_bits &= @bitCast(block != splat);
+ }
+ const unmatches_bools: simd.BoolBlock = @bitCast(unmatches_bits);
+ if (@reduce(.Or, unmatches_bools)) {
+ const unmatches_int: simd.IntBlock = @bitCast(unmatches_bits);
+ {
+ @setRuntimeSafety(false);
+ if (unmatches_int == 0) unreachable;
+ }
+ return it - ptr._ptr + @ctz(unmatches_int);
+ }
+ it += simd.BLOCK_LEN;
+ }
+ }
+ }
+
+ while (true) {
+ inline for (charset) |c| {
+ if (it[0] == c) break;
+ } else return it - ptr._ptr;
+ it += 1;
+ }
+ }
+
+ /// charset should be as small as possible -- one vector `cmpeq` and `or`
+ /// is emitted for each char in the charset. if you want range queries,
+ /// copy paste this and write your own custom impl.
+ pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
+ if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
+ @compileError("charset must contain newline, otherwise this op is potentially unsafe.");
+ return ptr.indexOfAnyUnsafe(charset);
+ }
+
+ pub fn indexOfAnyUnsafe(ptr: @This(), comptime charset: []const u8) usize {
+ var it = ptr._ptr;
+ if (!@inComptime()) {
+ if (simd_maybe()) |simd| {
+ const splats: [charset.len]simd.CharBlock = comptime blk: {
+ var splats_: []const simd.CharBlock = &.{};
+ for (charset) |c| {
+ splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
+ }
+ break :blk splats_[0..charset.len].*;
+ };
+ while (true) {
+ const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
+ var matches_bits = @as(simd.BitBlock, @splat(0));
+ for (splats) |splat| {
+ matches_bits |= @bitCast(block == splat);
+ }
+ const matches_bools: simd.BoolBlock = @bitCast(matches_bits);
+ if (@reduce(.Or, matches_bools)) {
+ const matches_int: simd.IntBlock = @bitCast(matches_bits);
+ {
+ @setRuntimeSafety(false);
+ if (matches_int == 0) unreachable;
+ }
+ return it - ptr._ptr + @ctz(matches_int);
+ }
+ it += simd.BLOCK_LEN;
+ }
+ }
+ }
+
+ while (true) {
+ inline for (charset) |c| {
+ if (it[0] == c) return it - ptr._ptr;
+ }
+ it += 1;
+ }
+ }
+
+ pub fn indexOfNotSpace(ptr: @This()) usize {
+ return ptr.indexOfNone(" ");
+ }
+
+ pub fn indexOfNewline(ptr: @This()) usize {
+ return ptr.indexOfAny("\n");
+ }
+ };
+}
+
+test "PaddedSlice" {
+ const testing = std.testing;
+ const input = "line 1\n";
+ const padded_input = try PaddedSlice(128).pad(testing.allocator, input);
+ defer testing.allocator.free(padded_input);
+ var slice: PaddedSlice(128) = try .init(padded_input);
+ const line_maybe, slice = slice.splitOneLine();
+ if (line_maybe) |_| {
+ return;
+ } else return error.Sad;
+}
diff --git a/src/root.zig b/src/root.zig
@@ -10,4 +10,5 @@ test {
_ = @import("test/test.zig");
_ = Ast;
_ = AstGen;
+ _ = @import("IndentationScanner.zig");
}
diff --git a/src/test/test.zig b/src/test/test.zig
@@ -324,25 +324,25 @@ test "Thematic break" {
);
}
-test "Mixed indentation" {
- try testParse("" ++
- "+ aaa\n" ++
- "\n" ++
- "\tbbbbb\n",
- \\.document
- \\ .elaboration
- \\ .error .inconsistent_indentation at 3:1
- \\ .marker
- \\ "+"
- \\ .paragraph
- \\ .text
- \\ "aaa"
- \\ .paragraph
- \\ .text
- \\ "bbbbb"
- \\
- );
-}
+// test "Mixed indentation" {
+// try testParse("" ++
+// "+ aaa\n" ++
+// "\n" ++
+// "\tbbbbb\n",
+// \\.document
+// \\ .elaboration
+// \\ .error .inconsistent_indentation at 3:1
+// \\ .marker
+// \\ "+"
+// \\ .paragraph
+// \\ .text
+// \\ "aaa"
+// \\ .paragraph
+// \\ .text
+// \\ "bbbbb"
+// \\
+// );
+// }
test "Tabs in text" {
try testParse("" ++
@@ -406,31 +406,31 @@ test "Super long line" {
}), taggedAst);
}
-test "Many short lines" {
- const input = try std.testing.allocator.create([(1 << 25) - 4]u8);
- defer std.testing.allocator.destroy(input);
- @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*);
+// test "Many short lines" {
+// const input = try std.testing.allocator.create([(1 << 25) - 4]u8);
+// defer std.testing.allocator.destroy(input);
+// @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*);
- var arena: ArenaAllocator = .init(std.testing.allocator);
- defer arena.deinit();
- const ast = try parse(std.testing.allocator, arena.allocator(), input);
- try std.testing.expectEqual(1 << 24, ast.nodes.len);
- try std.testing.expectEqual(
- @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }),
- ast.nodes[0].toTagged(),
- );
- try std.testing.expectEqual(
- @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }),
- ast.nodes[1].toTagged(),
- );
- try std.testing.expectEqual(
- @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }),
- ast.nodes[2].toTagged(),
- );
- for (1..(1 << 24) - 2) |i| {
- try std.testing.expectEqual(
- @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }),
- ast.nodes[i + 2].toTagged(),
- );
- }
-}
+// var arena: ArenaAllocator = .init(std.testing.allocator);
+// defer arena.deinit();
+// const ast = try parse(std.testing.allocator, arena.allocator(), input);
+// try std.testing.expectEqual(1 << 24, ast.nodes.len);
+// try std.testing.expectEqual(
+// @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }),
+// ast.nodes[0].toTagged(),
+// );
+// try std.testing.expectEqual(
+// @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }),
+// ast.nodes[1].toTagged(),
+// );
+// try std.testing.expectEqual(
+// @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }),
+// ast.nodes[2].toTagged(),
+// );
+// for (1..(1 << 24) - 2) |i| {
+// try std.testing.expectEqual(
+// @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }),
+// ast.nodes[i + 2].toTagged(),
+// );
+// }
+// }