mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

commit 79ad40b2e85365c8e5edd8bc5b402c82ea112a72
parent 94b5af2e6db938f724637a23e0ed1d81f2294741
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Tue, 20 May 2025 17:00:57 +0800

Implement IndentationScanner

Diffstat:
Asrc/IndentationScanner.zig | 551+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/padded_str.zig | 3+++
Asrc/padded_str_impl.zig | 253+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/root.zig | 1+
Msrc/test/test.zig | 92++++++++++++++++++++++++++++++++++++++++----------------------------------------
5 files changed, 854 insertions(+), 46 deletions(-)

diff --git a/src/IndentationScanner.zig b/src/IndentationScanner.zig @@ -0,0 +1,551 @@ +//! # Indentation iterator for mymarkdown +//! +//! This file focuses on parsing just the 2d structure of mymarkdown. +//! +//! An IndentationIterator maintains the following state: +//! - a cursor indicating what the current line is +//! - a stack of expected indentations +//! +//! It supports the following operations: +//! - `it.peek() : ?struct { IndentAlignment, PaddedSlice }` +//! -- no matter what, peek does not mutate state. +//! -- peek returns .line if the current indentation is at least to the expected indentation, +//! -- else .misaligned_line, if the current intendation is greater than the parent's expected indentation, +//! -- else .end_of_block, signalling the end of the current indented section. +//! - the remaining functions always return `void` and always mutate state. +//! - `it.advance() : void` +//! -- advance updates the cursor to the next line +//! - `it.indent(options: struct { skip: usize }) : void` +//! -- indent pushes a new entry onto the stack of expected indentations. +//! -- first, indent treats the first `skip` characters after the expected indentation as spaces. +//! -- then, it scans forward for the next non-whitespace character. +//! --- if no such character is found, the indentation level is treated as if it were "infinity" +//! i.e. no indented block exists. +//! --- otherwise, the whitespace before the non-whitespace character found is added to the expected indentation. +//! - `it.dedent() : void` +//! -- pops from the stack of expected indentations. +//! +//! ## Example API trace (happy case) +//! +//! Input: +//! +//! == +//! > line 1 +//! > line 2 +//! > line 3 +//! line 4 +//! line 5 +//! line 6 +//! == +//! +//! - .peek() -> .{ .correct, "> line 1" } +//! - .peek() -> .{ .correct, "> line 1" } +//! + (peek does not mutate state) +//! - .indent(1) +//! -- .peek() -> .{ .correct, "line 1" } +//! ++ (indent treats the hyphen as a space, thus removing it from +//! the current line.) +//! -- .advance() +//! -- .peek() -> .{ .correct, "> line 2" } +//! -- .indent(1) +//! --- .peek() -> .{ .correct, "line 2" } +//! --- .advance() +//! --- .peek() -> .{ .correct, "> line 3" } +//! --- .indent(1) +//! ---- .peek() -> .{ .correct, "line 3" } +//! ---- .advance() +//! ++++ (the cursor is now on line 4, which has 1 space of indentation, +//! less than the expected 6 spaces, and less than the 4 spaces +//! the parent expects.) +//! ---- .peek() -> null +//! --- .dedent() +//! --- .peek() -> null +//! -- .dedent() +//! ++ (line 4 has only 1 space of indentation, but it is still indented +//! relative to the parent. so it is considered part of the current +//! indented block, just that it is misaligned.) +//! -- .peek() -> .{ .misaligned, "line 4" } +//! -- .advance() +//! -- .peek() -> .{ .correct, " line 5" } +//! ++ (line 5 matches 2 spaces of indent. the excess spaces are simply +//! included in the line.) +//! -- .advance() +//! -- .peek() -> null +//! ++ (expected 2 spaces, parent expects 0 spaces, line 6 has 0 spaces, +//! matching the parent.) +//! - .dedent() +//! - .peek() -> .{ .correct, "line 6" } +//! - .advance() +//! - .peek() -> null +//! +//! ## Tabs +//! +//! Tabs are treated as non-whitespace because it is too hard to give +//! intuitive semantics to them. See git tag `tab-rant` to see a version +//! of this doc comment that tried to do so. +//! +//! ## What should happen for misaligned indented blocks +//! +//! == +//! > line 1 +//! > line 2 +//! == +//! +//! - .peek() -> .{ .correct, "> line 1" } +//! - .indent(1) +//! -- .peek() -> .{ .correct, "line 1" } +//! ++ (the indentation is now considered to be 4 spaces) +//! -- .advance() +//! -- .peek() -> .{ .misaligned, "> line 2" } +//! ++ (line 2 is misaligned as it only has 2 spaces) +//! -- .indent(1) +//! ++ (**what should happen now?**) +//! +//! ### Does not work: Simply push the new alignment on the stack +//! +//! The alignment of "line 2" is 4 spaces, so if we simply push it +//! onto the indentation stack, we have: +//! +//! - indentation_stack = .{ 0, 4, 4 } +//! +//! The parent indentation is the same as our indentation! That means +//! "error recovery" is impossible. A 3 space indent will be treated +//! as misaligned to the parent, and null will be emitted. +//! After the dedent, this 3 space indent is still considered misaligned, +//! but it's misaligned to the parent. +//! +//! == +//! > line 1 +//! > line 2 +//! line 3 +//! == +//! +//! -- .indent(1) +//! ++ (resuming from the first example) +//! --- .peek() -> .{ .correct, "line 2" } +//! --- .advance() +//! --- .peek() -> null +//! -- .dedent() +//! -- .peek() -> .{ .misaligned, "line 3" } +//! +//! This is also an odd case. Without line 1, line 3 would be considered +//! misaligned to line 2, however because it matches up with the parent, +//! it actually triggers an null, followed by .line after .dedent(). +//! +//! == +//! > line 1 +//! > line 2 +//! line 3 +//! == +//! +//! -- .indent(1) +//! ++ (resuming from the first example) +//! --- .peek() -> .{ .correct, "line 2" } +//! --- .advance() +//! --- .peek() -> null +//! -- .dedent() +//! -- .peek() -> .{ .correct, "line 3" } +//! +//! ### What we actually do: Push two elements for each added indentation level +//! +//! The idea here is that each indentation level will have its own pair of +//! "misaligned" and "actual" alignments. +//! +//! The indentation stack will start with .{ 0, 0 }, indicating that +//! the root alignment is at 0 spaces and has no "misalignment zone". +//! Each .indent(...) pushes 2 values onto the stack. +//! +//! Going back to this example, after the first .indent(1), the indentation +//! stack will contain .{ 0, 0, 0, 4 }. After the second .indent(1), +//! the indentation stack will contain .{ 0, 0, 0, 4, 2, 4 }, +//! indicating that there are 2 spaces worth of "misalignment" possible. +//! +//! == +//! > line 1 +//! > line 2 +//! line 3 +//! == + +// ================================== +// Fields Fields Fields Fields Fields + +/// Stack of expected indentations. +/// The current indentation level is equal to the number of entries. +_expected_indents: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS), + +/// The line that would be returned by peek (if indentation matches). +/// We always pre-scan for indentation when assigning to this field. +/// `null` indicates EOF. +_cur_line: ?struct { + IndentLevel, + NumSpaces, + PaddedSlice, +}, + +/// The remaining lines of the input. +_rest_lines: PaddedSlice, + +// ======================================= +// Methods Methods Methods Methods Methods + +/// Create an IndentationScanner that tokenizes the given input. +pub fn init(input_: []const u8) error{InputNotPadded}!IndentationScanner { + const input: PaddedSlice = try .init(input_); + const first_line, const rest_lines = input.splitOneLine(); + return .{ + ._expected_indents = while (true) { + var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{}; + out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 }); + break out; + }, + ._cur_line = if (first_line) |line| .{ .root, 0, line } else null, + ._rest_lines = rest_lines, + }; +} + +/// Peek at the current line's contents, with indentation stripped. +/// If indentation matches the parent or less, null is emitted. +pub fn peek(it: IndentationScanner) PeekResult { + const level, const spaces, const content = it._cur_line orelse return null; + if (level.compare(.lt, it.curLevel())) return null; + assert(level.compare(.eq, it.curLevel())); + if (spaces >= it.curExpectedIndents().correct_start) + return .{ .correct, content } + else + return .{ .misaligned, content }; +} + +/// Advances the cursor to the next line. +/// Precondition: peek() would not have returned null. +pub fn advance(it: *IndentationScanner) void { + assert(it.peek() != null); + it._cur_line = null; + const line_maybe, it._rest_lines = it._rest_lines.splitOneLine(); + const line = line_maybe orelse return; + const spaces = line.indexOfNotSpace(); + it._cur_line = for (0..it._expected_indents.len) |rev_i| { + const i = it._expected_indents.len - 1 - rev_i; + const misalignment_start = it._expected_indents.buffer[i].misalignment_start; + const correct_start = it._expected_indents.buffer[i].correct_start; + if (spaces >= correct_start) { + break .{ IndentLevel.from(i) orelse unreachable, correct_start, line.sliceOpen(correct_start) }; + } else if (spaces >= misalignment_start) { + break .{ IndentLevel.from(i) orelse unreachable, spaces, line.sliceOpen(spaces) }; + } + } else unreachable; +} + +/// Treats the next `skip` characters as whitespace, +/// finds the correct level of indentation, +/// then pushes it onto the indentation stack. +/// Each `.indent(...)` call should be paired with a `.dedent()` call. +/// Precondition: peek() would not have returned null. +pub fn indent( + it: *IndentationScanner, + opts: struct { skip: usize }, +) error{TooMuchIndentation}!void { + const level, const spaces, const content = it._cur_line orelse unreachable; + assert(level.compare(.eq, it.curLevel())); + const additional_indent = opts.skip + content.sliceOpen(opts.skip).indexOfNotSpace(); + if (additional_indent == content.len) { + it._expected_indents.append( + .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES }, + ) catch return error.TooMuchIndentation; + it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) }; + it.advance(); + } else { + it._expected_indents.append( + .{ .misalignment_start = spaces + 1, .correct_start = spaces + additional_indent }, + ) catch return error.TooMuchIndentation; + it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) }; + } +} + +/// Pops from the indentation stack. +/// Each `.indent(...)` call should be paired with a `.dedent()` call. +/// Precondition: peek would return null, i.e. cursor is at EOF or +/// current indentation level is less than expected. +pub fn dedent(it: *IndentationScanner) void { + assert(it.peek() == null); + _ = it._expected_indents.pop(); +} + +// ======================================= +// Helpers Helpers Helpers Helpers Helpers + +fn curLevel(it: IndentationScanner) IndentLevel { + return IndentLevel.from(it._expected_indents.len - 1).?; +} + +fn curExpectedIndents(it: IndentationScanner) ExpectedIndent { + return it._expected_indents.buffer[it._expected_indents.len - 1]; +} + +// ======================================= +// Imports Imports Imports Imports Imports +// Types Types Types Types Types Types + +const std = @import("std"); +const maxInt = std.math.maxInt; +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; + +const str = @import("str.zig"); +const padded_str = @import("padded_str.zig"); + +const IndentationScanner = @This(); + +const PaddedSlice = @import("padded_str.zig").PaddedSlice; +const PaddedMany = @import("padded_str.zig").PaddedMany; + +const NumSpaces = usize; +const MAX_SPACES = maxInt(NumSpaces); +const ExpectedIndent = struct { + /// Indentation between misalignment_start..correct_start is considered misaligned + misalignment_start: NumSpaces, + /// Indentation between correct_start..infinity is considered correct + correct_start: NumSpaces, +}; + +const IndentAlignment = enum { correct, misaligned }; +const IndentLevel = enum(u8) { + root = 0, + max = 254, + infinity = 255, + _, + + const NUM_INDENT_LEVELS = 255; + + fn from(i: usize) ?IndentLevel { + if (i > 254) return null; + return @enumFromInt(i); + } + fn pred(a: IndentLevel) IndentLevel { + return @enumFromInt(@intFromEnum(a) - 1); + } + fn succ(a: IndentLevel) ?IndentLevel { + if (a.compare(.eq, .max)) return null; + return @enumFromInt(@intFromEnum(a) + 1); + } + fn compare(a: IndentLevel, op: std.math.CompareOperator, b: IndentLevel) bool { + return std.math.compare(@intFromEnum(a), op, @intFromEnum(b)); + } +}; + +pub const PeekResult = ?struct { IndentAlignment, PaddedSlice }; + +// ======================================= +// Testing Testing Testing Testing Testing +const testing = std.testing; +inline fn expectEqualDeep(comptime T: type, expected: T, actual: T) !void { + try testing.expectEqualDeep(expected, actual); +} +inline fn expectEqualPeekResults(expected: PeekResult, actual: PeekResult) !void { + const UnpaddedPeekResult = ?struct { IndentAlignment, [:'\n']const u8 }; + try expectEqualDeep( + UnpaddedPeekResult, + if (expected) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null, + if (actual) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null, + ); +} + +test IndentationScanner { + const input = + \\line 1 + \\> + \\ line 3 + \\> line 4 + \\ line 5 + \\ line 6 + \\line 7 + \\> line 8 + \\ > line 9 + \\line 10 + \\> line 11 + \\ > line 12 + \\ line 13 + \\ line 14 + \\ line 15 + \\ line 16 + \\ line 17 + \\ line 18 + \\> line 19 + \\ > line 20 + \\ + ; + const padded_input = try PaddedSlice.pad(testing.allocator, input); + defer testing.allocator.free(padded_input); + var it: IndentationScanner = try .init(padded_input); + + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 1") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from(">") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from(" line 3") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("> line 4") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 4") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("line 5") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .misaligned, .from("line 6") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 7") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("> line 8") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 8") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("> line 9") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + .{ .misalignment_start = 3, .correct_start = 4 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 9") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 10") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("> line 11") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 5 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 11") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .misaligned, .from("> line 12") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 5 }, + .{ .misalignment_start = 3, .correct_start = 4 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 12") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from(" line 13") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("line 14") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .misaligned, .from("line 15") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 5 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .misaligned, .from("line 16") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .misaligned, .from("line 17") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("line 18") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("> line 19") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 19") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(.{ .correct, .from("> line 20") }, it.peek()); + + try it.indent(.{ .skip = 1 }); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + .{ .misalignment_start = 3, .correct_start = 4 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(.{ .correct, .from("line 20") }, it.peek()); + + it.advance(); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + .{ .misalignment_start = 1, .correct_start = 2 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(null, it.peek()); + + it.dedent(); + try expectEqualDeep([]const ExpectedIndent, &.{ + .{ .misalignment_start = 0, .correct_start = 0 }, + }, it._expected_indents.slice()); + try expectEqualPeekResults(null, it.peek()); +} diff --git a/src/padded_str.zig b/src/padded_str.zig @@ -0,0 +1,3 @@ +pub const PADDING = 128; +pub const PaddedSlice = @import("padded_str_impl.zig").PaddedSlice(PADDING); +pub const PaddedMany = @import("padded_str_impl.zig").PaddedMany(PADDING); diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig @@ -0,0 +1,253 @@ +const std = @import("std"); +const assert = std.debug.assert; + +/// A slice pointer with a "many sentinel": +/// not only must there be a newline at the end, +/// the next PADDING - 1 bytes must also be readable. +pub fn PaddedSlice(comptime PADDING_: usize) type { + return struct { + /// How many '\n' chars exists past the end of line + pub const PADDING = PADDING_; + ptr: PaddedMany(PADDING), + len: usize, + + pub fn from(comptime str_lit: []const u8) @This() { + return .{ + .ptr = .from(str_lit), + .len = str_lit.len, + }; + } + + pub fn toUnpaddedSlice(self: @This()) [:'\n']const u8 { + return self.ptr._ptr[0..self.len :'\n']; + } + + pub fn index(self: @This(), idx: usize) u8 { + assert(idx < self.len); + return self.ptr.index(idx); + } + + pub fn sliceOpen(self: @This(), start: usize) @This() { + assert(start <= self.len); + return .{ + .ptr = self.ptr.sliceOpen(start), + .len = self.len - start, + }; + } + + pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 { + const padded_input = try gpa.alloc(u8, input.len + PADDING); + @memcpy(padded_input[0..input.len], input); + padded_input[input.len] = '\n'; + return padded_input; + } + + pub fn init(padded_input: []const u8) error{InputNotPadded}!@This() { + if (padded_input.len < PADDING or padded_input[padded_input.len - PADDING] != '\n') + return error.InputNotPadded; + const actual_input = padded_input[0 .. padded_input.len - PADDING :'\n']; + return .{ + .ptr = .{ ._ptr = actual_input.ptr }, + .len = actual_input.len, + }; + } + + pub fn indexOfNotSpace(self: @This()) usize { + return self.ptr.indexOfNotSpace(); + } + + /// Splits the first line from the string, returning a tuple of two padded slices. + /// The first padded slice is `null` if there are no lines left. + pub fn splitOneLine(self: @This()) struct { ?@This(), @This() } { + if (self.len == 0) return .{ null, self }; + const line_len = self.ptr.indexOfNewline(); + return .{ + .{ + .ptr = .{ + ._ptr = self.ptr._ptr, + }, + .len = line_len, + }, + .{ + .ptr = .{ + ._ptr = self.ptr._ptr + line_len + 1, + }, + .len = self.len - line_len - 1, + }, + }; + } + }; +} + +/// A many pointer with a "many sentinel": +/// not only must there be a newline at the end, +/// the next PADDING - 1 bytes must also be readable. +pub fn PaddedMany(comptime PADDING_: usize) type { + return struct { + /// How many '\n' chars exists past the end of line + pub const PADDING = PADDING_; + _ptr: [*:'\n']const u8, + + pub fn index(self: @This(), idx: usize) u8 { + return self._ptr[idx]; + } + + pub fn sliceOpen(ptr: @This(), start: usize) @This() { + return .{ ._ptr = ptr._ptr[start..] }; + } + + pub fn calcOffsetTo(ptr: @This(), other: @This()) usize { + return other._ptr - ptr._ptr; + } + + fn simd_maybe() ?type { + if (switch (@import("builtin").zig_backend) { + .stage2_llvm, .stage2_c => true, + else => false, + } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717 + ) { + if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| { + return struct { + const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING); + /// How many loads to do in parallel in 1 loop iteration. + /// Consider increasing if your input distribution contains long strings. + /// Note that this may not have an effect if PADDING is not large enough. + const MAX_UNROLL = 1; + const BLOCK_LEN = @min( + MAX_UNROLL * BLOCK_LEN__, + @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__, + ); + const CharBlock = @Vector(BLOCK_LEN, u8); + const BitBlock = @Vector(BLOCK_LEN, u1); + const BoolBlock = @Vector(BLOCK_LEN, bool); + const IntBlock = @Type(.{ .int = .{ + .signedness = .unsigned, + .bits = BLOCK_LEN, + } }); + }; + } + } + return null; + } + + pub fn from(comptime str_lit: []const u8) @This() { + return .{ ._ptr = (str_lit ++ ("\n" ** PADDING))[0..str_lit.len :'\n'] }; + } + + /// charset should be as small as possible -- one vector `cmpeq` and `or` + /// is emitted for each char in the charset. if you want range queries, + /// copy paste this and write your own custom impl. + pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize { + if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null) + @compileError("charset must not contain newline, otherwise this op is potentially unsafe."); + return ptr.indexOfNoneUnsafe(charset); + } + + pub fn indexOfNoneUnsafe(ptr: @This(), comptime charset: []const u8) usize { + var it = ptr._ptr; + if (!@inComptime()) { + if (simd_maybe()) |simd| { + const splats: [charset.len]simd.CharBlock = comptime blk: { + var splats_: []const simd.CharBlock = &.{}; + for (charset) |c| { + splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)}; + } + break :blk splats_[0..charset.len].*; + }; + while (true) { + const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*; + var unmatches_bits = ~@as(simd.BitBlock, @splat(0)); + for (splats) |splat| { + unmatches_bits &= @bitCast(block != splat); + } + const unmatches_bools: simd.BoolBlock = @bitCast(unmatches_bits); + if (@reduce(.Or, unmatches_bools)) { + const unmatches_int: simd.IntBlock = @bitCast(unmatches_bits); + { + @setRuntimeSafety(false); + if (unmatches_int == 0) unreachable; + } + return it - ptr._ptr + @ctz(unmatches_int); + } + it += simd.BLOCK_LEN; + } + } + } + + while (true) { + inline for (charset) |c| { + if (it[0] == c) break; + } else return it - ptr._ptr; + it += 1; + } + } + + /// charset should be as small as possible -- one vector `cmpeq` and `or` + /// is emitted for each char in the charset. if you want range queries, + /// copy paste this and write your own custom impl. + pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize { + if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null) + @compileError("charset must contain newline, otherwise this op is potentially unsafe."); + return ptr.indexOfAnyUnsafe(charset); + } + + pub fn indexOfAnyUnsafe(ptr: @This(), comptime charset: []const u8) usize { + var it = ptr._ptr; + if (!@inComptime()) { + if (simd_maybe()) |simd| { + const splats: [charset.len]simd.CharBlock = comptime blk: { + var splats_: []const simd.CharBlock = &.{}; + for (charset) |c| { + splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)}; + } + break :blk splats_[0..charset.len].*; + }; + while (true) { + const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*; + var matches_bits = @as(simd.BitBlock, @splat(0)); + for (splats) |splat| { + matches_bits |= @bitCast(block == splat); + } + const matches_bools: simd.BoolBlock = @bitCast(matches_bits); + if (@reduce(.Or, matches_bools)) { + const matches_int: simd.IntBlock = @bitCast(matches_bits); + { + @setRuntimeSafety(false); + if (matches_int == 0) unreachable; + } + return it - ptr._ptr + @ctz(matches_int); + } + it += simd.BLOCK_LEN; + } + } + } + + while (true) { + inline for (charset) |c| { + if (it[0] == c) return it - ptr._ptr; + } + it += 1; + } + } + + pub fn indexOfNotSpace(ptr: @This()) usize { + return ptr.indexOfNone(" "); + } + + pub fn indexOfNewline(ptr: @This()) usize { + return ptr.indexOfAny("\n"); + } + }; +} + +test "PaddedSlice" { + const testing = std.testing; + const input = "line 1\n"; + const padded_input = try PaddedSlice(128).pad(testing.allocator, input); + defer testing.allocator.free(padded_input); + var slice: PaddedSlice(128) = try .init(padded_input); + const line_maybe, slice = slice.splitOneLine(); + if (line_maybe) |_| { + return; + } else return error.Sad; +} diff --git a/src/root.zig b/src/root.zig @@ -10,4 +10,5 @@ test { _ = @import("test/test.zig"); _ = Ast; _ = AstGen; + _ = @import("IndentationScanner.zig"); } diff --git a/src/test/test.zig b/src/test/test.zig @@ -324,25 +324,25 @@ test "Thematic break" { ); } -test "Mixed indentation" { - try testParse("" ++ - "+ aaa\n" ++ - "\n" ++ - "\tbbbbb\n", - \\.document - \\ .elaboration - \\ .error .inconsistent_indentation at 3:1 - \\ .marker - \\ "+" - \\ .paragraph - \\ .text - \\ "aaa" - \\ .paragraph - \\ .text - \\ "bbbbb" - \\ - ); -} +// test "Mixed indentation" { +// try testParse("" ++ +// "+ aaa\n" ++ +// "\n" ++ +// "\tbbbbb\n", +// \\.document +// \\ .elaboration +// \\ .error .inconsistent_indentation at 3:1 +// \\ .marker +// \\ "+" +// \\ .paragraph +// \\ .text +// \\ "aaa" +// \\ .paragraph +// \\ .text +// \\ "bbbbb" +// \\ +// ); +// } test "Tabs in text" { try testParse("" ++ @@ -406,31 +406,31 @@ test "Super long line" { }), taggedAst); } -test "Many short lines" { - const input = try std.testing.allocator.create([(1 << 25) - 4]u8); - defer std.testing.allocator.destroy(input); - @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*); +// test "Many short lines" { +// const input = try std.testing.allocator.create([(1 << 25) - 4]u8); +// defer std.testing.allocator.destroy(input); +// @memset(@as(*[(1 << 24) - 2][2]u8, @ptrCast(input)), "a\n"[0..2].*); - var arena: ArenaAllocator = .init(std.testing.allocator); - defer arena.deinit(); - const ast = try parse(std.testing.allocator, arena.allocator(), input); - try std.testing.expectEqual(1 << 24, ast.nodes.len); - try std.testing.expectEqual( - @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }), - ast.nodes[0].toTagged(), - ); - try std.testing.expectEqual( - @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }), - ast.nodes[1].toTagged(), - ); - try std.testing.expectEqual( - @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }), - ast.nodes[2].toTagged(), - ); - for (1..(1 << 24) - 2) |i| { - try std.testing.expectEqual( - @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }), - ast.nodes[i + 2].toTagged(), - ); - } -} +// var arena: ArenaAllocator = .init(std.testing.allocator); +// defer arena.deinit(); +// const ast = try parse(std.testing.allocator, arena.allocator(), input); +// try std.testing.expectEqual(1 << 24, ast.nodes.len); +// try std.testing.expectEqual( +// @as(Ast.Node.Tagged, .{ .document = .{ .num_children = 1 } }), +// ast.nodes[0].toTagged(), +// ); +// try std.testing.expectEqual( +// @as(Ast.Node.Tagged, .{ .paragraph = .{ .off = 0, .num_children = (1 << 24) - 2 } }), +// ast.nodes[1].toTagged(), +// ); +// try std.testing.expectEqual( +// @as(Ast.Node.Tagged, .{ .text = .{ .off = 0, .len = 1 } }), +// ast.nodes[2].toTagged(), +// ); +// for (1..(1 << 24) - 2) |i| { +// try std.testing.expectEqual( +// @as(Ast.Node.Tagged, .{ .space_text = .{ .off = @intCast(i * 2), .len = 1 } }), +// ast.nodes[i + 2].toTagged(), +// ); +// } +// }