didnt fix all bugs - mymarkdown

commit 94b5af2e6db938f724637a23e0ed1d81f2294741
parent de923c7630cdc9d438a72ba2e423e49b1f041e01
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Sun, 18 May 2025 22:33:09 +0800

didnt fix all bugs

Diffstat:
M src/Ast.zig  | 1 +
M src/AstGen2.zig  | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M src/str.zig  | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/test/test.zig  | 18 +++++++++++++++---

4 files changed, 184 insertions(+), 37 deletions(-)
diff --git a/src/Ast.zig b/src/Ast.zig
@@ -271,6 +271,7 @@ fn AstRenderer(Writer: type) type {
                     try str.escapeStringForDoubleQuotedString(
                         renderer.writer,
                         renderer.input[data.off .. data.off + data.len],
+                        .padded,
                     );
                     try renderer.writer.writeByte('"');
                     try renderer.writer.writeByte('\n');
diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -175,6 +175,8 @@ pub fn parse(
 const ParsingContext = enum { block_context, inline_context };
 
 fn parseRoot(self: *AstGen) !void {
+    const tracy_frame = tracy.trace(@src());
+    defer tracy_frame.end();
     const root = try self.appendNode(.{ .document = .{} });
     assert(root == .root);
     assert(self.input.ptr == self.cursor.ptr);
@@ -211,6 +213,8 @@ fn parseColumn(
     OutOfErrorIdx,
     OutOfMemory,
 }!Column {
+    const tracy_frame = tracy.trace(@src());
+    defer tracy_frame.end();
     return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context);
 }
 
@@ -245,8 +249,6 @@ inline fn parseColumnInline(
     cursor_col: Column,
     comptime parsing_context: ParsingContext,
 ) !Column {
-    const tracy_frame = tracy.trace(@src());
-    defer tracy_frame.end();
     assert(cursor_col == 0 or parent_col < cursor_col);
 
     // Used for "indentation correction".
@@ -299,7 +301,8 @@ inline fn parseColumnInline(
     // but I don't have anything better. (Btw, the original AstGen.zig uses the same recovery logic.)
 
     parse_another_block: while (true) {
-        assert(self.cursor.len > 0 and str.isNoneOf(self.cursor[0], " \t\r\n"));
+        assert(self.cursor.len > 0);
+        assert(str.isNoneOf(self.cursor[0], " \t\r\n"));
 
         // Will be set by the recursive call (if any),
         // to indicate how much indentation was previously checked.
@@ -311,7 +314,9 @@ inline fn parseColumnInline(
                 inline '-', '.', ':', '#' => |m| {
                     const marker_len = try self.findMarkerEnd(m);
                     if (m == '-') {
-                        const potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable;
+                        var potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable;
+                        while (potential_task_item >= 3 and self.cursor[marker_len + potential_task_item - 1] == ' ')
+                            potential_task_item -= 1;
                         if (potential_task_item >= 3 and
                             self.cursor[marker_len + potential_task_item - 1] == ']' and
                             self.cursor[marker_len + potential_task_item - 3] == '[' and
@@ -405,16 +410,35 @@ inline fn parseColumnInline(
                 },
 
                 '*' => {
-                    if (std.mem.eql(u8, self.cursor[0..3], "***") and
-                        self.cursor[
-                            3 + (str.indexOfNone(
-                                self.cursor[3..],
-                                " \t",
-                            ) orelse unreachable)
-                        ] == '\n')
-                    {
-                        _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3);
-                        break :finish_parsing_block;
+                    if (std.mem.eql(u8, self.cursor[0..3], "***")) {
+                        const after_stars = self.cursor[3..];
+                        const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable;
+                        if (after_stars[skip_whitespace_idx] == '\n') {
+                            _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3);
+                            self.advanceCursor(3 + skip_whitespace_idx + 1);
+                            while (true) {
+                                if (self.cursor.len == 0) return 0;
+
+                                const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+                                if (self.cursor[next_idx] == '\n') {
+                                    self.advanceCursor(next_idx + 1);
+                                    continue;
+                                }
+
+                                const verified_indentation_idx = std.mem.indexOfDiff(
+                                    u8,
+                                    self.cursor,
+                                    self.indentation[0..block_col],
+                                ) orelse unreachable;
+                                if (verified_indentation_idx == block_col) {
+                                    indentation_idx = @intCast(next_idx);
+                                    break :finish_parsing_block;
+                                } else {
+                                    indentation_idx = @intCast(verified_indentation_idx);
+                                    break :finish_parsing_block;
+                                }
+                            }
+                        }
                     }
                 },
 
@@ -474,19 +498,6 @@ inline fn parseColumnInline(
             if (self.cursor.len == 0) return 0;
 
             assert(self.cursor[indentation_idx] != '\n');
-            // if (self.cursor[indentation_idx] == '\n') {
-            //     std.debug.assert(false); // TODO: Check this logic
-            //     // Empty line
-            //     self.advanceCursor(indentation_idx + 1);
-            //     // NOTE: null is impossible because input is guaranteed to end in newlines.
-            //     indentation_idx = if (str.indexOfNone(self.cursor, " \t") orelse unreachable) |idx|
-            //         // Explicitly check for indentation length because malicious input is possible
-            //         if (idx > std.math.maxInt(Column))
-            //             return error.IndentationTooLong
-            //         else
-            //             @intCast(idx);
-            //     continue :finding_block;
-            // } else
             if (indentation_idx > cursor_col) {
                 // Matches us but there's too much whitespace.
                 // Fix the indentation.
@@ -738,8 +749,8 @@ fn parseParagraph(
         if (self.cursor.len == 0) return 0;
 
         const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
-        // block line found, exit
         if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) {
+            // block line found, exit
             const verified_indentation_idx = std.mem.indexOfDiff(
                 u8,
                 self.cursor,
@@ -751,6 +762,23 @@ fn parseParagraph(
                 return @intCast(verified_indentation_idx);
             }
         }
+        if (self.cursor[indentation_idx] == '*') {
+            const after_stars = self.cursor[3..];
+            const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable;
+            if (after_stars[skip_whitespace_idx] == '\n') {
+                // block line found, exit
+                const verified_indentation_idx = std.mem.indexOfDiff(
+                    u8,
+                    self.cursor,
+                    self.indentation[0..block_col],
+                ) orelse unreachable;
+                if (verified_indentation_idx == block_col) {
+                    return @intCast(indentation_idx);
+                } else {
+                    return @intCast(verified_indentation_idx);
+                }
+            }
+        }
         // empty line found, consume to next nonwhitespace and exit
         if (self.cursor[indentation_idx] == '\n') {
             self.advanceCursor(indentation_idx + 1);
diff --git a/src/str.zig b/src/str.zig
@@ -97,25 +97,49 @@ pub fn lastIndexOfNone(s: Str, cs: Charset) ?usize {
     return mem.lastIndexOfNone(u8, s, cs);
 }
 
+pub const PaddingOption = enum { padded, not_padded };
+
 pub fn escapeStringForDoubleQuotedString(
     writer: anytype,
     slice: []const u8,
+    comptime has_padding: PaddingOption,
 ) !void {
-    return escapeString(writer, slice, .double_quoted_string);
+    return escapeString(writer, slice, has_padding, .double_quoted_string);
 }
 
 pub fn escapeStringForSingleQuotedString(
     writer: anytype,
     slice: []const u8,
+    comptime has_padding: PaddingOption,
 ) !void {
-    return escapeString(writer, slice, .double_quoted_string);
+    return escapeString(writer, slice, has_padding, .single_quoted_string);
 }
 
-pub fn fmtEscapes(bytes: []const u8) std.fmt.Formatter(stringEscapeFormatter) {
+pub fn fmtEscapes(
+    bytes: []const u8,
+    comptime has_padding: PaddingOption,
+) std.fmt.Formatter(if (has_padding == .padded)
+    stringEscapeFormatterWithPadding
+else
+    stringEscapeFormatterWithoutPadding) {
     return .{ .data = bytes };
 }
 
-pub fn stringEscapeFormatter(
+pub fn stringEscapeFormatterWithPadding(
+    bytes: []const u8,
+    comptime f: []const u8,
+    options: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = options;
+    if (f.len == 1 and f[0] == '\'') {
+        try escapeString(writer, bytes, .padded, .single_quoted_string);
+    } else {
+        try escapeString(writer, bytes, .padded, .double_quoted_string);
+    }
+}
+
+pub fn stringEscapeFormatterWithoutPadding(
     bytes: []const u8,
     comptime f: []const u8,
     options: std.fmt.FormatOptions,
@@ -123,15 +147,16 @@ pub fn stringEscapeFormatter(
 ) !void {
     _ = options;
     if (f.len == 1 and f[0] == '\'') {
-        try escapeString(writer, bytes, .single_quoted_string);
+        try escapeString(writer, bytes, .not_padded, .single_quoted_string);
     } else {
-        try escapeString(writer, bytes, .double_quoted_string);
+        try escapeString(writer, bytes, .not_padded, .double_quoted_string);
     }
 }
 
 pub fn escapeString(
     writer: anytype,
     slice: []const u8,
+    comptime has_padding: PaddingOption,
     comptime escape_for: enum { double_quoted_string, single_quoted_string },
 ) !void {
     const tracy_frame = tracy.trace(@src());
@@ -218,6 +243,87 @@ pub fn escapeString(
                     i += block_len;
                 }
             }
+
+            if (has_padding == .padded) {
+                if (i == slice.len) return;
+                if (slice.len - i >= block_len) {
+                    const load: Block = slice[i .. i + block_len][0..block_len].*;
+
+                    const has_low_ctrl = load < @as(Block, @splat(0x20));
+                    const has_high_ctrl = load >= @as(Block, @splat(0x7f));
+                    const has_quote = load == @as(Block, @splat(quote));
+                    const has_backslash = load == @as(Block, @splat('\\'));
+
+                    // If any character is escaped, do slow path
+                    if (@reduce(.Or, has_low_ctrl) or
+                        @reduce(.Or, has_high_ctrl) or
+                        @reduce(.Or, has_quote) or
+                        @reduce(.Or, has_backslash))
+                    {
+                        // uncomment if you want to inspect the assembly, not that it helps much
+                        // @branchHint(.cold);
+                        // adapted from std.zig.stringEscape
+                        for (slice[i .. i + block_len]) |byte| {
+                            switch (byte) {
+                                '\t' => try writer.writeAll("\\t"),
+                                '\r' => try writer.writeAll("\\r"),
+                                '\n' => try writer.writeAll("\\n"),
+                                quote => try writer.writeAll(escaped_quote),
+                                '\\' => try writer.writeAll("\\\\"),
+                                else => if (byte < 0x20 or byte >= 0x7f) {
+                                    try writer.writeAll("\\x");
+                                    try std.fmt.formatInt(
+                                        byte,
+                                        16,
+                                        .lower,
+                                        .{ .width = 2, .fill = '0' },
+                                        writer,
+                                    );
+                                } else try writer.writeByte(byte),
+                            }
+                        }
+                    } else {
+                        try writer.writeAll(slice[i .. i + block_len]);
+                    }
+
+                    i += block_len;
+                }
+
+                if (i == slice.len) return;
+
+                {
+                    const load_masks: [block_len]Block = comptime blk: {
+                        var masks: []const Block = &.{};
+                        var mask: [block_len]u8 = @splat(0x00);
+                        for (0..block_len) |mask_i| {
+                            mask[mask_i] = 0xff;
+                            masks = masks ++ .{@as(Block, mask)};
+                        }
+                        break :blk masks[0..block_len].*;
+                    };
+
+                    const load: Block =
+                        (slice.ptr[i .. i + block_len][0..block_len].* -%
+                            @as(Block, @splat(0x20))) &
+                        load_masks[slice.len - 1 - i];
+
+                    const has_ctrl = load >= @as(Block, @splat(0x7f - 0x20));
+                    const has_quote = load == @as(Block, @splat(quote - 0x20));
+                    const has_backslash = load == @as(Block, @splat('\\' - 0x20));
+
+                    // If any character is escaped, do slow path
+                    if (@reduce(.Or, has_ctrl) or
+                        @reduce(.Or, has_quote) or
+                        @reduce(.Or, has_backslash))
+                    {
+                        // Fall through to the non-simd for loop
+                    } else {
+                        // Fast path, just write the thing
+                        try writer.writeAll(slice[i..]);
+                        return;
+                    }
+                }
+            }
         }
     }
 
diff --git a/src/test/test.zig b/src/test/test.zig
@@ -1,5 +1,6 @@
 const std = @import("std");
 const parse = @import("../AstGen.zig").parse;
+const parse2 = @import("../AstGen2.zig").parse;
 const Ast = @import("../Ast.zig");
 
 const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{});
@@ -8,11 +9,22 @@ const ArenaAllocator = std.heap.ArenaAllocator;
 fn testParse(input: []const u8, expected: []const u8) !void {
     var arena: ArenaAllocator = .init(std.testing.allocator);
     defer arena.deinit();
-    const ast = try parse(std.testing.allocator, arena.allocator(), input);
+
+    const safe_input = try arena.allocator().alloc(u8, input.len + 128);
+    @memcpy(safe_input[0..input.len], input);
+    @memset(safe_input[input.len..], '\n');
+
+    const ast = try parse(std.testing.allocator, arena.allocator(), safe_input);
     var ast_render: std.ArrayListUnmanaged(u8) = .empty;
     defer ast_render.deinit(std.testing.allocator);
-    try ast.renderAst(ast_render.writer(std.testing.allocator), input);
+    try ast.renderAst(ast_render.writer(std.testing.allocator), safe_input);
     try std.testing.expectEqualStrings(expected, ast_render.items);
+
+    const ast2 = try parse2(std.testing.allocator, arena.allocator(), safe_input);
+    var ast2_render: std.ArrayListUnmanaged(u8) = .empty;
+    defer ast2_render.deinit(std.testing.allocator);
+    try ast2.renderAst(ast2_render.writer(std.testing.allocator), safe_input);
+    try std.testing.expectEqualStrings(expected, ast2_render.items);
 }
 
 test "Empty" {
@@ -362,7 +374,7 @@ test "Empty line in heading" {
         \\    .text
         \\      "heading"
         \\    .space_text
-        \\      error .unexpected_block_in_inline_context
+        \\      .error .unexpected_block_in_inline_context
         \\      "text"
         \\  .paragraph
         \\    .text

	mymarkdown My markdown
	git clone https://git.grace.moe/mymarkdown
	Log \| Files \| Refs

M	src/Ast.zig	\|	1	+
M	src/AstGen2.zig	\|	84	+++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M	src/str.zig	\|	118	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	src/test/test.zig	\|	18	+++++++++++++++---