mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

commit 94b5af2e6db938f724637a23e0ed1d81f2294741
parent de923c7630cdc9d438a72ba2e423e49b1f041e01
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Sun, 18 May 2025 22:33:09 +0800

didnt fix all bugs

Diffstat:
M src/Ast.zig | 1 +
M src/AstGen2.zig | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M src/str.zig | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/test/test.zig | 18 +++++++++++++++---
4 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/src/Ast.zig b/src/Ast.zig @@ -271,6 +271,7 @@ fn AstRenderer(Writer: type) type { try str.escapeStringForDoubleQuotedString( renderer.writer, renderer.input[data.off .. data.off + data.len], + .padded, ); try renderer.writer.writeByte('"'); try renderer.writer.writeByte('\n'); diff --git a/src/AstGen2.zig b/src/AstGen2.zig @@ -175,6 +175,8 @@ pub fn parse( const ParsingContext = enum { block_context, inline_context }; fn parseRoot(self: *AstGen) !void { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); const root = try self.appendNode(.{ .document = .{} }); assert(root == .root); assert(self.input.ptr == self.cursor.ptr); @@ -211,6 +213,8 @@ fn parseColumn( OutOfErrorIdx, OutOfMemory, }!Column { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context); } @@ -245,8 +249,6 @@ inline fn parseColumnInline( cursor_col: Column, comptime parsing_context: ParsingContext, ) !Column { - const tracy_frame = tracy.trace(@src()); - defer tracy_frame.end(); assert(cursor_col == 0 or parent_col < cursor_col); // Used for "indentation correction". @@ -299,7 +301,8 @@ inline fn parseColumnInline( // but I don't have anything better. (Btw, the original AstGen.zig uses the same recovery logic.) parse_another_block: while (true) { - assert(self.cursor.len > 0 and str.isNoneOf(self.cursor[0], " \t\r\n")); + assert(self.cursor.len > 0); + assert(str.isNoneOf(self.cursor[0], " \t\r\n")); // Will be set by the recursive call (if any), // to indicate how much indentation was previously checked. 
@@ -311,7 +314,9 @@ inline fn parseColumnInline( inline '-', '.', ':', '#' => |m| { const marker_len = try self.findMarkerEnd(m); if (m == '-') { - const potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable; + var potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable; + while (potential_task_item >= 3 and self.cursor[marker_len + potential_task_item - 1] == ' ') + potential_task_item -= 1; if (potential_task_item >= 3 and self.cursor[marker_len + potential_task_item - 1] == ']' and self.cursor[marker_len + potential_task_item - 3] == '[' and @@ -405,16 +410,35 @@ inline fn parseColumnInline( }, '*' => { - if (std.mem.eql(u8, self.cursor[0..3], "***") and - self.cursor[ - 3 + (str.indexOfNone( - self.cursor[3..], - " \t", - ) orelse unreachable) - ] == '\n') - { - _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3); - break :finish_parsing_block; + if (std.mem.eql(u8, self.cursor[0..3], "***")) { + const after_stars = self.cursor[3..]; + const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable; + if (after_stars[skip_whitespace_idx] == '\n') { + _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3); + self.advanceCursor(3 + skip_whitespace_idx + 1); + while (true) { + if (self.cursor.len == 0) return 0; + + const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; + if (self.cursor[next_idx] == '\n') { + self.advanceCursor(next_idx + 1); + continue; + } + + const verified_indentation_idx = std.mem.indexOfDiff( + u8, + self.cursor, + self.indentation[0..block_col], + ) orelse unreachable; + if (verified_indentation_idx == block_col) { + indentation_idx = @intCast(next_idx); + break :finish_parsing_block; + } else { + indentation_idx = @intCast(verified_indentation_idx); + break :finish_parsing_block; + } + } + } } }, @@ -474,19 +498,6 @@ inline fn parseColumnInline( if (self.cursor.len == 0) return 0; 
assert(self.cursor[indentation_idx] != '\n'); - // if (self.cursor[indentation_idx] == '\n') { - // std.debug.assert(false); // TODO: Check this logic - // // Empty line - // self.advanceCursor(indentation_idx + 1); - // // NOTE: null is impossible because input is guaranteed to end in newlines. - // indentation_idx = if (str.indexOfNone(self.cursor, " \t") orelse unreachable) |idx| - // // Explicitly check for indentation length because malicious input is possible - // if (idx > std.math.maxInt(Column)) - // return error.IndentationTooLong - // else - // @intCast(idx); - // continue :finding_block; - // } else if (indentation_idx > cursor_col) { // Matches us but there's too much whitespace. // Fix the indentation. @@ -738,8 +749,8 @@ fn parseParagraph( if (self.cursor.len == 0) return 0; const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; - // block line found, exit if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) { + // block line found, exit const verified_indentation_idx = std.mem.indexOfDiff( u8, self.cursor, @@ -751,6 +762,23 @@ fn parseParagraph( return @intCast(verified_indentation_idx); } } + if (self.cursor[indentation_idx] == '*') { + const after_stars = self.cursor[3..]; + const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable; + if (after_stars[skip_whitespace_idx] == '\n') { + // block line found, exit + const verified_indentation_idx = std.mem.indexOfDiff( + u8, + self.cursor, + self.indentation[0..block_col], + ) orelse unreachable; + if (verified_indentation_idx == block_col) { + return @intCast(indentation_idx); + } else { + return @intCast(verified_indentation_idx); + } + } + } // empty line found, consume to next nonwhitespace and exit if (self.cursor[indentation_idx] == '\n') { self.advanceCursor(indentation_idx + 1); diff --git a/src/str.zig b/src/str.zig @@ -97,25 +97,49 @@ pub fn lastIndexOfNone(s: Str, cs: Charset) ?usize { return mem.lastIndexOfNone(u8, s, cs); } +pub const 
PaddingOption = enum { padded, not_padded }; + pub fn escapeStringForDoubleQuotedString( writer: anytype, slice: []const u8, + comptime has_padding: PaddingOption, ) !void { - return escapeString(writer, slice, .double_quoted_string); + return escapeString(writer, slice, has_padding, .double_quoted_string); } pub fn escapeStringForSingleQuotedString( writer: anytype, slice: []const u8, + comptime has_padding: PaddingOption, ) !void { - return escapeString(writer, slice, .double_quoted_string); + return escapeString(writer, slice, has_padding, .double_quoted_string); } -pub fn fmtEscapes(bytes: []const u8) std.fmt.Formatter(stringEscapeFormatter) { +pub fn fmtEscapes( + bytes: []const u8, + comptime has_padding: PaddingOption, +) std.fmt.Formatter(if (has_padding == .padded) + stringEscapeFormatterWithPadding +else + stringEscapeFormatterWithoutPadding) { return .{ .data = bytes }; } -pub fn stringEscapeFormatter( +pub fn stringEscapeFormatterWithPadding( + bytes: []const u8, + comptime f: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = options; + if (f.len == 1 and f[0] == '\'') { + try escapeString(writer, bytes, .padded, .single_quoted_string); + } else { + try escapeString(writer, bytes, .padded, .double_quoted_string); + } +} + +pub fn stringEscapeFormatterWithoutPadding( bytes: []const u8, comptime f: []const u8, options: std.fmt.FormatOptions, @@ -123,15 +147,16 @@ pub fn stringEscapeFormatter( ) !void { _ = options; if (f.len == 1 and f[0] == '\'') { - try escapeString(writer, bytes, .single_quoted_string); + try escapeString(writer, bytes, .not_padded, .single_quoted_string); } else { - try escapeString(writer, bytes, .double_quoted_string); + try escapeString(writer, bytes, .not_padded, .double_quoted_string); } } pub fn escapeString( writer: anytype, slice: []const u8, + comptime has_padding: PaddingOption, comptime escape_for: enum { double_quoted_string, single_quoted_string }, ) !void { const tracy_frame = 
tracy.trace(@src()); @@ -218,6 +243,87 @@ pub fn escapeString( i += block_len; } } + + if (has_padding == .padded) { + if (i == slice.len) return; + if (slice.len - i >= block_len) { + const load: Block = slice[i .. i + block_len][0..block_len].*; + + const has_low_ctrl = load < @as(Block, @splat(0x20)); + const has_high_ctrl = load >= @as(Block, @splat(0x7f)); + const has_quote = load == @as(Block, @splat(quote)); + const has_backslash = load == @as(Block, @splat('\\')); + + // If any character is escaped, do slow path + if (@reduce(.Or, has_low_ctrl) or + @reduce(.Or, has_high_ctrl) or + @reduce(.Or, has_quote) or + @reduce(.Or, has_backslash)) + { + // uncomment if you want to inspect the assembly, not that it helps much + // @branchHint(.cold); + // adapted from std.zig.stringEscape + for (slice[i .. i + block_len]) |byte| { + switch (byte) { + '\t' => try writer.writeAll("\\t"), + '\r' => try writer.writeAll("\\r"), + '\n' => try writer.writeAll("\\n"), + quote => try writer.writeAll(escaped_quote), + '\\' => try writer.writeAll("\\\\"), + else => if (byte < 0x20 or byte >= 0x7f) { + try writer.writeAll("\\x"); + try std.fmt.formatInt( + byte, + 16, + .lower, + .{ .width = 2, .fill = '0' }, + writer, + ); + } else try writer.writeByte(byte), + } + } + } else { + try writer.writeAll(slice[i .. i + block_len]); + } + + i += block_len; + } + + if (i == slice.len) return; + + { + const load_masks: [block_len]Block = comptime blk: { + var masks: []const Block = &.{}; + var mask: [block_len]u8 = @splat(0x00); + for (0..block_len) |mask_i| { + mask[mask_i] = 0xff; + masks = masks ++ .{@as(Block, mask)}; + } + break :blk masks[0..block_len].*; + }; + + const load: Block = + (slice.ptr[i .. 
i + block_len][0..block_len].* -% + @as(Block, @splat(0x20))) & + load_masks[slice.len - 1 - i]; + + const has_ctrl = load >= @as(Block, @splat(0x7f - 0x20)); + const has_quote = load == @as(Block, @splat(quote - 0x20)); + const has_backslash = load == @as(Block, @splat('\\' - 0x20)); + + // If any character is escaped, do slow path + if (@reduce(.Or, has_ctrl) or + @reduce(.Or, has_quote) or + @reduce(.Or, has_backslash)) + { + // Fall through to the non-simd for loop + } else { + // Fast path, just write the thing + try writer.writeAll(slice[i..]); + return; + } + } + } } } diff --git a/src/test/test.zig b/src/test/test.zig @@ -1,5 +1,6 @@ const std = @import("std"); const parse = @import("../AstGen.zig").parse; +const parse2 = @import("../AstGen2.zig").parse; const Ast = @import("../Ast.zig"); const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{}); @@ -8,11 +9,22 @@ const ArenaAllocator = std.heap.ArenaAllocator; fn testParse(input: []const u8, expected: []const u8) !void { var arena: ArenaAllocator = .init(std.testing.allocator); defer arena.deinit(); - const ast = try parse(std.testing.allocator, arena.allocator(), input); + + const safe_input = try arena.allocator().alloc(u8, input.len + 128); + @memcpy(safe_input[0..input.len], input); + @memset(safe_input[input.len..], '\n'); + + const ast = try parse(std.testing.allocator, arena.allocator(), safe_input); var ast_render: std.ArrayListUnmanaged(u8) = .empty; defer ast_render.deinit(std.testing.allocator); - try ast.renderAst(ast_render.writer(std.testing.allocator), input); + try ast.renderAst(ast_render.writer(std.testing.allocator), safe_input); try std.testing.expectEqualStrings(expected, ast_render.items); + + const ast2 = try parse2(std.testing.allocator, arena.allocator(), safe_input); + var ast2_render: std.ArrayListUnmanaged(u8) = .empty; + defer ast2_render.deinit(std.testing.allocator); + try ast2.renderAst(ast2_render.writer(std.testing.allocator), safe_input); + try 
std.testing.expectEqualStrings(expected, ast2_render.items); } test "Empty" { @@ -362,7 +374,7 @@ test "Empty line in heading" { \\ .text \\ "heading" \\ .space_text - \\ error .unexpected_block_in_inline_context + \\ .error .unexpected_block_in_inline_context \\ "text" \\ .paragraph \\ .text