mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

commit 561fee2b93050e3cc92927e9a73b14252d45ed85
parent c08e3785171f96ac5779ee9c5577dc2ce4fa6c9a
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Sun, 18 May 2025 01:48:45 +0800

v0 vs v1

Diffstat:
Msrc/Ast.zig | 169+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Msrc/AstGen.zig | 24+++++++++++++++++-------
Asrc/AstGen2.zig | 801+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/main.zig | 40++++++++++++++++++++++------------------
Msrc/root.zig | 2++
Msrc/str.zig | 18++++++++++++++++++
Msrc/test/test.zig | 6+++++-
Msrc/utils.zig | 10++++++----
8 files changed, 980 insertions(+), 90 deletions(-)

diff --git a/src/Ast.zig b/src/Ast.zig @@ -37,12 +37,12 @@ pub const Node = utils.Packed(union(enum(u8)) { pub const Idx = utils.NewType(u24, opaque {}); pub const Root = packed struct { - num_children: u24 = 0, + num_children: Idx.IntType = 0, pub const format = utils.structFormat(@This()); }; pub const Container = packed struct { off: StrOffset, - num_children: u24 = 0, + num_children: Idx.IntType = 0, pub const format = utils.structFormat(@This()); }; pub const Leaf = packed struct { @@ -55,7 +55,9 @@ pub const Node = utils.Packed(union(enum(u8)) { pub fn incrementNumChildren(self: *Node) void { switch (self.tag) { inline else => |t| { - if (@TypeOf(@field(self.data, @tagName(t))) == Container or @TypeOf(@field(self.data, @tagName(t))) == Root) { + if (@TypeOf(@field(self.data, @tagName(t))) == Container or + @TypeOf(@field(self.data, @tagName(t))) == Root) + { @field(self.data, @tagName(t)).num_children += 1; } else unreachable; }, @@ -67,8 +69,8 @@ pub const Node = utils.Packed(union(enum(u8)) { pub const Error = utils.Packed(union(enum(u8)) { marker_too_long: NodeError, + unexpected_block_in_inline_context: NodeError, invalid_marker: PointError, - empty_line_in_inline_block: PointError, inconsistent_indentation: PointError, /// Used when the error diagnostic spans the entire node @@ -126,7 +128,13 @@ pub fn toTagged(self: Ast, gpa: Allocator) !Tagged { } pub fn renderAst(self: Ast, writer: anytype, input: []const u8) !void { - var renderer: AstRenderer(@TypeOf(writer)) = .{ .self = self, .writer = writer, .input = input, .cached_offset = 0, .cached_pos = .{ .row = 1, .col = 1 } }; + var renderer: AstRenderer(@TypeOf(writer)) = .{ + .self = self, + .writer = writer, + .input = input, + .cached_offset = 0, + .cached_pos = .{ .row = 1, .col = 1 }, + }; const node_idx, const error_idx = try renderer.renderAstInner(0, 0, 0); assert(node_idx == self.nodes.len); assert(error_idx == self.errors.len); @@ -175,72 +183,110 @@ fn AstRenderer(Writer: type) type { } } - pub fn renderAstInner(renderer: *@This(), start_node: usize, start_error: usize, indent: usize) !struct { usize, usize } { + pub fn renderAstInner( + renderer: *@This(), + start_node: usize, + start_error: usize, + indent: usize, + ) !struct { usize, usize } { const tracy_frame = tracy.trace(@src()); defer tracy_frame.end(); + try renderer.writer.writeByteNTimes(' ', indent); + try renderer.writer.writeByte('.'); + try renderer.writer.writeAll(@tagName(renderer.self.nodes[start_node].tag)); + try renderer.writer.writeByte('\n'); + var cur_node: usize = start_node + 1; + var cur_error: usize = start_error; + while (cur_error < renderer.self.errors.len and + @intFromEnum(renderer.self.errors[cur_error].get(.idx)) == start_node) + { + try renderer.writer.writeByteNTimes(' ', indent + 2); + try renderer.writer.writeAll(".error ."); + try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag)); + switch (renderer.self.errors[cur_error].toTagged()) { + // PointError + .inconsistent_indentation, + .invalid_marker, + => |error_data| { + try renderer.writer.print( + " at {}:{}", + renderer.offsetToPos(error_data.off), + ); + }, + // NodeError + .unexpected_block_in_inline_context, + .marker_too_long, + => {}, + } + try renderer.writer.writeByte('\n'); + cur_error += 1; + } switch (renderer.self.nodes[start_node].toTagged()) { - inline else => |data, tag| { - try renderer.writer.writeByteNTimes(' ', indent); - try renderer.writer.writeByte('.'); - try renderer.writer.writeAll(@tagName(tag)); - try renderer.writer.writeByte('\n'); - var cur_node: usize = start_node + 1; - var cur_error: usize = start_error; - while (cur_error < renderer.self.errors.len and @intFromEnum(renderer.self.errors[cur_error].get(.idx)) == start_node) { - switch (renderer.self.errors[cur_error].toTagged()) { - // PointError - .empty_line_in_inline_block, - .inconsistent_indentation, - .invalid_marker, - => |error_data| { - try renderer.writer.writeByteNTimes(' ', indent + 2); - try renderer.writer.writeAll(".error ."); - try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag)); - try renderer.writer.print(" at {}:{}", renderer.offsetToPos(error_data.off)); - try renderer.writer.writeByte('\n'); - }, - // NodeError - .marker_too_long => { - try renderer.writer.writeByteNTimes(' ', indent + 2); - try renderer.writer.writeAll("error ."); - try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag)); - try renderer.writer.writeByte('\n'); - }, - } - cur_error += 1; + .document, + => |data| { + for (0..data.num_children) |_| { + assert(cur_node < renderer.self.nodes.len); + cur_node, cur_error = try renderer.renderAstInner( + cur_node, + cur_error, + indent + 2, + ); + assert(cur_node <= renderer.self.nodes.len); } - if (@TypeOf(data) == Node.Tagged.Container or @TypeOf(data) == Node.Tagged.Root) { - for (0..data.num_children) |_| { - assert(cur_node < renderer.self.nodes.len); - cur_node, cur_error = try renderer.renderAstInner(cur_node, cur_error, indent + 2); - assert(cur_node <= renderer.self.nodes.len); - } - return .{ cur_node, cur_error }; - } else if (@TypeOf(data) == Node.Tagged.Leaf) { - try renderer.writer.writeByteNTimes(' ', indent + 2); - // This was too slow! - // try renderer.writer.print("\"{}\"", .{ - // std.zig.fmtEscapes(renderer.input[data.off .. data.off + data.len]), - // }); - try renderer.writer.writeByte('"'); - try str.escapeStringForDoubleQuotedString( - renderer.writer, - renderer.input[data.off .. data.off + data.len], + return .{ cur_node, cur_error }; + }, + + .heading, + .quote, + .paragraph, + .unordered_item, + .ordered_item, + .term_item, + .task_item, + .elaboration, + => |data| { + for (0..data.num_children) |_| { + assert(cur_node < renderer.self.nodes.len); + cur_node, cur_error = try renderer.renderAstInner( + cur_node, + cur_error, + indent + 2, ); - try renderer.writer.writeByte('"'); - try renderer.writer.writeByte('\n'); - return .{ cur_node, cur_error }; - } else { - @compileError(@typeName(@TypeOf(data))); + assert(cur_node <= renderer.self.nodes.len); } + return .{ cur_node, cur_error }; + }, + + .marker, + .thematic_break, + .text, + .space_text, + => |data| { + try renderer.writer.writeByteNTimes(' ', indent + 2); + // This was too slow! + // try renderer.writer.print("\"{}\"", .{ + // std.zig.fmtEscapes(renderer.input[data.off .. data.off + data.len]), + // }); + try renderer.writer.writeByte('"'); + try str.escapeStringForDoubleQuotedString( + renderer.writer, + renderer.input[data.off .. data.off + data.len], + ); + try renderer.writer.writeByte('"'); + try renderer.writer.writeByte('\n'); + return .{ cur_node, cur_error }; }, } - unreachable; } }; } -pub fn renderHtml(self: Ast, writer: anytype, input: []const u8, start_: ?Node.Idx) !?Node.Idx { +pub fn renderHtml( + self: Ast, + writer: anytype, + input: []const u8, + start_: ?Node.Idx, +) !?Node.Idx { const tracy_frame = tracy.trace(@src()); defer tracy_frame.end(); const start: Node.Idx = start_ orelse @enumFromInt(0); @@ -261,7 +307,10 @@ pub fn renderHtml(self: Ast, writer: anytype, input: []const u8, start_: ?Node.I var cur_idx: ?Node.Idx = start.next(); switch (self.nodes[@intFromEnum(start)].tag) { inline .document, .paragraph => |t| { - const data = @field(self.nodes[@intFromEnum(start)].data, @tagName(t)); + const data = @field( + self.nodes[@intFromEnum(start)].data, + @tagName(t), + ); for (0..data.num_children) |_| { if (cur_idx) |idx| { cur_idx = try self.renderHtml(writer, input, idx); diff --git a/src/AstGen.zig b/src/AstGen.zig @@ -187,7 +187,7 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: N const tracy_frame = tracy.trace(@src()); defer tracy_frame.end(); var lines = lines_; - var empty_line_off: ?u32 = null; + var saw_empty_line: bool = false; outer: { // empty lines at the start of the inline block are fine, just skip these @@ -230,12 +230,13 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: N if (lines.len == 0) break :outer; if (lines[0].len != 0) break; // empty line detected - empty_line_off = self.calcOffset(@ptrCast(lines[0].ptr)); + saw_empty_line = true; } - if (empty_line_off) |off| { + if (saw_empty_line) { + saw_empty_line = false; try self.errors.append(gpa, .fromTagged(.{ - .empty_line_in_inline_block = .{ .idx = self.nextNodeIdx(), .off = off }, + .unexpected_block_in_inline_context = .{ .idx = self.nextNodeIdx() }, })); } @@ -309,7 +310,16 @@ fn parseColumn(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: Node.I num_lines += 1; } - const paragraph_lines = lines[0..num_lines]; + var paragraph_lines = lines[0..num_lines]; + if (lines[0][0] == ' ' or lines[0][0] == '\t') { + try self.errors.append(gpa, .fromTagged(.{ + .inconsistent_indentation = .{ + .idx = self.lastNodeIdx(), + .off = self.calcOffset(&lines[0][0]), + }, + })); + paragraph_lines = try self.findIndentedColumn(gpa, paragraph_lines, child); + } lines = lines[num_lines..]; try self.parseInlineBlock(gpa, paragraph_lines, child); }, @@ -446,9 +456,9 @@ const block_specs = blockSpecs(struct { pub const @"+": BlockSpec = &.{ .{ .tag = .elaboration, - .marker = .{ .starts_with = "+" }, + .marker = .{ .starts_with_multi = .{ .marker_char = '+' } }, .mode = .indented_column, - .store_marker_child = .no_store, + .store_marker_child = .store, }, }; pub const @";": BlockSpec = &.{ diff --git a/src/AstGen2.zig b/src/AstGen2.zig @@ -0,0 +1,801 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; + +const ziggy = @import("ziggy"); +const tracy = @import("tracy"); + +const utils = @import("utils.zig"); +const str = @import("str.zig"); +const Ast = @import("Ast.zig"); +const StrOffset = Ast.StrOffset; +const StrLen = Ast.StrLen; +const Node = Ast.Node; +const Error = Ast.Error; + +const Column = u10; + +const AstGen = @This(); + +gpa: Allocator, +output_gpa: Allocator, +output_gpa_same_as_gpa: bool, +input: []const u8, +cursor: []const u8, // suffix of input +indentation: [std.math.maxInt(Column)]u8, +nodes: std.ArrayListUnmanaged(Node), +errors: std.ArrayListUnmanaged(Error), +extra: std.ArrayListUnmanaged(u32), + +fn cursorOffset(self: *AstGen) StrOffset { + return @intCast(self.cursor.ptr - self.input.ptr); +} +fn advanceCursor(self: *AstGen, advance: usize) void { + // NOTE: `advance` should really be u32, but this makes it easier to work with other str functions. + self.cursor = self.cursor[advance..]; +} +fn findMarkerEnd(self: AstGen, m: u8) error{IndentationTooLong}!Column { + // NOTE: null is impossible because input is guaranteed to end in newlines. + const idx = str.indexOfNotChar(self.cursor, m) orelse unreachable; + // Explicitly check for marker length because malicious input is possible + if (idx > std.math.maxInt(Column)) + return error.IndentationTooLong; + return @intCast(idx); +} + +fn getNode(self: AstGen, idx: Node.Idx) *Node { + @setRuntimeSafety(true); + return &self.nodes.items[@intFromEnum(idx)]; +} +fn lastNodeIdx(self: AstGen) Node.Idx { + @setRuntimeSafety(true); + return @enumFromInt(self.nodes.items.len - 1); +} +fn nextNodeIdx(self: AstGen) Node.Idx { + @setRuntimeSafety(true); + return @enumFromInt(self.nodes.items.len); +} +fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx { + if (self.nodes.items.len > std.math.maxInt( + @typeInfo(Node.Idx).@"enum".tag_type, + )) return error.OutOfNodeIdx; + const idx = self.nodes.items.len; + try self.nodes.append(self.gpa, .fromTagged(node)); + return @enumFromInt(idx); +} +fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx { + self.getNode(parent_idx).incrementNumChildren(); + return try self.appendNode( + @unionInit( + Node.Tagged, + @tagName(container_node_tag), + .{ .off = self.cursorOffset() }, + ), + ); +} +fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx { + self.getNode(parent_idx).incrementNumChildren(); + return try self.appendNode( + @unionInit( + Node.Tagged, + @tagName(leaf_node_tag), + .{ .off = self.cursorOffset(), .len = len }, + ), + ); +} +fn appendError(self: *AstGen, err: Error.Tagged) !void { + if (self.errors.items.len > std.math.maxInt( + @typeInfo(Error.Idx).@"enum".tag_type, + )) return error.OutOfErrorIdx; + try self.errors.append(self.gpa, .fromTagged(err)); +} +fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void { + try self.appendError( + @unionInit( + Error.Tagged, + @tagName(tag), + .{ .idx = idx, .off = self.cursorOffset() }, + ), + ); +} +fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void { + try self.appendError( + @unionInit( + Error.Tagged, + @tagName(tag), + .{ .idx = idx }, + ), + ); +} + +pub fn deinit(self: *AstGen) void { + self.nodes.deinit(self.gpa); + self.errors.deinit(self.gpa); + self.extra.deinit(self.gpa); +} + +/// Parses mymarkdown +/// +/// : `gpa` +/// + A suitable allocator for scratch allocations that supports free and ideally remap. +/// : `output_gpa` +/// + If passed, no scratch allocations will outlive this function, +/// and any allocations returned will be allocated on this. +/// : `input` +/// + The input slice to be parsed. Must end in at least 1024 \n characters. +/// +/// Errors: +/// : `IndentationTooLong` +/// + This implementation of mymarkdown supports up to 1023 characters of indentation. +pub fn parse( + gpa: Allocator, + output_gpa: ?Allocator, + input: []const u8, +) !Ast { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + + if (@typeInfo(Column).int.bits > @typeInfo(StrLen).int.bits) + @compileError("Column should have less bits than StrLen"); + if (input.len < 128 or !std.mem.eql(u8, input[input.len - 128 ..], "\n" ** 128)) + return error.InputUnsafe; + if (input.len > std.math.maxInt(u32)) + return error.InputTooLarge; + + var ast: AstGen = .{ + .gpa = gpa, + .output_gpa = output_gpa orelse gpa, + .output_gpa_same_as_gpa = output_gpa == null, + .input = input, + .cursor = input, + .indentation = undefined, + .nodes = .empty, + .errors = .empty, + .extra = .empty, + }; + defer ast.deinit(); + + try ast.parseRoot(); + + if (output_gpa) |gpa2| { + return .{ + .nodes = try gpa2.dupe(Node, ast.nodes.items), + .errors = try gpa2.dupe(Error, ast.errors.items), + .extra = try gpa2.dupe(u32, ast.extra.items), + }; + } else { + return .{ + .nodes = try ast.nodes.toOwnedSlice(gpa), + .errors = try ast.errors.toOwnedSlice(gpa), + .extra = try ast.extra.toOwnedSlice(gpa), + }; + } +} + +const ParsingContext = enum { block_context, inline_context }; + +fn parseRoot(self: *AstGen) !void { + const root = try self.appendNode(.{ .document = .{} }); + assert(root == .root); + assert(self.input.ptr == self.cursor.ptr); + assert(self.input.len == self.cursor.len); + + if (str.indexOfNone(self.input, " \t\r\n")) |idx| { + if (idx == 0 or self.input[idx - 1] != '\n') { + // Happy case: input starts at the start of the line :) + self.advanceCursor(idx); + } else { + // Input doesn't start at the start of the line :( + // Log an error but otherwise proceed as usual + try self.appendPointErrorAtCursor(.inconsistent_indentation, root); + self.advanceCursor(idx); + } + + // The actual parse + // "inline" hack to get different branch predictors for the root column + _ = try self.parseColumnInline(root, 0, 0, .block_context); + } else { + // Input is completely empty, return without really parsing anything + } +} + +fn parseColumn( + self: *AstGen, + parent_idx: Node.Idx, + parent_col: Column, + cursor_col: Column, + comptime parsing_context: ParsingContext, +) error{ + IndentationTooLong, + OutOfNodeIdx, + OutOfErrorIdx, + OutOfMemory, +}!Column { + return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context); +} + +fn parseMarkerItem( + self: *AstGen, + comptime node_tag: Node.Tag, + parent_idx: Node.Idx, + block_col: Column, + marker_len: Column, + comptime output_marker: enum { output_marker, no_output_marker }, + comptime parent_parsing_context: ParsingContext, + comptime child_parsing_context: ParsingContext, +) !Column { + const block_idx = try self.appendContainerNodeAtCursor(parent_idx, node_tag); + if (parent_parsing_context == .inline_context) + try self.appendNodeErrorAtCursor(.unexpected_block_in_inline_context, block_idx); + if (output_marker == .output_marker) { + _ = try self.appendLeafNodeAtCursor(block_idx, .marker, marker_len); + } + switch (try self.findIndentation(block_col, marker_len)) { + .found_column => |child_col| { + return try self.parseColumn(block_idx, block_col, child_col, child_parsing_context); + }, + .mismatched_indentation => |indentation_idx_found| return indentation_idx_found, + } +} + +inline fn parseColumnInline( + self: *AstGen, + parent_idx: Node.Idx, + parent_col: Column, + cursor_col: Column, + comptime parsing_context: ParsingContext, +) !Column { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + assert(cursor_col == 0 or parent_col < cursor_col); + + // Used for "indentation correction". + // For simplicity, just think of this as this column's indentation. + var block_col = cursor_col; + + var parsed_first_paragraph_for_inline_context: bool = false; + + // # parseColumn's input parameter explanation + // + AKA: How to deal with indentation + // + // Our cursor points at the first (usually non-whitespace) char of the column. + // (The caller is in charge of finding the start of the column.) + // + // We also need both our own and our parent's indentations. + // These are represented as column indices that indicate + // how many characters of `self.indentation` should match. + // indicating where the parent column's ends, + // as well as where this column's indentation ends. + // For the root column, both of these values are 0, which means the column + // can never be exited from until it reaches the end of the file. + // + // === + // column 0, file starts here + // | column 2, parent column starts here + // | | column 5, our column starts here + // v v v + // + // | Parent column (parent's indentation = " ", represented as parent_col = 2) + // | v (cursor points at H, our indentation = " ", represented as cursor_col = 5) + // | Hello there + // | Same column + // | Parent column + // === + // + // Lines that match with our indentation are considered part of our column, + // and the column ends upon finding a line aligns with the parent indentation. + // When a line matches the parent's indentation but NOT ours, we log an error, + // then do error recovery by pretending it does match with our indentation. + // + // === + // | Parent column + // | v (cursor) + // | Hello there + // | \t Same column but syntax error! (" \t " does not match with " ", but matches " ") + // | Parent column + // === + // + // This indentation recovery system is not the best and it can fix indentation "incorrectly", + // but I don't have anything better. (Btw, the original AstGen.zig uses the same recovery logic.) + + parse_another_block: while (true) { + assert(self.cursor.len > 0 and str.isNoneOf(self.cursor[0], " \t\r\n")); + + // Will be set by the recursive call (if any), + // to indicate how much indentation was previously checked. + var indentation_idx: Column = undefined; + + finish_parsing_block: { + switch (self.cursor[0]) { + // Par-like repeatable markers + inline '-', '.', ':', '#' => |m| { + const marker_len = try self.findMarkerEnd(m); + if (m == '-') { + const potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable; + if (potential_task_item >= 3 and + self.cursor[marker_len + potential_task_item - 1] == ']' and + self.cursor[marker_len + potential_task_item - 3] == '[' and + (self.cursor[marker_len + potential_task_item - 2] == ' ' or + self.cursor[marker_len + potential_task_item - 2] == 'x' or + self.cursor[marker_len + potential_task_item - 2] == 'X') and + std.mem.allEqual(u8, self.cursor[marker_len .. marker_len + potential_task_item - 3], ' ')) + { + if (marker_len + potential_task_item > std.math.maxInt(Column)) + return error.IndentationTooLong; + indentation_idx = try self.parseMarkerItem( + .task_item, + parent_idx, + block_col, + @intCast(marker_len + potential_task_item), + .output_marker, + parsing_context, + .inline_context, + ); + break :finish_parsing_block; + } + } + indentation_idx = try self.parseMarkerItem( + switch (m) { + '-' => .unordered_item, + '.' => .ordered_item, + ':' => .term_item, + '#' => .heading, + else => unreachable, + }, + parent_idx, + block_col, + marker_len, + .output_marker, + parsing_context, + .inline_context, + ); + break :finish_parsing_block; + }, + + // Div-like repeatable markers + inline '+' => |m| { + const marker_len = try self.findMarkerEnd(m); + indentation_idx = try self.parseMarkerItem( + switch (m) { + '+' => .elaboration, + else => unreachable, + }, + parent_idx, + block_col, + marker_len, + .output_marker, + parsing_context, + .block_context, + ); + break :finish_parsing_block; + }, + + // Par-like single markers + inline ';' => |m| { + indentation_idx = try self.parseMarkerItem( + switch (m) { + ';' => .paragraph, + else => unreachable, + }, + parent_idx, + block_col, + 1, + .no_output_marker, + parsing_context, + .inline_context, + ); + break :finish_parsing_block; + }, + + // Div-like single markers + inline '>' => |m| { + indentation_idx = try self.parseMarkerItem( + switch (m) { + '>' => .quote, + else => unreachable, + }, + parent_idx, + block_col, + 1, + .no_output_marker, + parsing_context, + .block_context, + ); + break :finish_parsing_block; + }, + + '*' => { + if (std.mem.eql(u8, self.cursor[0..3], "***") and + self.cursor[ + 3 + (str.indexOfNone( + self.cursor[3..], + " \t", + ) orelse unreachable) + ] == '\n') + { + _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3); + break :finish_parsing_block; + } + }, + + else => {}, + } + + // Handle paragraph + switch (parsing_context) { + .inline_context => { + if (parsed_first_paragraph_for_inline_context) { + try self.appendNodeErrorAtCursor(.unexpected_block_in_inline_context, self.nextNodeIdx()); + indentation_idx = try self.parseParagraph(.space_text, parent_idx, parent_col, block_col); + } else { + indentation_idx = try self.parseParagraph(.text, parent_idx, parent_col, block_col); + parsed_first_paragraph_for_inline_context = true; + } + }, + .block_context => { + const paragraph_idx = try self.appendContainerNodeAtCursor(parent_idx, .paragraph); + indentation_idx = try self.parseParagraph(.text, paragraph_idx, parent_col, block_col); + }, + } + } + + // We just finished parsing a block, so cursor points at the start of a line: + // + // === + // | Parent column + // | Hello there + // | Same column + // |v----------------- (cursor) + // | Parent column + // === + // + // We need to find the next block. This involves checking for empty lines and indentation. + // + // We find where the indentation differs, if any. There are 6 cases: + // - **The line is empty.** + // + We loop again, looking for another block. + // - **Matches us, next char is non-whitespace.** + // + Happy path, we continue parsing as usual. + // - **Matches us, next char is whitespace.** + // + Log and error and recover by treating all leading whitespace + // as if it matched our indentation level exactly. + // - **Does not even match parent.** + // + Return with no errors. + // - **Matches parent and not us, next char is non-whitespace.** + // + Return with no errors. + // - **Matches parent and not us, next char is whitespace.** + // + Log and error and recover by treating all leading whitespace + // as if it matched our indentation level exactly. + + block_col = cursor_col; + // finding_block: + while (true) { + // Special case: when we hit EOF, there's nothing left to parse. + if (self.cursor.len == 0) return 0; + + assert(self.cursor[indentation_idx] != '\n'); + // if (self.cursor[indentation_idx] == '\n') { + // std.debug.assert(false); // TODO: Check this logic + // // Empty line + // self.advanceCursor(indentation_idx + 1); + // // NOTE: null is impossible because input is guaranteed to end in newlines. + // indentation_idx = if (str.indexOfNone(self.cursor, " \t") orelse unreachable) |idx| + // // Explicitly check for indentation length because malicious input is possible + // if (idx > std.math.maxInt(Column)) + // return error.IndentationTooLong + // else + // @intCast(idx); + // continue :finding_block; + // } else + if (indentation_idx > cursor_col) { + // Matches us but there's too much whitespace. + // Fix the indentation. + // Here, we fix the indentation by pretending that + // the block starts from wherever the whitespace ended. + block_col = indentation_idx; + @memcpy( + self.indentation[cursor_col..block_col], + self.cursor[cursor_col..block_col], + ); + // Log the error. + self.advanceCursor(cursor_col); + try self.appendPointErrorAtCursor(.inconsistent_indentation, self.nextNodeIdx()); + // Continue parsing. + self.advanceCursor(indentation_idx - cursor_col); + continue :parse_another_block; + } else if (indentation_idx == cursor_col) { + // Matches us exactly. + self.advanceCursor(indentation_idx); + continue :parse_another_block; + } else if (indentation_idx > parent_col) { + // Matches parent but there's extra whitespace that doesn't match us. + // Log the error. + self.advanceCursor(parent_col); + try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx); + // Fix the indentation. + // Here, we fix the indentation by pretending that + // the block starts from the correct amount of whitespace. + // <no op> + + // Continue parsing. + self.advanceCursor(indentation_idx - parent_col); + continue :parse_another_block; + } else { + // Matches parent exactly or doesn't match parent, return. + return indentation_idx; + } + } + } +} + +/// Finds where the indented block starts +fn findIndentation( + self: *AstGen, + parent_col: Column, + skip: Column, +) !union(enum) { + found_column: Column, + mismatched_indentation: Column, +} { + // We're given the input at the marker. + // + // === + // parent_col + // v + // + // | Parent column + // | v------------ (cursor) + // | - Hello there + // | Same column + // | Parent column + // === + // + // We first skip some number of characters, and then scan forward until we find non-whitespace. + // + // === + // first skip... + // + // parent_col + // v + // + // | Parent column + // | v------------ (cursor) + // | - Hello there + // | Same column + // | Parent column + // + // then scan for non-whitespace: + // + // parent_col + // v + // + // | Parent column + // | v------------ (cursor) + // | - Hello there + // | Same column + // | Parent column + // === + // + // Then we store the indentation found in `self.indentation` and return the indentation column (in this case 5). + // Because in this case we found the non-whitespace character on the first line, we will memset the marker to spaces. + // + // - return = 5 + // - self.indentation = ` ` + // + // *** + // + // There are a couple other cases. + // + // This next case is when the non-whitespace char is not on the same line. + // In this case we can simply copy the indentation from the input. + // + // === + // parent_col + // v + // + // | Parent column + // | + // | - v------------ (cursor) + // | Hello there + // | Same column + // | Parent column + // === + // + // - return = 5 + // - self.indentation = ` ` + // + // *** + // + // This next case is we find that the indentation does not match the parent's indentation. + // Note that this can only happen when the non-whitespace char is not on the same line as the initial cursor, + // since the indentation behind the initial cursor has already been checked by the caller. + // + // === + // parent_col + // v + // + // | Parent column + // | + // | - v------------ (cursor) + // | \t Hello there + // | Same column + // | Parent column + // === + // + // - return = mismatched indentation at column 1 + // + // Here, parent's indentation is ` ` but we saw ` \t `. + // In this case we return how many characters did match to the caller. + // The caller should interpret this case as that no indented column was found. + // + // Note that we do not break out of the loop until we find a non-whitespace char, + // even if we see non-matching indentation. In this next example, we ignore the `\t` + // and continue to the H, where the indentation actually does match. + // + // === + // parent_col + // v + // + // | Parent column + // | + // | - + // | \t v------------ (cursor) + // | Hello there + // | Same column + // | Parent column + // === + // + // - return = 5 + // - self.indentation = ` ` + + // Handle first line separately + { + self.advanceCursor(skip); + const inner_block_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; + if (self.cursor[inner_block_idx] != '\n') { + // We found the indentation! + // Because this is the first line, we need to memset the marker into spaces. + if (parent_col + skip + inner_block_idx > std.math.maxInt(Column)) + return error.IndentationTooLong; + @memset(self.indentation[parent_col .. parent_col + skip], ' '); + @memcpy(self.indentation[parent_col + skip .. parent_col + skip + inner_block_idx], self.cursor[0..inner_block_idx]); + self.advanceCursor(inner_block_idx); + return .{ .found_column = @intCast(parent_col + skip + inner_block_idx) }; + } else { + // I lied, inner_block_idx doesn't point to the inner block. + self.advanceCursor(inner_block_idx + 1); + } + } + + // Remaining lines don't need to memset the marker into spaces. + while (true) { + // Find column + const inner_block_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; + if (self.cursor[inner_block_idx] != '\n') { + // Verify parent indentation + const indentation_idx = std.mem.indexOfDiff(u8, self.cursor, self.indentation[0..parent_col]) orelse unreachable; + if (indentation_idx != parent_col) { + return .{ .mismatched_indentation = @intCast(indentation_idx) }; + } + // We found the indentation! + if (parent_col + inner_block_idx > std.math.maxInt(Column)) + return error.IndentationTooLong; + @memcpy(self.indentation[parent_col .. parent_col + inner_block_idx], self.cursor[parent_col..inner_block_idx]); + self.advanceCursor(inner_block_idx); + return .{ .found_column = @intCast(parent_col + inner_block_idx) }; + } else { + // I lied, inner_block_idx doesn't point to the inner block. + self.advanceCursor(inner_block_idx + 1); + } + } +} + +fn insertTextLine( + self: *AstGen, + comptime first_text_tag: Node.Tag, + comptime rest_text_tag: Node.Tag, + parent_idx: Node.Idx, + len_: usize, +) !void { + var len = len_; + if (len <= std.math.maxInt(StrLen)) { + _ = try self.appendLeafNodeAtCursor(parent_idx, first_text_tag, @intCast(len)); + self.advanceCursor(len); + } else { + @branchHint(.cold); + { + const consumed_len = @min(len, std.math.maxInt(StrLen)); + _ = try self.appendLeafNodeAtCursor(parent_idx, first_text_tag, @intCast(consumed_len)); + self.advanceCursor(consumed_len); + len -= consumed_len; + } + while (len > 0) { + const consumed_len = @min(len, std.math.maxInt(StrLen)); + _ = try self.appendLeafNodeAtCursor(parent_idx, rest_text_tag, @intCast(consumed_len)); + self.advanceCursor(consumed_len); + len -= consumed_len; + } + } +} + +fn parseParagraph( + self: *AstGen, + comptime first_text_tag: Node.Tag, + parent_idx: Node.Idx, + parent_col: Column, + block_col: Column, +) !Column { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + + { + const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; + try self.insertTextLine(first_text_tag, .text, parent_idx, newline); + self.advanceCursor(1); + } + + while (true) { + if (self.cursor.len == 0) return 0; + + const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; + // block line found, exit + if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) { + const verified_indentation_idx = std.mem.indexOfDiff( + u8, + self.cursor, + self.indentation[0..block_col], + ) orelse unreachable; + if (verified_indentation_idx == block_col) { + return @intCast(indentation_idx); + } else { + return @intCast(verified_indentation_idx); + } + } + // empty line found, consume to next nonwhitespace and exit + if (self.cursor[indentation_idx] == '\n') { + self.advanceCursor(indentation_idx + 1); + while (true) { + if (self.cursor.len == 0) return 0; + + const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; + if (self.cursor[next_idx] == '\n') { + self.advanceCursor(next_idx + 1); + continue; + } + + const verified_indentation_idx = std.mem.indexOfDiff( + u8, + self.cursor, + self.indentation[0..block_col], + ) orelse unreachable; + if (verified_indentation_idx == block_col) { + return @intCast(next_idx); + } else { + return @intCast(verified_indentation_idx); + } + } + } + + // verify indentation + const verified_indentation_idx = std.mem.indexOfDiff( + u8, + self.cursor, + self.indentation[0..block_col], + ) orelse unreachable; + if (verified_indentation_idx == block_col) { + self.advanceCursor(verified_indentation_idx); + const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; + try self.insertTextLine(.space_text, .text, parent_idx, newline); + self.advanceCursor(1); + } else if (verified_indentation_idx > parent_col) { + self.advanceCursor(parent_col); + try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx); + self.advanceCursor(indentation_idx - parent_col); + const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; + try self.insertTextLine(.space_text, .text, parent_idx, newline); + self.advanceCursor(1); + } else { + return @intCast(verified_indentation_idx); + } + } +} diff --git a/src/main.zig b/src/main.zig @@ -17,23 +17,23 @@ pub fn main() !void { defer tracy_frame.end(); if (stdin.stat()) |stat| { if (stat.size > 0) { - var al: std.ArrayList(u8) = try .initCapacity(arena.allocator(), stat.size + 128); - try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 128); - try al.appendNTimes('\n', 128); + var al: std.ArrayList(u8) = try .initCapacity(arena.allocator(), stat.size + 1024); + try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); + try al.appendNTimes('\n', 1024); break :blk al; } } else |_| {} - var al: std.ArrayList(u8) = try .initCapacity(gpa.allocator(), 1024); + var al: std.ArrayList(u8) = try .initCapacity(gpa.allocator(), 4096); errdefer al.deinit(); - try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 128); - try al.appendNTimes('\n', 128); + try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); + try al.appendNTimes('\n', 1024); break :blk al; }; defer input_arraylist.deinit(); const input = input_arraylist.items; const ast = blk: { - const tracy_frame = tracy.namedFrame("parse"); + const tracy_frame = tracy.namedFrame("parse 0"); defer tracy_frame.end(); break :blk try mymarkdown.parse( gpa.allocator(), @@ -41,24 +41,28 @@ pub fn main() !void { input, ); }; - // const ast2 = blk: { - // const tracy_frame = tracy.namedFrame("parse 2"); - // tracy_frame.end(); - // break :blk try mymarkdown.parse2( - // gpa.allocator(), - // arena.allocator(), - // input, - // ); - // }; - // try std.testing.expectEqualDeep(ast, ast2); + const ast2 = blk: { + const tracy_frame = tracy.namedFrame("parse 1"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse2( + gpa.allocator(), + arena.allocator(), + input, + ); + }; var bw = std.io.bufferedWriter(std.io.getStdOut().writer()); const stdout = bw.writer(); { - const tracy_frame = tracy.namedFrame("render"); + const tracy_frame = tracy.namedFrame("render 0"); defer tracy_frame.end(); _ = try ast.renderAst(stdout, input); } + { + const tracy_frame = tracy.namedFrame("render 1"); + defer tracy_frame.end(); + _ = try ast2.renderAst(stdout, input); + } try bw.flush(); if (tracy.enable) { diff --git a/src/root.zig b/src/root.zig @@ -2,7 +2,9 @@ const std = @import("std"); pub const Ast = @import("Ast.zig"); pub const AstGen = @import("AstGen.zig"); +pub const AstGen2 = @import("AstGen2.zig"); pub const parse = AstGen.parse; +pub const parse2 = AstGen2.parse; test { _ = @import("test/test.zig"); diff --git a/src/str.zig b/src/str.zig @@ -111,6 +111,24 @@ pub fn escapeStringForSingleQuotedString( return escapeString(writer, slice, .double_quoted_string); } +pub fn fmtEscapes(bytes: []const u8) std.fmt.Formatter(stringEscapeFormatter) { + return .{ .data = bytes }; +} + +pub fn stringEscapeFormatter( + bytes: []const u8, + comptime f: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = options; + if (f.len == 1 and f[0] == '\'') { + try escapeString(writer, bytes, .single_quoted_string); + } else { + try escapeString(writer, bytes, .double_quoted_string); + } +} + pub fn escapeString( writer: anytype, slice: []const u8, diff --git a/src/test/test.zig b/src/test/test.zig @@ -269,6 +269,8 @@ test "Happy path list elaboration" { \\ .text \\ "a" \\ .elaboration + \\ .marker + \\ "+" \\ .paragraph \\ .text \\ "bb" @@ -318,6 +320,8 @@ test "Mixed indentation" { \\.document \\ .elaboration \\ .error .inconsistent_indentation at 3:1 + \\ .marker + \\ "+" \\ .paragraph \\ .text \\ "aaa" @@ -358,7 +362,7 @@ test "Empty line in heading" { \\ .text \\ "heading" \\ .space_text - \\ .error .empty_line_in_inline_block at 2:1 + \\ error .unexpected_block_in_inline_context \\ "text" \\ .paragraph \\ .text diff --git a/src/utils.zig b/src/utils.zig @@ -2,14 +2,16 @@ const std = @import("std"); const ziggy = @import("ziggy"); -pub fn NewType(comptime int_type: type, comptime dummy_type_: type) type { - return enum(int_type) { +pub fn NewType(comptime IntType_: type, comptime DummyType_: type) type { + return enum(IntType_) { + root = 0, _, const Self = @This(); + pub const IntType = IntType_; pub fn next(self: @This()) ?@This() { - if (@intFromEnum(self) == std.math.maxInt(int_type)) + if (@intFromEnum(self) == std.math.maxInt(IntType_)) return null; return @enumFromInt(@intFromEnum(self) + 1); } @@ -19,7 +21,7 @@ pub fn NewType(comptime int_type: type, comptime dummy_type_: type) type { } pub const ziggy_options = struct { - const dummy_type = dummy_type_; + const DummyType = DummyType_; pub fn parse( self: *ziggy.Parser, first_tok: ziggy.Tokenizer.Token,