commit 561fee2b93050e3cc92927e9a73b14252d45ed85
parent c08e3785171f96ac5779ee9c5577dc2ce4fa6c9a
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Sun, 18 May 2025 01:48:45 +0800
v0 vs v1
Diffstat:
8 files changed, 980 insertions(+), 90 deletions(-)
diff --git a/src/Ast.zig b/src/Ast.zig
@@ -37,12 +37,12 @@ pub const Node = utils.Packed(union(enum(u8)) {
pub const Idx = utils.NewType(u24, opaque {});
pub const Root = packed struct {
- num_children: u24 = 0,
+ num_children: Idx.IntType = 0,
pub const format = utils.structFormat(@This());
};
pub const Container = packed struct {
off: StrOffset,
- num_children: u24 = 0,
+ num_children: Idx.IntType = 0,
pub const format = utils.structFormat(@This());
};
pub const Leaf = packed struct {
@@ -55,7 +55,9 @@ pub const Node = utils.Packed(union(enum(u8)) {
pub fn incrementNumChildren(self: *Node) void {
switch (self.tag) {
inline else => |t| {
- if (@TypeOf(@field(self.data, @tagName(t))) == Container or @TypeOf(@field(self.data, @tagName(t))) == Root) {
+ if (@TypeOf(@field(self.data, @tagName(t))) == Container or
+ @TypeOf(@field(self.data, @tagName(t))) == Root)
+ {
@field(self.data, @tagName(t)).num_children += 1;
} else unreachable;
},
@@ -67,8 +69,8 @@ pub const Node = utils.Packed(union(enum(u8)) {
pub const Error = utils.Packed(union(enum(u8)) {
marker_too_long: NodeError,
+ unexpected_block_in_inline_context: NodeError,
invalid_marker: PointError,
- empty_line_in_inline_block: PointError,
inconsistent_indentation: PointError,
/// Used when the error diagnostic spans the entire node
@@ -126,7 +128,13 @@ pub fn toTagged(self: Ast, gpa: Allocator) !Tagged {
}
pub fn renderAst(self: Ast, writer: anytype, input: []const u8) !void {
- var renderer: AstRenderer(@TypeOf(writer)) = .{ .self = self, .writer = writer, .input = input, .cached_offset = 0, .cached_pos = .{ .row = 1, .col = 1 } };
+ var renderer: AstRenderer(@TypeOf(writer)) = .{
+ .self = self,
+ .writer = writer,
+ .input = input,
+ .cached_offset = 0,
+ .cached_pos = .{ .row = 1, .col = 1 },
+ };
const node_idx, const error_idx = try renderer.renderAstInner(0, 0, 0);
assert(node_idx == self.nodes.len);
assert(error_idx == self.errors.len);
@@ -175,72 +183,110 @@ fn AstRenderer(Writer: type) type {
}
}
- pub fn renderAstInner(renderer: *@This(), start_node: usize, start_error: usize, indent: usize) !struct { usize, usize } {
+ pub fn renderAstInner(
+ renderer: *@This(),
+ start_node: usize,
+ start_error: usize,
+ indent: usize,
+ ) !struct { usize, usize } {
const tracy_frame = tracy.trace(@src());
defer tracy_frame.end();
+ try renderer.writer.writeByteNTimes(' ', indent);
+ try renderer.writer.writeByte('.');
+ try renderer.writer.writeAll(@tagName(renderer.self.nodes[start_node].tag));
+ try renderer.writer.writeByte('\n');
+ var cur_node: usize = start_node + 1;
+ var cur_error: usize = start_error;
+ while (cur_error < renderer.self.errors.len and
+ @intFromEnum(renderer.self.errors[cur_error].get(.idx)) == start_node)
+ {
+ try renderer.writer.writeByteNTimes(' ', indent + 2);
+ try renderer.writer.writeAll(".error .");
+ try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag));
+ switch (renderer.self.errors[cur_error].toTagged()) {
+ // PointError
+ .inconsistent_indentation,
+ .invalid_marker,
+ => |error_data| {
+ try renderer.writer.print(
+ " at {}:{}",
+ renderer.offsetToPos(error_data.off),
+ );
+ },
+ // NodeError
+ .unexpected_block_in_inline_context,
+ .marker_too_long,
+ => {},
+ }
+ try renderer.writer.writeByte('\n');
+ cur_error += 1;
+ }
switch (renderer.self.nodes[start_node].toTagged()) {
- inline else => |data, tag| {
- try renderer.writer.writeByteNTimes(' ', indent);
- try renderer.writer.writeByte('.');
- try renderer.writer.writeAll(@tagName(tag));
- try renderer.writer.writeByte('\n');
- var cur_node: usize = start_node + 1;
- var cur_error: usize = start_error;
- while (cur_error < renderer.self.errors.len and @intFromEnum(renderer.self.errors[cur_error].get(.idx)) == start_node) {
- switch (renderer.self.errors[cur_error].toTagged()) {
- // PointError
- .empty_line_in_inline_block,
- .inconsistent_indentation,
- .invalid_marker,
- => |error_data| {
- try renderer.writer.writeByteNTimes(' ', indent + 2);
- try renderer.writer.writeAll(".error .");
- try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag));
- try renderer.writer.print(" at {}:{}", renderer.offsetToPos(error_data.off));
- try renderer.writer.writeByte('\n');
- },
- // NodeError
- .marker_too_long => {
- try renderer.writer.writeByteNTimes(' ', indent + 2);
- try renderer.writer.writeAll("error .");
- try renderer.writer.writeAll(@tagName(renderer.self.errors[cur_error].tag));
- try renderer.writer.writeByte('\n');
- },
- }
- cur_error += 1;
+ .document,
+ => |data| {
+ for (0..data.num_children) |_| {
+ assert(cur_node < renderer.self.nodes.len);
+ cur_node, cur_error = try renderer.renderAstInner(
+ cur_node,
+ cur_error,
+ indent + 2,
+ );
+ assert(cur_node <= renderer.self.nodes.len);
}
- if (@TypeOf(data) == Node.Tagged.Container or @TypeOf(data) == Node.Tagged.Root) {
- for (0..data.num_children) |_| {
- assert(cur_node < renderer.self.nodes.len);
- cur_node, cur_error = try renderer.renderAstInner(cur_node, cur_error, indent + 2);
- assert(cur_node <= renderer.self.nodes.len);
- }
- return .{ cur_node, cur_error };
- } else if (@TypeOf(data) == Node.Tagged.Leaf) {
- try renderer.writer.writeByteNTimes(' ', indent + 2);
- // This was too slow!
- // try renderer.writer.print("\"{}\"", .{
- // std.zig.fmtEscapes(renderer.input[data.off .. data.off + data.len]),
- // });
- try renderer.writer.writeByte('"');
- try str.escapeStringForDoubleQuotedString(
- renderer.writer,
- renderer.input[data.off .. data.off + data.len],
+ return .{ cur_node, cur_error };
+ },
+
+ .heading,
+ .quote,
+ .paragraph,
+ .unordered_item,
+ .ordered_item,
+ .term_item,
+ .task_item,
+ .elaboration,
+ => |data| {
+ for (0..data.num_children) |_| {
+ assert(cur_node < renderer.self.nodes.len);
+ cur_node, cur_error = try renderer.renderAstInner(
+ cur_node,
+ cur_error,
+ indent + 2,
);
- try renderer.writer.writeByte('"');
- try renderer.writer.writeByte('\n');
- return .{ cur_node, cur_error };
- } else {
- @compileError(@typeName(@TypeOf(data)));
+ assert(cur_node <= renderer.self.nodes.len);
}
+ return .{ cur_node, cur_error };
+ },
+
+ .marker,
+ .thematic_break,
+ .text,
+ .space_text,
+ => |data| {
+ try renderer.writer.writeByteNTimes(' ', indent + 2);
+ // This was too slow!
+ // try renderer.writer.print("\"{}\"", .{
+ // std.zig.fmtEscapes(renderer.input[data.off .. data.off + data.len]),
+ // });
+ try renderer.writer.writeByte('"');
+ try str.escapeStringForDoubleQuotedString(
+ renderer.writer,
+ renderer.input[data.off .. data.off + data.len],
+ );
+ try renderer.writer.writeByte('"');
+ try renderer.writer.writeByte('\n');
+ return .{ cur_node, cur_error };
},
}
- unreachable;
}
};
}
-pub fn renderHtml(self: Ast, writer: anytype, input: []const u8, start_: ?Node.Idx) !?Node.Idx {
+pub fn renderHtml(
+ self: Ast,
+ writer: anytype,
+ input: []const u8,
+ start_: ?Node.Idx,
+) !?Node.Idx {
const tracy_frame = tracy.trace(@src());
defer tracy_frame.end();
const start: Node.Idx = start_ orelse @enumFromInt(0);
@@ -261,7 +307,10 @@ pub fn renderHtml(self: Ast, writer: anytype, input: []const u8, start_: ?Node.I
var cur_idx: ?Node.Idx = start.next();
switch (self.nodes[@intFromEnum(start)].tag) {
inline .document, .paragraph => |t| {
- const data = @field(self.nodes[@intFromEnum(start)].data, @tagName(t));
+ const data = @field(
+ self.nodes[@intFromEnum(start)].data,
+ @tagName(t),
+ );
for (0..data.num_children) |_| {
if (cur_idx) |idx| {
cur_idx = try self.renderHtml(writer, input, idx);
diff --git a/src/AstGen.zig b/src/AstGen.zig
@@ -187,7 +187,7 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: N
const tracy_frame = tracy.trace(@src());
defer tracy_frame.end();
var lines = lines_;
- var empty_line_off: ?u32 = null;
+ var saw_empty_line: bool = false;
outer: {
// empty lines at the start of the inline block are fine, just skip these
@@ -230,12 +230,13 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: N
if (lines.len == 0) break :outer;
if (lines[0].len != 0) break;
// empty line detected
- empty_line_off = self.calcOffset(@ptrCast(lines[0].ptr));
+ saw_empty_line = true;
}
- if (empty_line_off) |off| {
+ if (saw_empty_line) {
+ saw_empty_line = false;
try self.errors.append(gpa, .fromTagged(.{
- .empty_line_in_inline_block = .{ .idx = self.nextNodeIdx(), .off = off },
+ .unexpected_block_in_inline_context = .{ .idx = self.nextNodeIdx() },
}));
}
@@ -309,7 +310,16 @@ fn parseColumn(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: Node.I
num_lines += 1;
}
- const paragraph_lines = lines[0..num_lines];
+ var paragraph_lines = lines[0..num_lines];
+ if (lines[0][0] == ' ' or lines[0][0] == '\t') {
+ try self.errors.append(gpa, .fromTagged(.{
+ .inconsistent_indentation = .{
+ .idx = self.lastNodeIdx(),
+ .off = self.calcOffset(&lines[0][0]),
+ },
+ }));
+ paragraph_lines = try self.findIndentedColumn(gpa, paragraph_lines, child);
+ }
lines = lines[num_lines..];
try self.parseInlineBlock(gpa, paragraph_lines, child);
},
@@ -446,9 +456,9 @@ const block_specs = blockSpecs(struct {
pub const @"+": BlockSpec = &.{
.{
.tag = .elaboration,
- .marker = .{ .starts_with = "+" },
+ .marker = .{ .starts_with_multi = .{ .marker_char = '+' } },
.mode = .indented_column,
- .store_marker_child = .no_store,
+ .store_marker_child = .store,
},
};
pub const @";": BlockSpec = &.{
diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -0,0 +1,801 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+
+const ziggy = @import("ziggy");
+const tracy = @import("tracy");
+
+const utils = @import("utils.zig");
+const str = @import("str.zig");
+const Ast = @import("Ast.zig");
+const StrOffset = Ast.StrOffset;
+const StrLen = Ast.StrLen;
+const Node = Ast.Node;
+const Error = Ast.Error;
+
+const Column = u10;
+
+const AstGen = @This();
+
+gpa: Allocator,
+output_gpa: Allocator,
+output_gpa_same_as_gpa: bool,
+input: []const u8,
+cursor: []const u8, // suffix of input
+indentation: [std.math.maxInt(Column)]u8,
+nodes: std.ArrayListUnmanaged(Node),
+errors: std.ArrayListUnmanaged(Error),
+extra: std.ArrayListUnmanaged(u32),
+
+fn cursorOffset(self: *AstGen) StrOffset {
+ return @intCast(self.cursor.ptr - self.input.ptr);
+}
+fn advanceCursor(self: *AstGen, advance: usize) void {
+ // NOTE: `advance` should really be u32, but this makes it easier to work with other str functions.
+ self.cursor = self.cursor[advance..];
+}
+fn findMarkerEnd(self: AstGen, m: u8) error{IndentationTooLong}!Column {
+ // NOTE: null is impossible because input is guaranteed to end in newlines.
+ const idx = str.indexOfNotChar(self.cursor, m) orelse unreachable;
+ // Explicitly check for marker length because malicious input is possible
+ if (idx > std.math.maxInt(Column))
+ return error.IndentationTooLong;
+ return @intCast(idx);
+}
+
+fn getNode(self: AstGen, idx: Node.Idx) *Node {
+ @setRuntimeSafety(true);
+ return &self.nodes.items[@intFromEnum(idx)];
+}
+fn lastNodeIdx(self: AstGen) Node.Idx {
+ @setRuntimeSafety(true);
+ return @enumFromInt(self.nodes.items.len - 1);
+}
+fn nextNodeIdx(self: AstGen) Node.Idx {
+ @setRuntimeSafety(true);
+ return @enumFromInt(self.nodes.items.len);
+}
+fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+ if (self.nodes.items.len > std.math.maxInt(
+ @typeInfo(Node.Idx).@"enum".tag_type,
+ )) return error.OutOfNodeIdx;
+ const idx = self.nodes.items.len;
+ try self.nodes.append(self.gpa, .fromTagged(node));
+ return @enumFromInt(idx);
+}
+fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
+ self.getNode(parent_idx).incrementNumChildren();
+ return try self.appendNode(
+ @unionInit(
+ Node.Tagged,
+ @tagName(container_node_tag),
+ .{ .off = self.cursorOffset() },
+ ),
+ );
+}
+fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
+ self.getNode(parent_idx).incrementNumChildren();
+ return try self.appendNode(
+ @unionInit(
+ Node.Tagged,
+ @tagName(leaf_node_tag),
+ .{ .off = self.cursorOffset(), .len = len },
+ ),
+ );
+}
+fn appendError(self: *AstGen, err: Error.Tagged) !void {
+ if (self.errors.items.len > std.math.maxInt(
+ @typeInfo(Error.Idx).@"enum".tag_type,
+ )) return error.OutOfErrorIdx;
+ try self.errors.append(self.gpa, .fromTagged(err));
+}
+fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+ try self.appendError(
+ @unionInit(
+ Error.Tagged,
+ @tagName(tag),
+ .{ .idx = idx, .off = self.cursorOffset() },
+ ),
+ );
+}
+fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+ try self.appendError(
+ @unionInit(
+ Error.Tagged,
+ @tagName(tag),
+ .{ .idx = idx },
+ ),
+ );
+}
+
+pub fn deinit(self: *AstGen) void {
+ self.nodes.deinit(self.gpa);
+ self.errors.deinit(self.gpa);
+ self.extra.deinit(self.gpa);
+}
+
+/// Parses mymarkdown
+///
+/// : `gpa`
+/// + A suitable allocator for scratch allocations that supports free and ideally remap.
+/// : `output_gpa`
+/// + If passed, no scratch allocations will outlive this function,
+/// and any allocations returned will be allocated on this.
+/// : `input`
+/// + The input slice to be parsed. Must end in at least 1024 \n characters.
+///
+/// Errors:
+/// : `IndentationTooLong`
+/// + This implementation of mymarkdown supports up to 1023 characters of indentation.
+pub fn parse(
+ gpa: Allocator,
+ output_gpa: ?Allocator,
+ input: []const u8,
+) !Ast {
+ const tracy_frame = tracy.trace(@src());
+ defer tracy_frame.end();
+
+ if (@typeInfo(Column).int.bits > @typeInfo(StrLen).int.bits)
+ @compileError("Column should have less bits than StrLen");
+ if (input.len < 128 or !std.mem.eql(u8, input[input.len - 128 ..], "\n" ** 128))
+ return error.InputUnsafe;
+ if (input.len > std.math.maxInt(u32))
+ return error.InputTooLarge;
+
+ var ast: AstGen = .{
+ .gpa = gpa,
+ .output_gpa = output_gpa orelse gpa,
+ .output_gpa_same_as_gpa = output_gpa == null,
+ .input = input,
+ .cursor = input,
+ .indentation = undefined,
+ .nodes = .empty,
+ .errors = .empty,
+ .extra = .empty,
+ };
+ defer ast.deinit();
+
+ try ast.parseRoot();
+
+ if (output_gpa) |gpa2| {
+ return .{
+ .nodes = try gpa2.dupe(Node, ast.nodes.items),
+ .errors = try gpa2.dupe(Error, ast.errors.items),
+ .extra = try gpa2.dupe(u32, ast.extra.items),
+ };
+ } else {
+ return .{
+ .nodes = try ast.nodes.toOwnedSlice(gpa),
+ .errors = try ast.errors.toOwnedSlice(gpa),
+ .extra = try ast.extra.toOwnedSlice(gpa),
+ };
+ }
+}
+
+const ParsingContext = enum { block_context, inline_context };
+
+fn parseRoot(self: *AstGen) !void {
+ const root = try self.appendNode(.{ .document = .{} });
+ assert(root == .root);
+ assert(self.input.ptr == self.cursor.ptr);
+ assert(self.input.len == self.cursor.len);
+
+ if (str.indexOfNone(self.input, " \t\r\n")) |idx| {
+ if (idx == 0 or self.input[idx - 1] != '\n') {
+ // Happy case: input starts at the start of the line :)
+ self.advanceCursor(idx);
+ } else {
+ // Input doesn't start at the start of the line :(
+ // Log an error but otherwise proceed as usual
+ try self.appendPointErrorAtCursor(.inconsistent_indentation, root);
+ self.advanceCursor(idx);
+ }
+
+ // The actual parse
+ // "inline" hack to get different branch predictors for the root column
+ _ = try self.parseColumnInline(root, 0, 0, .block_context);
+ } else {
+ // Input is completely empty, return without really parsing anything
+ }
+}
+
+fn parseColumn(
+ self: *AstGen,
+ parent_idx: Node.Idx,
+ parent_col: Column,
+ cursor_col: Column,
+ comptime parsing_context: ParsingContext,
+) error{
+ IndentationTooLong,
+ OutOfNodeIdx,
+ OutOfErrorIdx,
+ OutOfMemory,
+}!Column {
+ return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context);
+}
+
+fn parseMarkerItem(
+ self: *AstGen,
+ comptime node_tag: Node.Tag,
+ parent_idx: Node.Idx,
+ block_col: Column,
+ marker_len: Column,
+ comptime output_marker: enum { output_marker, no_output_marker },
+ comptime parent_parsing_context: ParsingContext,
+ comptime child_parsing_context: ParsingContext,
+) !Column {
+ const block_idx = try self.appendContainerNodeAtCursor(parent_idx, node_tag);
+ if (parent_parsing_context == .inline_context)
+ try self.appendNodeErrorAtCursor(.unexpected_block_in_inline_context, block_idx);
+ if (output_marker == .output_marker) {
+ _ = try self.appendLeafNodeAtCursor(block_idx, .marker, marker_len);
+ }
+ switch (try self.findIndentation(block_col, marker_len)) {
+ .found_column => |child_col| {
+ return try self.parseColumn(block_idx, block_col, child_col, child_parsing_context);
+ },
+ .mismatched_indentation => |indentation_idx_found| return indentation_idx_found,
+ }
+}
+
+inline fn parseColumnInline(
+ self: *AstGen,
+ parent_idx: Node.Idx,
+ parent_col: Column,
+ cursor_col: Column,
+ comptime parsing_context: ParsingContext,
+) !Column {
+ const tracy_frame = tracy.trace(@src());
+ defer tracy_frame.end();
+ assert(cursor_col == 0 or parent_col < cursor_col);
+
+ // Used for "indentation correction".
+ // For simplicity, just think of this as this column's indentation.
+ var block_col = cursor_col;
+
+ var parsed_first_paragraph_for_inline_context: bool = false;
+
+ // # parseColumn's input parameter explanation
+ // + AKA: How to deal with indentation
+ //
+ // Our cursor points at the first (usually non-whitespace) char of the column.
+ // (The caller is in charge of finding the start of the column.)
+ //
+ // We also need both our own and our parent's indentations.
+ // These are represented as column indices that indicate
+ // how many characters of `self.indentation` should match.
+ // indicating where the parent column's ends,
+ // as well as where this column's indentation ends.
+ // For the root column, both of these values are 0, which means the column
+ // can never be exited from until it reaches the end of the file.
+ //
+ // ===
+ // column 0, file starts here
+ // | column 2, parent column starts here
+ // | | column 5, our column starts here
+ // v v v
+ //
+ // | Parent column (parent's indentation = " ", represented as parent_col = 2)
+ // | v (cursor points at H, our indentation = " ", represented as cursor_col = 5)
+ // | Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // Lines that match with our indentation are considered part of our column,
+ // and the column ends upon finding a line aligns with the parent indentation.
+ // When a line matches the parent's indentation but NOT ours, we log an error,
+ // then do error recovery by pretending it does match with our indentation.
+ //
+ // ===
+ // | Parent column
+ // | v (cursor)
+ // | Hello there
+ // | \t Same column but syntax error! (" \t " does not match with " ", but matches " ")
+ // | Parent column
+ // ===
+ //
+ // This indentation recovery system is not the best and it can fix indentation "incorrectly",
+ // but I don't have anything better. (Btw, the original AstGen.zig uses the same recovery logic.)
+
+ parse_another_block: while (true) {
+ assert(self.cursor.len > 0 and str.isNoneOf(self.cursor[0], " \t\r\n"));
+
+ // Will be set by the recursive call (if any),
+ // to indicate how much indentation was previously checked.
+ var indentation_idx: Column = undefined;
+
+ finish_parsing_block: {
+ switch (self.cursor[0]) {
+ // Par-like repeatable markers
+ inline '-', '.', ':', '#' => |m| {
+ const marker_len = try self.findMarkerEnd(m);
+ if (m == '-') {
+ const potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable;
+ if (potential_task_item >= 3 and
+ self.cursor[marker_len + potential_task_item - 1] == ']' and
+ self.cursor[marker_len + potential_task_item - 3] == '[' and
+ (self.cursor[marker_len + potential_task_item - 2] == ' ' or
+ self.cursor[marker_len + potential_task_item - 2] == 'x' or
+ self.cursor[marker_len + potential_task_item - 2] == 'X') and
+ std.mem.allEqual(u8, self.cursor[marker_len .. marker_len + potential_task_item - 3], ' '))
+ {
+ if (marker_len + potential_task_item > std.math.maxInt(Column))
+ return error.IndentationTooLong;
+ indentation_idx = try self.parseMarkerItem(
+ .task_item,
+ parent_idx,
+ block_col,
+ @intCast(marker_len + potential_task_item),
+ .output_marker,
+ parsing_context,
+ .inline_context,
+ );
+ break :finish_parsing_block;
+ }
+ }
+ indentation_idx = try self.parseMarkerItem(
+ switch (m) {
+ '-' => .unordered_item,
+ '.' => .ordered_item,
+ ':' => .term_item,
+ '#' => .heading,
+ else => unreachable,
+ },
+ parent_idx,
+ block_col,
+ marker_len,
+ .output_marker,
+ parsing_context,
+ .inline_context,
+ );
+ break :finish_parsing_block;
+ },
+
+ // Div-like repeatable markers
+ inline '+' => |m| {
+ const marker_len = try self.findMarkerEnd(m);
+ indentation_idx = try self.parseMarkerItem(
+ switch (m) {
+ '+' => .elaboration,
+ else => unreachable,
+ },
+ parent_idx,
+ block_col,
+ marker_len,
+ .output_marker,
+ parsing_context,
+ .block_context,
+ );
+ break :finish_parsing_block;
+ },
+
+ // Par-like single markers
+ inline ';' => |m| {
+ indentation_idx = try self.parseMarkerItem(
+ switch (m) {
+ ';' => .paragraph,
+ else => unreachable,
+ },
+ parent_idx,
+ block_col,
+ 1,
+ .no_output_marker,
+ parsing_context,
+ .inline_context,
+ );
+ break :finish_parsing_block;
+ },
+
+ // Div-like single markers
+ inline '>' => |m| {
+ indentation_idx = try self.parseMarkerItem(
+ switch (m) {
+ '>' => .quote,
+ else => unreachable,
+ },
+ parent_idx,
+ block_col,
+ 1,
+ .no_output_marker,
+ parsing_context,
+ .block_context,
+ );
+ break :finish_parsing_block;
+ },
+
+ '*' => {
+ if (std.mem.eql(u8, self.cursor[0..3], "***") and
+ self.cursor[
+ 3 + (str.indexOfNone(
+ self.cursor[3..],
+ " \t",
+ ) orelse unreachable)
+ ] == '\n')
+ {
+ _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3);
+ break :finish_parsing_block;
+ }
+ },
+
+ else => {},
+ }
+
+ // Handle paragraph
+ switch (parsing_context) {
+ .inline_context => {
+ if (parsed_first_paragraph_for_inline_context) {
+ try self.appendNodeErrorAtCursor(.unexpected_block_in_inline_context, self.nextNodeIdx());
+ indentation_idx = try self.parseParagraph(.space_text, parent_idx, parent_col, block_col);
+ } else {
+ indentation_idx = try self.parseParagraph(.text, parent_idx, parent_col, block_col);
+ parsed_first_paragraph_for_inline_context = true;
+ }
+ },
+ .block_context => {
+ const paragraph_idx = try self.appendContainerNodeAtCursor(parent_idx, .paragraph);
+ indentation_idx = try self.parseParagraph(.text, paragraph_idx, parent_col, block_col);
+ },
+ }
+ }
+
+ // We just finished parsing a block, so cursor points at the start of a line:
+ //
+ // ===
+ // | Parent column
+ // | Hello there
+ // | Same column
+ // |v----------------- (cursor)
+ // | Parent column
+ // ===
+ //
+ // We need to find the next block. This involves checking for empty lines and indentation.
+ //
+ // We find where the indentation differs, if any. There are 6 cases:
+ // - **The line is empty.**
+ // + We loop again, looking for another block.
+ // - **Matches us, next char is non-whitespace.**
+ // + Happy path, we continue parsing as usual.
+ // - **Matches us, next char is whitespace.**
+ // + Log and error and recover by treating all leading whitespace
+ // as if it matched our indentation level exactly.
+ // - **Does not even match parent.**
+ // + Return with no errors.
+ // - **Matches parent and not us, next char is non-whitespace.**
+ // + Return with no errors.
+ // - **Matches parent and not us, next char is whitespace.**
+ // + Log and error and recover by treating all leading whitespace
+ // as if it matched our indentation level exactly.
+
+ block_col = cursor_col;
+ // finding_block:
+ while (true) {
+ // Special case: when we hit EOF, there's nothing left to parse.
+ if (self.cursor.len == 0) return 0;
+
+ assert(self.cursor[indentation_idx] != '\n');
+ // if (self.cursor[indentation_idx] == '\n') {
+ // std.debug.assert(false); // TODO: Check this logic
+ // // Empty line
+ // self.advanceCursor(indentation_idx + 1);
+ // // NOTE: null is impossible because input is guaranteed to end in newlines.
+ // indentation_idx = if (str.indexOfNone(self.cursor, " \t") orelse unreachable) |idx|
+ // // Explicitly check for indentation length because malicious input is possible
+ // if (idx > std.math.maxInt(Column))
+ // return error.IndentationTooLong
+ // else
+ // @intCast(idx);
+ // continue :finding_block;
+ // } else
+ if (indentation_idx > cursor_col) {
+ // Matches us but there's too much whitespace.
+ // Fix the indentation.
+ // Here, we fix the indentation by pretending that
+ // the block starts from wherever the whitespace ended.
+ block_col = indentation_idx;
+ @memcpy(
+ self.indentation[cursor_col..block_col],
+ self.cursor[cursor_col..block_col],
+ );
+ // Log the error.
+ self.advanceCursor(cursor_col);
+ try self.appendPointErrorAtCursor(.inconsistent_indentation, self.nextNodeIdx());
+ // Continue parsing.
+ self.advanceCursor(indentation_idx - cursor_col);
+ continue :parse_another_block;
+ } else if (indentation_idx == cursor_col) {
+ // Matches us exactly.
+ self.advanceCursor(indentation_idx);
+ continue :parse_another_block;
+ } else if (indentation_idx > parent_col) {
+ // Matches parent but there's extra whitespace that doesn't match us.
+ // Log the error.
+ self.advanceCursor(parent_col);
+ try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx);
+ // Fix the indentation.
+ // Here, we fix the indentation by pretending that
+ // the block starts from the correct amount of whitespace.
+ // <no op>
+
+ // Continue parsing.
+ self.advanceCursor(indentation_idx - parent_col);
+ continue :parse_another_block;
+ } else {
+ // Matches parent exactly or doesn't match parent, return.
+ return indentation_idx;
+ }
+ }
+ }
+}
+
+/// Finds where the indented block starts
+fn findIndentation(
+ self: *AstGen,
+ parent_col: Column,
+ skip: Column,
+) !union(enum) {
+ found_column: Column,
+ mismatched_indentation: Column,
+} {
+ // We're given the input at the marker.
+ //
+ // ===
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // | v------------ (cursor)
+ // | - Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // We first skip some number of characters, and then scan forward until we find non-whitespace.
+ //
+ // ===
+ // first skip...
+ //
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // | v------------ (cursor)
+ // | - Hello there
+ // | Same column
+ // | Parent column
+ //
+ // then scan for non-whitespace:
+ //
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // | v------------ (cursor)
+ // | - Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // Then we store the indentation found in `self.indentation` and return the indentation column (in this case 5).
+ // Because in this case we found the non-whitespace character on the first line, we will memset the marker to spaces.
+ //
+ // - return = 5
+ // - self.indentation = ` `
+ //
+ // ***
+ //
+ // There are a couple other cases.
+ //
+ // This next case is when the non-whitespace char is not on the same line.
+ // In this case we can simply copy the indentation from the input.
+ //
+ // ===
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // |
+ // | - v------------ (cursor)
+ // | Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // - return = 5
+ // - self.indentation = ` `
+ //
+ // ***
+ //
+ // This next case is we find that the indentation does not match the parent's indentation.
+ // Note that this can only happen when the non-whitespace char is not on the same line as the initial cursor,
+ // since the indentation behind the initial cursor has already been checked by the caller.
+ //
+ // ===
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // |
+ // | - v------------ (cursor)
+ // | \t Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // - return = mismatched indentation at column 1
+ //
+ // Here, parent's indentation is ` ` but we saw ` \t `.
+ // In this case we return how many characters did match to the caller.
+ // The caller should interpret this case as that no indented column was found.
+ //
+ // Note that we do not break out of the loop until we find a non-whitespace char,
+ // even if we see non-matching indentation. In this next example, we ignore the `\t`
+ // and continue to the H, where the indentation actually does match.
+ //
+ // ===
+ // parent_col
+ // v
+ //
+ // | Parent column
+ // |
+ // | -
+ // | \t v------------ (cursor)
+ // | Hello there
+ // | Same column
+ // | Parent column
+ // ===
+ //
+ // - return = 5
+ // - self.indentation = ` `
+
+ // Handle first line separately
+ {
+ self.advanceCursor(skip);
+ const inner_block_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+ if (self.cursor[inner_block_idx] != '\n') {
+ // We found the indentation!
+ // Because this is the first line, we need to memset the marker into spaces.
+ if (parent_col + skip + inner_block_idx > std.math.maxInt(Column))
+ return error.IndentationTooLong;
+ @memset(self.indentation[parent_col .. parent_col + skip], ' ');
+ @memcpy(self.indentation[parent_col + skip .. parent_col + skip + inner_block_idx], self.cursor[0..inner_block_idx]);
+ self.advanceCursor(inner_block_idx);
+ return .{ .found_column = @intCast(parent_col + skip + inner_block_idx) };
+ } else {
+ // I lied, inner_block_idx doesn't point to the inner block.
+ self.advanceCursor(inner_block_idx + 1);
+ }
+ }
+
+ // Remaining lines don't need to memset the marker into spaces.
+ while (true) {
+ // Find column
+ const inner_block_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+ if (self.cursor[inner_block_idx] != '\n') {
+ // Verify parent indentation
+ const indentation_idx = std.mem.indexOfDiff(u8, self.cursor, self.indentation[0..parent_col]) orelse unreachable;
+ if (indentation_idx != parent_col) {
+ return .{ .mismatched_indentation = @intCast(indentation_idx) };
+ }
+ // We found the indentation!
+ if (parent_col + inner_block_idx > std.math.maxInt(Column))
+ return error.IndentationTooLong;
+ @memcpy(self.indentation[parent_col .. parent_col + inner_block_idx], self.cursor[parent_col..inner_block_idx]);
+ self.advanceCursor(inner_block_idx);
+ return .{ .found_column = @intCast(parent_col + inner_block_idx) };
+ } else {
+ // I lied, inner_block_idx doesn't point to the inner block.
+ self.advanceCursor(inner_block_idx + 1);
+ }
+ }
+}
+
+fn insertTextLine(
+ self: *AstGen,
+ comptime first_text_tag: Node.Tag,
+ comptime rest_text_tag: Node.Tag,
+ parent_idx: Node.Idx,
+ len_: usize,
+) !void {
+ var len = len_;
+ if (len <= std.math.maxInt(StrLen)) {
+ _ = try self.appendLeafNodeAtCursor(parent_idx, first_text_tag, @intCast(len));
+ self.advanceCursor(len);
+ } else {
+ @branchHint(.cold);
+ {
+ const consumed_len = @min(len, std.math.maxInt(StrLen));
+ _ = try self.appendLeafNodeAtCursor(parent_idx, first_text_tag, @intCast(consumed_len));
+ self.advanceCursor(consumed_len);
+ len -= consumed_len;
+ }
+ while (len > 0) {
+ const consumed_len = @min(len, std.math.maxInt(StrLen));
+ _ = try self.appendLeafNodeAtCursor(parent_idx, rest_text_tag, @intCast(consumed_len));
+ self.advanceCursor(consumed_len);
+ len -= consumed_len;
+ }
+ }
+}
+
+fn parseParagraph(
+ self: *AstGen,
+ comptime first_text_tag: Node.Tag,
+ parent_idx: Node.Idx,
+ parent_col: Column,
+ block_col: Column,
+) !Column {
+ const tracy_frame = tracy.trace(@src());
+ defer tracy_frame.end();
+
+ {
+ const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable;
+ try self.insertTextLine(first_text_tag, .text, parent_idx, newline);
+ self.advanceCursor(1);
+ }
+
+ while (true) {
+ if (self.cursor.len == 0) return 0;
+
+ const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+ // block line found, exit
+ if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) {
+ const verified_indentation_idx = std.mem.indexOfDiff(
+ u8,
+ self.cursor,
+ self.indentation[0..block_col],
+ ) orelse unreachable;
+ if (verified_indentation_idx == block_col) {
+ return @intCast(indentation_idx);
+ } else {
+ return @intCast(verified_indentation_idx);
+ }
+ }
+ // empty line found, consume to next nonwhitespace and exit
+ if (self.cursor[indentation_idx] == '\n') {
+ self.advanceCursor(indentation_idx + 1);
+ while (true) {
+ if (self.cursor.len == 0) return 0;
+
+ const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+ if (self.cursor[next_idx] == '\n') {
+ self.advanceCursor(next_idx + 1);
+ continue;
+ }
+
+ const verified_indentation_idx = std.mem.indexOfDiff(
+ u8,
+ self.cursor,
+ self.indentation[0..block_col],
+ ) orelse unreachable;
+ if (verified_indentation_idx == block_col) {
+ return @intCast(next_idx);
+ } else {
+ return @intCast(verified_indentation_idx);
+ }
+ }
+ }
+
+ // verify indentation
+ const verified_indentation_idx = std.mem.indexOfDiff(
+ u8,
+ self.cursor,
+ self.indentation[0..block_col],
+ ) orelse unreachable;
+ if (verified_indentation_idx == block_col) {
+ self.advanceCursor(verified_indentation_idx);
+ const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable;
+ try self.insertTextLine(.space_text, .text, parent_idx, newline);
+ self.advanceCursor(1);
+ } else if (verified_indentation_idx > parent_col) {
+ self.advanceCursor(parent_col);
+ try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx);
+ self.advanceCursor(indentation_idx - parent_col);
+ const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable;
+ try self.insertTextLine(.space_text, .text, parent_idx, newline);
+ self.advanceCursor(1);
+ } else {
+ return @intCast(verified_indentation_idx);
+ }
+ }
+}
diff --git a/src/main.zig b/src/main.zig
@@ -17,23 +17,23 @@ pub fn main() !void {
defer tracy_frame.end();
if (stdin.stat()) |stat| {
if (stat.size > 0) {
- var al: std.ArrayList(u8) = try .initCapacity(arena.allocator(), stat.size + 128);
- try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 128);
- try al.appendNTimes('\n', 128);
+ var al: std.ArrayList(u8) = try .initCapacity(arena.allocator(), stat.size + 1024);
+ try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024);
+ try al.appendNTimes('\n', 1024);
break :blk al;
}
} else |_| {}
- var al: std.ArrayList(u8) = try .initCapacity(gpa.allocator(), 1024);
+ var al: std.ArrayList(u8) = try .initCapacity(gpa.allocator(), 4096);
errdefer al.deinit();
- try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 128);
- try al.appendNTimes('\n', 128);
+ try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024);
+ try al.appendNTimes('\n', 1024);
break :blk al;
};
defer input_arraylist.deinit();
const input = input_arraylist.items;
const ast = blk: {
- const tracy_frame = tracy.namedFrame("parse");
+ const tracy_frame = tracy.namedFrame("parse 0");
defer tracy_frame.end();
break :blk try mymarkdown.parse(
gpa.allocator(),
@@ -41,24 +41,28 @@ pub fn main() !void {
input,
);
};
- // const ast2 = blk: {
- // const tracy_frame = tracy.namedFrame("parse 2");
- // tracy_frame.end();
- // break :blk try mymarkdown.parse2(
- // gpa.allocator(),
- // arena.allocator(),
- // input,
- // );
- // };
- // try std.testing.expectEqualDeep(ast, ast2);
+ const ast2 = blk: {
+ const tracy_frame = tracy.namedFrame("parse 1");
+ defer tracy_frame.end();
+ break :blk try mymarkdown.parse2(
+ gpa.allocator(),
+ arena.allocator(),
+ input,
+ );
+ };
var bw = std.io.bufferedWriter(std.io.getStdOut().writer());
const stdout = bw.writer();
{
- const tracy_frame = tracy.namedFrame("render");
+ const tracy_frame = tracy.namedFrame("render 0");
defer tracy_frame.end();
_ = try ast.renderAst(stdout, input);
}
+ {
+ const tracy_frame = tracy.namedFrame("render 1");
+ defer tracy_frame.end();
+ _ = try ast2.renderAst(stdout, input);
+ }
try bw.flush();
if (tracy.enable) {
diff --git a/src/root.zig b/src/root.zig
@@ -2,7 +2,9 @@ const std = @import("std");
pub const Ast = @import("Ast.zig");
pub const AstGen = @import("AstGen.zig");
+pub const AstGen2 = @import("AstGen2.zig");
pub const parse = AstGen.parse;
+pub const parse2 = AstGen2.parse;
test {
_ = @import("test/test.zig");
diff --git a/src/str.zig b/src/str.zig
@@ -111,6 +111,24 @@ pub fn escapeStringForSingleQuotedString(
return escapeString(writer, slice, .double_quoted_string);
}
+pub fn fmtEscapes(bytes: []const u8) std.fmt.Formatter(stringEscapeFormatter) {
+ return .{ .data = bytes };
+}
+
+pub fn stringEscapeFormatter(
+ bytes: []const u8,
+ comptime f: []const u8,
+ options: std.fmt.FormatOptions,
+ writer: anytype,
+) !void {
+ _ = options;
+ if (f.len == 1 and f[0] == '\'') {
+ try escapeString(writer, bytes, .single_quoted_string);
+ } else {
+ try escapeString(writer, bytes, .double_quoted_string);
+ }
+}
+
pub fn escapeString(
writer: anytype,
slice: []const u8,
diff --git a/src/test/test.zig b/src/test/test.zig
@@ -269,6 +269,8 @@ test "Happy path list elaboration" {
\\ .text
\\ "a"
\\ .elaboration
+ \\ .marker
+ \\ "+"
\\ .paragraph
\\ .text
\\ "bb"
@@ -318,6 +320,8 @@ test "Mixed indentation" {
\\.document
\\ .elaboration
\\ .error .inconsistent_indentation at 3:1
+ \\ .marker
+ \\ "+"
\\ .paragraph
\\ .text
\\ "aaa"
@@ -358,7 +362,7 @@ test "Empty line in heading" {
\\ .text
\\ "heading"
\\ .space_text
- \\ .error .empty_line_in_inline_block at 2:1
+ \\ error .unexpected_block_in_inline_context
\\ "text"
\\ .paragraph
\\ .text
diff --git a/src/utils.zig b/src/utils.zig
@@ -2,14 +2,16 @@ const std = @import("std");
const ziggy = @import("ziggy");
-pub fn NewType(comptime int_type: type, comptime dummy_type_: type) type {
- return enum(int_type) {
+pub fn NewType(comptime IntType_: type, comptime DummyType_: type) type {
+ return enum(IntType_) {
+ root = 0,
_,
const Self = @This();
+ pub const IntType = IntType_;
pub fn next(self: @This()) ?@This() {
- if (@intFromEnum(self) == std.math.maxInt(int_type))
+ if (@intFromEnum(self) == std.math.maxInt(IntType_))
return null;
return @enumFromInt(@intFromEnum(self) + 1);
}
@@ -19,7 +21,7 @@ pub fn NewType(comptime int_type: type, comptime dummy_type_: type) type {
}
pub const ziggy_options = struct {
- const dummy_type = dummy_type_;
+ const DummyType = DummyType_;
pub fn parse(
self: *ziggy.Parser,
first_tok: ziggy.Tokenizer.Token,