mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

commit 6a848e2833660bccc8ac9343e66765825151cb93
parent 79ad40b2e85365c8e5edd8bc5b402c82ea112a72
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Wed, 21 May 2025 00:26:27 +0800

Implement AstGen3

Diffstat:
Asample.my | 143+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/Ast.zig | 22++++++++++++++++++++--
Msrc/AstGen.zig | 12+++++++++---
Msrc/AstGen2.zig | 20+++++++++++++-------
Asrc/AstGen3.zig | 500+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/IndentationScanner.zig | 41+++++++++++++++++++++++++++++++++--------
Msrc/main.zig | 188+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Msrc/padded_str_impl.zig | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Msrc/root.zig | 4++++
Msrc/utils.zig | 12++++++------
10 files changed, 927 insertions(+), 85 deletions(-)

diff --git a/sample.my b/sample.my @@ -0,0 +1,143 @@ +# Mymarkdown syntax showcase + +@@ $toc.generate() + +## Block syntaxes + +### Paragraphs + +Normal blocks of text separated by newlines, constitute paragraphs. +This sentence is part of the same paragraph as above. + +While the first line of a paragraph may not be indented, + subsequent lines may be indented relative to the first line. + +> This is a paragraph. + + This is a syntax error. + +In general, indented blocks are not allowed unless explicitly introduced by a marker. +In particular, indented code blocks do not exist in mymarkdown. +This is because indented blocks can easily arise from having insufficient indentation on the previous element. + +; Paragraphs may also be introduced with the `;` marker, + similar to other block syntaxes. + An indented set of blocks follows the `;` marker. + +### Headings + +Headings are introduced with hashes. +The number of hashes indicates the level of the heading (h1 = `#`, h2 = `##`, ...). +A single indented inline block follows the marker. + +Note that indented blocks in mymarkdown are strict --- +the first non-whitespace character after the marker is used to determine the indentation, +after which all subsequent lines must have matching or more indentation. + +> #### This is a + multiline heading. + + #### This is also a + multiline heading. + + #### This is a syntax error. + + This is because there are two paragraphs following the marker. + + #### This heading itself is not a syntax error. + But this misaligned second line is interpreted as an indented paragraph, which is a syntax error. + +### Blocks + +> Blocks are introduced by the `>` marker. + + An indented set of blocks follows the `>` marker. + +> Unlike markdown, placing `>` before each line is probably a mistake. +> This is because each `>` introduces a new block quote, making this the third blockquote in this section. + +### Lists + +There are many types of lists in mymarkdown. They are: +- Unordered list, introduced by `-` +- Ordered list, introduced by `.` +- Description list, introduced by `:` +- Task list, introduced by `- [ ]`, `- [x]`, or `- [X]` + +In all cases, the list is formed by grouping list items of the same marker type, +and an indented inline block follows the list item marker. + +The level of nesting can be increased by repeating the marker: +- Level-2 unordered list marker: `--` +- Level-2 ordered list marker: `..` +- Level-2 description list marker: `::` +- Level-2 task list marker: `-- [ ]`, `-- [x]`, or `-- [X]` + +Note that for ordered lists, no numbers or letters are allowed before the marker. +Instead they are simply automatically numbered from 1. +Use directives to adjust the starting index. + +Nested labels extend the parent's counter rather than beginning a new counter. + +. List item 1 +. List item 2 +.. List item 2.1 +.. List item 2.2 +. List item 3 + +For all list items, an indented //inline// block follows the list marker, +**not** an arbitrary **set** of indented blocks. +That means the following is a syntax error: + +> - This is a syntax error. + + One should not use multiple paragraphs in a list item. + +To have items that contain multiple paragraphs, instead use a short phrase or sentence for the list item itself, +and follow it up with a longer series of paragraphs that [//elaborate//](#List item elaborations) on the list item. + +### List item elaborations + +Sometimes, lists are used not to convey merely a list of items, +but also come with a **elaboration** or **description** of the item. +In such cases, elaboration blocks introduced by `+` markers matching the level of the list item +can be added immediately after the list item to be described or elaborated on. +This way, the main list marker itself can be used to contain just the key point or phrase. + +An indented set of blocks follows the list item elaboration marker. + +#### Examples + +- Some examples of when list item elaborations can be used: +-- **In a list of pros and cons.** +++ The main list items can be used to list the actual pros and cons, + while list item elaborations can be used to justify them. + +-- **In a list of examples.** +++ The main list items can be used for the names of the examples, + while elaborations go into more detail about how the examples work. + Like this example! + +- Some examples of when list item elaborations must be used: +-- **When you want to nest multiple paragraphs within a bullet.** +++ As the list item markers themselves only take in one indented inline block, + they physically cannot contain multiple paragraphs. + This limitation is meant to discourage using lists in situations + where a subsection would be more appropriate. + + In such cases, it is usually beneficial to add a short "name" or "title" for your paragraphs + in the actual list item (and emphasize or strongly emphasize them), + and then place the paragraphs themselves inside the list item elaboration. + + If that is not desirable however, the workaround is to use an //empty// list item, + so that the paragraphs you place inside the list item elaboration + become the only paragraphs nested under the bullet. + +- Some examples of when list item elaborations should not be used: +-- **When you simply have long list items.** +++ Regardless of the length of the content you wish to place in the list items, + the key consideration to keep in mind is whether the content "belongs" to the main list, + or simply to the current list item. + + If the content does truly belong to the main list, then the content should be used as the list item itself. + In such cases, it may be worth making the list a //loose// list, which means placing empty lines after each list item. diff --git a/src/Ast.zig b/src/Ast.zig @@ -18,7 +18,25 @@ pub const empty: Ast = .{ .nodes = &.{}, .errors = &.{}, .extra = &.{} }; pub const StrOffset = u32; pub const StrLen = u24; -pub const Node = utils.Packed(union(enum(u8)) { +pub const Tag = enum(u8) { + document = 255, + marker = 254, // First child of nodes like heading, list items, ... + + thematic_break = 253, + heading = '#', + quote = '>', + paragraph = ';', + unordered_item = '-', + ordered_item = '.', + term_item = ':', + task_item = 252, + elaboration = '+', + + text = 251, + space_text = 250, +}; + +pub const Node = utils.Packed(union(Tag) { document: Root, marker: Leaf, // First child of nodes like heading, list items, ... @@ -52,7 +70,7 @@ pub const Node = utils.Packed(union(enum(u8)) { pub const format = utils.structFormat(@This()); }; - pub fn incrementNumChildren(self: *Node) void { + pub inline fn incrementNumChildren(self: *Node) void { switch (self.tag) { inline else => |t| { if (@TypeOf(@field(self.data, @tagName(t))) == Container or diff --git a/src/AstGen.zig b/src/AstGen.zig @@ -110,6 +110,12 @@ pub fn parse( // std.time.sleep(std.time.ns_per_hour); + std.sort.pdq(Error, ast.errors.items, {}, struct { + fn func(_: void, lhs: Error, rhs: Error) bool { + return @intFromEnum(lhs.get(.idx)) < @intFromEnum(rhs.get(.idx)); + } + }.func); + if (output_gpa) |gpa2| { return .{ .nodes = try gpa2.dupe(Node, ast.nodes.items), @@ -169,11 +175,11 @@ fn findIndentedColumn(self: *AstGen, gpa: Allocator, lines_: [][]u8, node_idx: N const diff_idx = std.mem.indexOfDiff(u8, line.*, indentation) orelse unreachable; assert(diff_idx != line.len); if (diff_idx != indentation.len) { - try self.errors.append(gpa, .fromTagged(.{ - .inconsistent_indentation = .{ .idx = node_idx, .off = self.calcOffset(&line.*[0]) }, - })); // Recover by stripping all whitespace on this line const recover_indentation_idx = std.mem.indexOfNone(u8, line.*, " \t\r\n") orelse unreachable; + try self.errors.append(gpa, .fromTagged(.{ + .inconsistent_indentation = .{ .idx = node_idx, .off = self.calcOffset(&line.*[diff_idx]) }, + })); line.* = line.*[recover_indentation_idx..]; } else { line.* = line.*[indentation.len..]; diff --git a/src/AstGen2.zig b/src/AstGen2.zig @@ -157,6 +157,12 @@ pub fn parse( try ast.parseRoot(); + std.sort.pdq(Error, ast.errors.items, {}, struct { + fn func(_: void, lhs: Error, rhs: Error) bool { + return @intFromEnum(lhs.get(.idx)) < @intFromEnum(rhs.get(.idx)); + } + }.func); + if (output_gpa) |gpa2| { return .{ .nodes = try gpa2.dupe(Node, ast.nodes.items), @@ -174,7 +180,7 @@ pub fn parse( const ParsingContext = enum { block_context, inline_context }; -fn parseRoot(self: *AstGen) !void { +noinline fn parseRoot(self: *AstGen) !void { const tracy_frame = tracy.trace(@src()); defer tracy_frame.end(); const root = try self.appendNode(.{ .document = .{} }); @@ -213,6 +219,8 @@ fn parseColumn( OutOfErrorIdx, OutOfMemory, }!Column { + // const tracy_frame = if (parsing_context == .block_context) tracy.trace(@src()); + // defer if (parsing_context == .block_context) tracy_frame.end(); const tracy_frame = tracy.trace(@src()); defer tracy_frame.end(); return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context); @@ -521,7 +529,7 @@ inline fn parseColumnInline( } else if (indentation_idx > parent_col) { // Matches parent but there's extra whitespace that doesn't match us. // Log the error. - self.advanceCursor(parent_col); + self.advanceCursor(indentation_idx); try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx); // Fix the indentation. // Here, we fix the indentation by pretending that @@ -529,7 +537,6 @@ inline fn parseColumnInline( // <no op> // Continue parsing. - self.advanceCursor(indentation_idx - parent_col); continue :parse_another_block; } else { // Matches parent exactly or doesn't match parent, return. @@ -736,8 +743,8 @@ fn parseParagraph( parent_col: Column, block_col: Column, ) !Column { - const tracy_frame = tracy.trace(@src()); - defer tracy_frame.end(); + // const tracy_frame = tracy.trace(@src()); + // defer tracy_frame.end(); { const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; @@ -816,9 +823,8 @@ fn parseParagraph( try self.insertTextLine(.space_text, .text, parent_idx, newline); self.advanceCursor(1); } else if (verified_indentation_idx > parent_col) { - self.advanceCursor(parent_col); + self.advanceCursor(indentation_idx); try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx); - self.advanceCursor(indentation_idx - parent_col); const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; try self.insertTextLine(.space_text, .text, parent_idx, newline); self.advanceCursor(1); diff --git a/src/AstGen3.zig b/src/AstGen3.zig @@ -0,0 +1,500 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; + +const ziggy = @import("ziggy"); +const tracy = @import("tracy"); + +const utils = @import("utils.zig"); +const str = @import("str.zig"); +const Ast = @import("Ast.zig"); +const StrOffset = Ast.StrOffset; +const StrLen = Ast.StrLen; +const Node = Ast.Node; +const Error = Ast.Error; +const PaddedSlice = @import("padded_str.zig").PaddedSlice; +const PaddedMany = @import("padded_str.zig").PaddedMany; +const IndentationScanner = @import("IndentationScanner.zig"); +const IndentAlignment = IndentationScanner.IndentAlignment; + +const AstGen = @This(); + +gpa: Allocator, +output_gpa: Allocator, +output_gpa_same_as_gpa: bool, +input: PaddedSlice, +scanner: IndentationScanner, +nodes: std.ArrayListUnmanaged(Node), +errors: std.ArrayListUnmanaged(Error), +extra: std.ArrayListUnmanaged(u32), + +// fn scannerPeek(self: AstGen) ?struct { IndentAlignment, PaddedSlice, StrOffset } { +// const alignment, const line = self.scanner.peek() orelse return null; +// return .{ alignment, line, line.ptr.calcOffsetTo(self.input.ptr) }; +// } + +fn getNode(self: *const AstGen, idx: Node.Idx) *Node { + @setRuntimeSafety(true); + return &self.nodes.items[@intFromEnum(idx)]; +} +fn lastNodeIdx(self: *const AstGen) Node.Idx { + @setRuntimeSafety(true); + return @enumFromInt(self.nodes.items.len - 1); +} +fn nextNodeIdx(self: *const AstGen) Node.Idx { + @setRuntimeSafety(true); + return @enumFromInt(self.nodes.items.len); +} +fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx { + if (self.nodes.items.len > std.math.maxInt( + @typeInfo(Node.Idx).@"enum".tag_type, + )) return error.OutOfNodeIdx; + const idx = self.nodes.items.len; + try self.nodes.append(self.gpa, .fromTagged(node)); + return @enumFromInt(idx); +} +fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx { + self.getNode(parent_idx).incrementNumChildren(); + return try self.appendNode( + @unionInit( + Node.Tagged, + @tagName(container_node_tag), + @as(Node.Tagged.Container, .{ .off = @intCast(self.input.ptr.calcOffsetTo(ptr)) }), + ), + ); +} +fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx { + self.getNode(parent_idx).incrementNumChildren(); + return try self.appendNode( + @unionInit( + Node.Tagged, + @tagName(leaf_node_tag), + @as(Node.Leaf, .{ .off = @intCast(self.input.ptr.calcOffsetTo(ptr)), .len = len }), + ), + ); +} +fn appendError(self: *AstGen, err: Error.Tagged) !void { + if (self.errors.items.len > std.math.maxInt( + @typeInfo(Error.Idx).@"enum".tag_type, + )) return error.OutOfErrorIdx; + try self.errors.append(self.gpa, .fromTagged(err)); +} +fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void { + try self.appendError( + @unionInit( + Error.Tagged, + @tagName(tag), + .{ .idx = idx, .off = @intCast(self.input.ptr.calcOffsetTo(ptr)) }, + ), + ); +} +fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void { + try self.appendError( + @unionInit( + Error.Tagged, + @tagName(tag), + .{ .idx = idx }, + ), + ); +} +fn castStrLen(len: usize, comptime err: anytype) @TypeOf(err)!StrLen { + return if (len <= std.math.maxInt(StrLen)) @intCast(len) else return err; +} + +pub fn deinit(self: *AstGen) void { + self.nodes.deinit(self.gpa); + self.errors.deinit(self.gpa); + self.extra.deinit(self.gpa); +} + +/// Parses mymarkdown +/// +/// : `gpa` +/// + A suitable allocator for scratch allocations that supports free and ideally remap. +/// : `output_gpa` +/// + If passed, no scratch allocations will outlive this function, +/// and any allocations returned will be allocated on this. +/// : `input` +/// + The input slice to be parsed. Must end in at least 1024 \n characters. +/// +/// Errors: +/// : `IndentationTooLong` +/// + This implementation of mymarkdown supports up to 1023 characters of indentation. +pub fn parse( + gpa: Allocator, + output_gpa: ?Allocator, + input_: []const u8, +) !Ast { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + + const input: PaddedSlice = try .init(input_); + if (input.len >= std.math.maxInt(u32)) + return error.InputTooLarge; + + var ast: AstGen = .{ + .gpa = gpa, + .output_gpa = output_gpa orelse gpa, + .output_gpa_same_as_gpa = output_gpa == null, + .input = input, + .scanner = .init(input), + .nodes = .empty, + .errors = .empty, + .extra = .empty, + }; + defer ast.deinit(); + + try ast.parseRoot(); + + std.sort.pdq(Error, ast.errors.items, {}, struct { + fn func(_: void, lhs: Error, rhs: Error) bool { + return @intFromEnum(lhs.get(.idx)) < @intFromEnum(rhs.get(.idx)); + } + }.func); + + if (output_gpa) |gpa2| { + return .{ + .nodes = try gpa2.dupe(Node, ast.nodes.items), + .errors = try gpa2.dupe(Error, ast.errors.items), + .extra = try gpa2.dupe(u32, ast.extra.items), + }; + } else { + return .{ + .nodes = try ast.nodes.toOwnedSlice(gpa), + .errors = try ast.errors.toOwnedSlice(gpa), + .extra = try ast.extra.toOwnedSlice(gpa), + }; + } +} + +const ParsingContext = enum { block_context, inline_context }; + +fn parseRoot(self: *AstGen) !void { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + const root = try self.appendNode(.{ .document = .{} }); + assert(root == .root); + if (self.scanner.peek()) |p| assert(self.input.ptr._ptr == p.@"1".ptr._ptr); + + // "inline" hack to get different branch predictors for the root column + _ = try self.parseColumnImpl(root); + assert(self.scanner.peek() == null); +} + +fn parseColumn( + self: *AstGen, + parent_idx: Node.Idx, +) (error{ + TooMuchIndentation, + MarkerTooLong, + OutOfNodeIdx, + OutOfErrorIdx, +} || Allocator.Error)!void { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + return self.parseColumnImpl(parent_idx); +} + +/// Used in headings, semicolon paragraphs, etc. +/// Has different rules as regular unmarked paragraphs, because +/// unmarked paragraphs are interrupted by block elements and +/// do not consume the whole column, whereas here, we ignore +/// block special characters. +inline fn parseInlineColumn( + self: *AstGen, + parent_idx: Node.Idx, +) !void { + const tracy_frame = tracy.trace(@src()); + defer tracy_frame.end(); + // TODO: Inline parsing + next_paragraph: { + if (self.scanner.peek()) |peek| { + const alignment, const line = peek; + try self.insertTextLine(.text, .text, parent_idx, line); + if (alignment == .misaligned) + try self.appendPointError(.inconsistent_indentation, parent_idx, line.ptr); + self.scanner.advance(); + } + while (self.scanner.peek()) |peek| { + const alignment, const line = peek; + if (line.len == line.indexOfNotSpaceOrTab()) + break :next_paragraph; + try self.insertTextLine(.space_text, .text, parent_idx, line); + if (alignment == .misaligned) + try self.appendPointError(.inconsistent_indentation, parent_idx, line.ptr); + self.scanner.advance(); + } + return; + } + next_paragraph: while (true) { + while (self.scanner.peek()) |peek| { + _, const line = peek; + if (line.len != line.indexOfNotSpaceOrTab()) break; + self.scanner.advance(); + } + if (self.scanner.peek()) |peek| { + const alignment, const line = peek; + try self.insertTextLine(.space_text, .text, parent_idx, line); + try self.appendNodeError(.unexpected_block_in_inline_context, self.lastNodeIdx()); + if (alignment == .misaligned) + try self.appendPointError(.inconsistent_indentation, parent_idx, line.ptr); + self.scanner.advance(); + } + while (self.scanner.peek()) |peek| { + const alignment, const line = peek; + if (line.len == line.indexOfNotSpaceOrTab()) + continue :next_paragraph; + try self.insertTextLine(.space_text, .text, parent_idx, line); + if (alignment == .misaligned) + try self.appendPointError(.inconsistent_indentation, parent_idx, line.ptr); + self.scanner.advance(); + } + break; + } +} + +inline fn parseColumnImpl( + self: *AstGen, + parent_idx: Node.Idx, +) !void { + parsing_blocks: while (self.scanner.peek()) |peek| { + const alignment, const raw_line = peek; + + // Strip leading whitespace + const line_start = raw_line.indexOfNone(" \t"); + const overaligned = line_start != 0; + const line = raw_line.sliceOpen(line_start); + + // Skip empty lines + if (line.len == 0) { + self.scanner.advance(); + continue :parsing_blocks; + } + + // Parse one block and get its id so we can attach an error to it if the block was misaligned. + const block_idx: Node.Idx = block_idx: { + switch (line.index(0)) { + // Par-like repeatable markers + inline '-', '.', ':', '#' => |m| { + const marker_len = line.indexOfNotChar(m); + if (comptime m == '-') { + var potential_task_item = line.sliceOpen(marker_len).indexOfNone("[ ]xX"); + while (potential_task_item >= 3 and line.index(marker_len + potential_task_item - 1) == ' ') + potential_task_item -= 1; + if (potential_task_item >= 3 and + line.index(marker_len + potential_task_item - 1) == ']' and + line.index(marker_len + potential_task_item - 3) == '[' and + (line.index(marker_len + potential_task_item - 2) == ' ' or + line.index(marker_len + potential_task_item - 2) == 'x' or + line.index(marker_len + potential_task_item - 2) == 'X') and + std.mem.allEqual(u8, line.toUnpaddedSlice()[marker_len .. marker_len + potential_task_item - 3], ' ')) + { + const block_idx = try self.appendContainerNode(parent_idx, .task_item, line.ptr); + _ = try self.appendLeafNode(block_idx, .marker, line.ptr, try castStrLen(marker_len + potential_task_item, error.MarkerTooLong)); + try self.scanner.indent(.{ .skip = line_start + marker_len + potential_task_item }); + try self.parseInlineColumn(block_idx); + self.scanner.dedent(); + break :block_idx block_idx; + } + } + const block_idx = try self.appendContainerNode(parent_idx, @enumFromInt(m), line.ptr); + _ = try self.appendLeafNode(block_idx, .marker, line.ptr, try castStrLen(marker_len, error.MarkerTooLong)); + try self.scanner.indent(.{ .skip = line_start + marker_len }); + try self.parseInlineColumn(block_idx); + self.scanner.dedent(); + break :block_idx block_idx; + }, + + // Div-like repeatable markers + inline '+' => |m| { + const marker_len = try castStrLen(line.indexOfNotChar(m), error.MarkerTooLong); + const block_idx = try self.appendContainerNode(parent_idx, @enumFromInt(m), line.ptr); + _ = try self.appendLeafNode(block_idx, .marker, line.ptr, marker_len); + try self.scanner.indent(.{ .skip = line_start + marker_len }); + try self.parseColumn(block_idx); + self.scanner.dedent(); + break :block_idx block_idx; + }, + + // Par-like single markers + inline ';' => |m| { + const block_idx = try self.appendContainerNode(parent_idx, @enumFromInt(m), line.ptr); + try self.scanner.indent(.{ .skip = line_start + 1 }); + try self.parseInlineColumn(block_idx); + self.scanner.dedent(); + break :block_idx block_idx; + }, + + // Div-like single markers + inline '>' => |m| { + const block_idx = try self.appendContainerNode(parent_idx, @enumFromInt(m), line.ptr); + try self.scanner.indent(.{ .skip = line_start + 1 }); + try self.parseColumn(block_idx); + self.scanner.dedent(); + break :block_idx block_idx; + }, + + '*' => { + if (std.mem.eql(u8, line.toUnpaddedSlice()[0..3], "***")) { + const after_stars = line.sliceOpen(3); + const skip_whitespace_idx = after_stars.indexOfNone(" \t"); + if (after_stars.index(skip_whitespace_idx) == '\n') { + self.scanner.advance(); + break :block_idx try self.appendLeafNode(parent_idx, .thematic_break, line.ptr, 3); + } + } + }, + + else => {}, + } + + // Parse paragraph + { + // const tracy_frame = tracy.traceNamed(@src(), "paragraph"); + // defer tracy_frame.end(); + const paragraph_idx = try self.appendContainerNode(parent_idx, .paragraph, line.ptr); + try self.insertTextLine(.text, .text, paragraph_idx, line); + self.scanner.advance(); + + while (self.scanner.peek()) |peek2| { + _, const line2 = peek2; + if (line2.len == 0) break; + if (line2.index(line2.indexOfNotSpaceOrTab()) == '\n') break; + if (str.isAnyOf(line2.index(0), "-.:+>#")) break; + if (line2.len >= 3 and std.mem.allEqual(u8, line2.toUnpaddedSlice()[0..3], '*') and line2.index(line2.indexOfNotSpaceOrTab()) == '\n') break; + if (line2.len >= 2 and std.mem.eql(u8, line2.toUnpaddedSlice()[0..2], "==")) break; + if (line2.len >= 3 and std.mem.eql(u8, line2.toUnpaddedSlice()[0..3], "$==")) break; + try self.insertTextLine(.space_text, .text, paragraph_idx, line2); + self.scanner.advance(); + } + + break :block_idx paragraph_idx; + } + }; + + if (alignment == .misaligned or overaligned) { + try self.appendPointError(.inconsistent_indentation, block_idx, raw_line.ptr); + } + } +} + +fn insertTextLine( + self: *AstGen, + comptime first_text_tag: Node.Tag, + comptime rest_text_tag: Node.Tag, + parent_idx: Node.Idx, + line_: PaddedSlice, +) !void { + var line = line_; + if (line.len <= std.math.maxInt(StrLen)) { + _ = try self.appendLeafNode(parent_idx, first_text_tag, line.ptr, @intCast(line.len)); + } else { + @branchHint(.cold); + { + const consumed_len = @min(line.len, std.math.maxInt(StrLen)); + _ = try self.appendLeafNode(parent_idx, first_text_tag, line.ptr, @intCast(consumed_len)); + line = line.sliceOpen(consumed_len); + } + while (line.len > 0) { + const consumed_len = @min(line.len, std.math.maxInt(StrLen)); + _ = try self.appendLeafNode(parent_idx, rest_text_tag, line.ptr, @intCast(consumed_len)); + line = line.sliceOpen(consumed_len); + } + } +} + +// fn parseParagraph( +// self: *AstGen, +// parent_idx: Node.Idx, +// ) !void { +// const tracy_frame = tracy.trace(@src()); +// defer tracy_frame.end(); + +// { +// const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; +// try self.insertTextLine(.text, .text, parent_idx, newline); +// self.advanceCursor(1); +// } + +// while (true) { +// if (self.cursor.len == 0) return 0; + +// const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; +// if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) { +// // block line found, exit +// const verified_indentation_idx = std.mem.indexOfDiff( +// u8, +// self.cursor, +// self.indentation[0..block_col], +// ) orelse unreachable; +// if (verified_indentation_idx == block_col) { +// return @intCast(indentation_idx); +// } else { +// return @intCast(verified_indentation_idx); +// } +// } +// if (self.cursor[indentation_idx] == '*') { +// const after_stars = self.cursor[3..]; +// const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable; +// if (after_stars[skip_whitespace_idx] == '\n') { +// // block line found, exit +// const verified_indentation_idx = std.mem.indexOfDiff( +// u8, +// self.cursor, +// self.indentation[0..block_col], +// ) orelse unreachable; +// if (verified_indentation_idx == block_col) { +// return @intCast(indentation_idx); +// } else { +// return @intCast(verified_indentation_idx); +// } +// } +// } +// // empty line found, consume to next nonwhitespace and exit +// if (self.cursor[indentation_idx] == '\n') { +// self.advanceCursor(indentation_idx + 1); +// while (true) { +// if (self.cursor.len == 0) return 0; + +// const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable; +// if (self.cursor[next_idx] == '\n') { +// self.advanceCursor(next_idx + 1); +// continue; +// } + +// const verified_indentation_idx = std.mem.indexOfDiff( +// u8, +// self.cursor, +// self.indentation[0..block_col], +// ) orelse unreachable; +// if (verified_indentation_idx == block_col) { +// return @intCast(next_idx); +// } else { +// return @intCast(verified_indentation_idx); +// } +// } +// } + +// // verify indentation +// const verified_indentation_idx = std.mem.indexOfDiff( +// u8, +// self.cursor, +// self.indentation[0..block_col], +// ) orelse unreachable; +// if (verified_indentation_idx == block_col) { +// self.advanceCursor(verified_indentation_idx); +// const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; +// try self.insertTextLine(.space_text, .text, parent_idx, newline); +// self.advanceCursor(1); +// } else if (verified_indentation_idx > parent_col) { +// self.advanceCursor(parent_col); +// try self.appendPointErrorAtCursor(.inconsistent_indentation, parent_idx); +// self.advanceCursor(indentation_idx - parent_col); +// const newline = str.indexOfChar(self.cursor, '\n') orelse unreachable; +// try self.insertTextLine(.space_text, .text, parent_idx, newline); +// self.advanceCursor(1); +// } else { +// return @intCast(verified_indentation_idx); +// } +// } +// } diff --git a/src/IndentationScanner.zig b/src/IndentationScanner.zig @@ -189,7 +189,7 @@ _rest_lines: PaddedSlice, // Methods Methods Methods Methods Methods /// Create an IndentationScanner that tokenizes the given input. -pub fn init(input_: []const u8) error{InputNotPadded}!IndentationScanner { +pub fn initUnpadded(input_: []const u8) error{InputNotPadded}!IndentationScanner { const input: PaddedSlice = try .init(input_); const first_line, const rest_lines = input.splitOneLine(); return .{ @@ -203,9 +203,23 @@ pub fn init(input_: []const u8) error{InputNotPadded}!IndentationScanner { }; } +/// Create an IndentationScanner that tokenizes the given input. +pub fn init(input: PaddedSlice) IndentationScanner { + const first_line, const rest_lines = input.splitOneLine(); + return .{ + ._expected_indents = while (true) { + var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{}; + out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 }); + break out; + }, + ._cur_line = if (first_line) |line| .{ .root, 0, line } else null, + ._rest_lines = rest_lines, + }; +} + /// Peek at the current line's contents, with indentation stripped. /// If indentation matches the parent or less, null is emitted. -pub fn peek(it: IndentationScanner) PeekResult { +pub fn peek(it: *const IndentationScanner) PeekResult { const level, const spaces, const content = it._cur_line orelse return null; if (level.compare(.lt, it.curLevel())) return null; assert(level.compare(.eq, it.curLevel())); @@ -222,6 +236,12 @@ pub fn advance(it: *IndentationScanner) void { it._cur_line = null; const line_maybe, it._rest_lines = it._rest_lines.splitOneLine(); const line = line_maybe orelse return; + const spaces_or_tabs = line.indexOfNotSpaceOrTab(); + if (spaces_or_tabs == line.len) { + // Lie to the caller -- pretend empty lines have been indented to the correct level + it._cur_line = .{ it.curLevel(), it.curExpectedIndents().correct_start, line.sliceOpen(spaces_or_tabs) }; + return; + } const spaces = line.indexOfNotSpace(); it._cur_line = for (0..it._expected_indents.len) |rev_i| { const i = it._expected_indents.len - 1 - rev_i; @@ -273,11 +293,11 @@ pub fn dedent(it: *IndentationScanner) void { // ======================================= // Helpers Helpers Helpers Helpers Helpers -fn curLevel(it: IndentationScanner) IndentLevel { +fn curLevel(it: *const IndentationScanner) IndentLevel { return IndentLevel.from(it._expected_indents.len - 1).?; } -fn curExpectedIndents(it: IndentationScanner) ExpectedIndent { +fn curExpectedIndents(it: *const IndentationScanner) ExpectedIndent { return it._expected_indents.buffer[it._expected_indents.len - 1]; } @@ -350,11 +370,12 @@ inline fn expectEqualPeekResults(expected: PeekResult, actual: PeekResult) !void } test IndentationScanner { - const input = + const input_ = \\line 1 \\> \\ line 3 \\> line 4 + \\ \\ line 5 \\ line 6 \\line 7 @@ -373,9 +394,10 @@ test IndentationScanner { \\ > line 20 \\ ; - const padded_input = try PaddedSlice.pad(testing.allocator, input); - defer testing.allocator.free(padded_input); - var it: IndentationScanner = try .init(padded_input); + const input_with_padding = try PaddedSlice.pad(testing.allocator, input_); + defer testing.allocator.free(input_with_padding); + const input = try PaddedSlice.init(input_with_padding); + var it: IndentationScanner = .init(input); try expectEqualDeep([]const ExpectedIndent, &.{ .{ .misalignment_start = 0, .correct_start = 0 }, @@ -409,6 +431,9 @@ test IndentationScanner { try expectEqualPeekResults(.{ .correct, .from("line 4") }, it.peek()); it.advance(); + try expectEqualPeekResults(.{ .correct, .from("") }, it.peek()); + + it.advance(); try expectEqualPeekResults(.{ .correct, .from("line 5") }, it.peek()); it.advance(); diff --git a/src/main.zig b/src/main.zig @@ -6,64 +6,158 @@ const mymarkdown = @import("mymarkdown"); const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{}); const ArenaAllocator = std.heap.ArenaAllocator; +fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8) { + const stdin = std.io.getStdIn(); + const tracy_frame = tracy.namedFrame("reading input"); + defer tracy_frame.end(); + if (stdin.stat()) |stat| { + if (stat.size > 0) { + var al: std.ArrayList(u8) = try .initCapacity(arena, stat.size + 1024); + try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); + try al.appendNTimes('\n', 1024); + return al; + } + } else |_| {} + var al: std.ArrayList(u8) = try .initCapacity(gpa, 4096); + errdefer al.deinit(); + try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); + try al.appendNTimes('\n', 1024); + return al; +} + pub fn main() !void { var gpa: GeneralPurposeAllocator = .{}; var arena: ArenaAllocator = .init(gpa.allocator()); defer arena.deinit(); - const stdin = std.io.getStdIn(); - var input_arraylist = blk: { - const tracy_frame = tracy.namedFrame("reading input"); - defer tracy_frame.end(); - if (stdin.stat()) |stat| { - if (stat.size > 0) { - var al: std.ArrayList(u8) = try .initCapacity(arena.allocator(), stat.size + 1024); - try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); - try al.appendNTimes('\n', 1024); - break :blk al; - } - } else |_| {} - var al: std.ArrayList(u8) = try .initCapacity(gpa.allocator(), 4096); - errdefer al.deinit(); - try stdin.reader().readAllArrayList(&al, std.math.maxInt(u32) - 1024); - try al.appendNTimes('\n', 1024); - break :blk al; + const args = try std.process.argsAlloc(arena.allocator()); + const run1, const run2, const run3 = blk: { + var run1, var run2, var run3 = .{ false, false, false }; + for (args) |arg| { + if (std.mem.eql(u8, arg, "--run1")) + run1 = true; + if (std.mem.eql(u8, arg, "--run2")) + run2 = true; + if (std.mem.eql(u8, arg, "--run3")) + run3 = true; + } + break :blk .{ run1, run2, run3 }; }; + + var input_arraylist = try readInput(gpa.allocator(), arena.allocator()); defer input_arraylist.deinit(); const input = input_arraylist.items; - // const ast = blk: { - // const tracy_frame = tracy.namedFrame("parse 0"); - // defer tracy_frame.end(); - // break :blk try mymarkdown.parse( - // gpa.allocator(), - // arena.allocator(), - // input, - // ); - // }; - const ast2 = blk: { - const tracy_frame = tracy.namedFrame("parse 1"); - defer tracy_frame.end(); - break :blk try mymarkdown.parse2( - gpa.allocator(), - arena.allocator(), - input, - ); - }; + for (0..10) |_| { + if (run1) + std.mem.doNotOptimizeAway(blk: { + const tracy_frame = tracy.namedFrame("parse 1"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse( + gpa.allocator(), + arena.allocator(), + input, + ); + }); + if (run2) + std.mem.doNotOptimizeAway(blk: { + const tracy_frame = tracy.namedFrame("parse 2"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse2( + gpa.allocator(), + arena.allocator(), + input, + ); + }); + if (run3) + std.mem.doNotOptimizeAway(blk: { + const tracy_frame = tracy.namedFrame("parse 3"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse3( + gpa.allocator(), + arena.allocator(), + input, + ); + }); + } + + if (!run1 and !run2 and !run3) { + const ast = blk: { + const tracy_frame = tracy.namedFrame("parse 1"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse( + gpa.allocator(), + arena.allocator(), + input, + ); + }; + const ast2 = blk: { + const tracy_frame = tracy.namedFrame("parse 2"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse2( + gpa.allocator(), + arena.allocator(), + input, + ); + }; + const ast3 = blk: { + const tracy_frame = tracy.namedFrame("parse 3"); + defer tracy_frame.end(); + break :blk try mymarkdown.parse3( + gpa.allocator(), + arena.allocator(), + input, + ); + }; + + var render_arraylist1: std.ArrayList(u8) = .init(gpa.allocator()); + defer render_arraylist1.deinit(); + { + std.debug.print("Rendering 1\n", .{}); + const tracy_frame = tracy.namedFrame("check render 1"); + defer tracy_frame.end(); + _ = try ast.renderAst(render_arraylist1.writer(), input); + } + var render_arraylist2: std.ArrayList(u8) = .init(gpa.allocator()); + defer render_arraylist2.deinit(); + { + std.debug.print("Rendering 2\n", .{}); + const tracy_frame = tracy.namedFrame("check render 2"); + defer tracy_frame.end(); + _ = try ast2.renderAst(render_arraylist2.writer(), input); + } + var render_arraylist3: std.ArrayList(u8) = .init(gpa.allocator()); + defer render_arraylist3.deinit(); + { + std.debug.print("Rendering 3\n", .{}); + const tracy_frame = tracy.namedFrame("check render 3"); + defer tracy_frame.end(); + _ = try ast3.renderAst(render_arraylist3.writer(), input); + } + std.debug.print("Testing 1 vs 3\n", .{}); + try std.testing.expectEqualStrings(render_arraylist1.items, render_arraylist3.items); + std.debug.print("Testing 2 vs 3\n", .{}); + try std.testing.expectEqualStrings(render_arraylist2.items, render_arraylist3.items); - var bw = std.io.bufferedWriter(std.io.getStdOut().writer()); - const stdout = bw.writer(); - // { - // const tracy_frame = tracy.namedFrame("render 0"); - // defer tracy_frame.end(); - // _ = try ast.renderAst(stdout, input); - // } - { - const tracy_frame = tracy.namedFrame("render 1"); - defer tracy_frame.end(); - _ = try ast2.renderAst(stdout, input); + // var bw = std.io.bufferedWriter(std.io.getStdOut().writer()); + // const stdout = bw.writer(); + // { + // const tracy_frame = tracy.namedFrame("render 1"); + // defer tracy_frame.end(); + // _ = try ast.renderAst(stdout, input); + // } + // { + // const tracy_frame = tracy.namedFrame("render 2"); + // defer tracy_frame.end(); + // _ = try ast2.renderAst(stdout, input); + // } + // { + // const tracy_frame = tracy.namedFrame("render 3"); + // defer tracy_frame.end(); + // _ = try ast3.renderAst(stdout, input); + // } + // try bw.flush(); } - try bw.flush(); if (tracy.enable) { tracy.frameMarkNamed("waiting for tracy"); diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig @@ -35,6 +35,12 @@ pub fn PaddedSlice(comptime PADDING_: usize) type { }; } + pub fn sliceRange(self: @This(), start: usize, end: usize) @This() { + assert(start <= end); + assert(end <= self.len); + return self.ptr.sliceRange(start, end); + } + pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 { const padded_input = try gpa.alloc(u8, input.len + PADDING); @memcpy(padded_input[0..input.len], input); @@ -52,6 +58,10 @@ pub fn PaddedSlice(comptime PADDING_: usize) type { }; } + pub fn indexOfNotSpaceOrTab(self: @This()) usize { + return self.ptr.indexOfNotSpaceOrTab(); + } + pub fn indexOfNotSpace(self: @This()) usize { return self.ptr.indexOfNotSpace(); } @@ -76,6 +86,18 @@ pub fn PaddedSlice(comptime PADDING_: usize) type { }, }; } + + pub fn indexOfNone(self: @This(), comptime charset: []const u8) usize { + return self.ptr.indexOfNone(charset); + } + + pub fn indexOfAny(self: @This(), comptime charset: []const u8) usize { + return self.ptr.indexOfAny(charset); + } + + pub fn indexOfNotChar(self: @This(), comptime char: u8) usize { + return self.ptr.indexOfNone(&.{char}); + } }; } @@ -96,11 +118,23 @@ pub fn PaddedMany(comptime PADDING_: usize) type { return .{ ._ptr = ptr._ptr[start..] }; } + pub fn sliceRange(self: @This(), start: usize, end: usize) PaddedSlice(PADDING) { + return .{ + .ptr = self.sliceOpen(start), + .len = end - start, + }; + } + pub fn calcOffsetTo(ptr: @This(), other: @This()) usize { return other._ptr - ptr._ptr; } - fn simd_maybe() ?type { + fn simd_maybe( + /// How many loads to do in parallel in 1 loop iteration. + /// Consider increasing if your input distribution contains long strings. + /// Note that this may not have an effect if PADDING is not large enough. + comptime MAX_UNROLL: usize, + ) ?type { if (switch (@import("builtin").zig_backend) { .stage2_llvm, .stage2_c => true, else => false, @@ -109,10 +143,6 @@ pub fn PaddedMany(comptime PADDING_: usize) type { if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| { return struct { const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING); - /// How many loads to do in parallel in 1 loop iteration. - /// Consider increasing if your input distribution contains long strings. - /// Note that this may not have an effect if PADDING is not large enough. - const MAX_UNROLL = 1; const BLOCK_LEN = @min( MAX_UNROLL * BLOCK_LEN__, @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__, @@ -140,13 +170,17 @@ pub fn PaddedMany(comptime PADDING_: usize) type { pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize { if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null) @compileError("charset must not contain newline, otherwise this op is potentially unsafe."); - return ptr.indexOfNoneUnsafe(charset); + return ptr.indexOfNoneUnsafe(charset, .{ .max_unroll = 3 }); } - pub fn indexOfNoneUnsafe(ptr: @This(), comptime charset: []const u8) usize { + pub fn indexOfNoneUnsafe( + ptr: @This(), + comptime charset: []const u8, + comptime opts: struct { max_unroll: usize = 1 }, + ) usize { var it = ptr._ptr; if (!@inComptime()) { - if (simd_maybe()) |simd| { + if (simd_maybe(opts.max_unroll)) |simd| { const splats: [charset.len]simd.CharBlock = comptime blk: { var splats_: []const simd.CharBlock = &.{}; for (charset) |c| { @@ -188,13 +222,17 @@ pub fn PaddedMany(comptime PADDING_: usize) type { pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize { if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null) @compileError("charset must contain newline, otherwise this op is potentially unsafe."); - return ptr.indexOfAnyUnsafe(charset); + return ptr.indexOfAnyUnsafe(charset, .{ .max_unroll = 1 }); } - pub fn indexOfAnyUnsafe(ptr: @This(), comptime charset: []const u8) usize { + pub fn indexOfAnyUnsafe( + ptr: @This(), + comptime charset: []const u8, + comptime opts: struct { max_unroll: usize = 1 }, + ) usize { var it = ptr._ptr; if (!@inComptime()) { - if (simd_maybe()) |simd| { + if (simd_maybe(opts.max_unroll)) |simd| { const splats: [charset.len]simd.CharBlock = comptime blk: { var splats_: []const simd.CharBlock = &.{}; for (charset) |c| { @@ -230,8 +268,16 @@ pub fn PaddedMany(comptime PADDING_: usize) type { } } + pub fn indexOfNotChar(ptr: @This(), comptime char: u8) usize { + return ptr.indexOfNone(&.{char}); + } + + pub fn indexOfNotSpaceOrTab(ptr: @This()) usize { + return ptr.indexOfNone(" \t"); + } + pub fn indexOfNotSpace(ptr: @This()) usize { - return ptr.indexOfNone(" "); + return ptr.indexOfNotChar(' '); } pub fn indexOfNewline(ptr: @This()) usize { diff --git a/src/root.zig b/src/root.zig @@ -3,12 +3,16 @@ const std = @import("std"); pub const Ast = @import("Ast.zig"); pub const AstGen = @import("AstGen.zig"); pub const AstGen2 = @import("AstGen2.zig"); +pub const AstGen3 = @import("AstGen3.zig"); pub const parse = AstGen.parse; pub const parse2 = AstGen2.parse; +pub const parse3 = AstGen3.parse; test { _ = @import("test/test.zig"); _ = Ast; _ = AstGen; + _ = AstGen2; + _ = AstGen3; _ = @import("IndentationScanner.zig"); } diff --git a/src/utils.zig b/src/utils.zig @@ -57,20 +57,20 @@ pub fn Packed(comptime Tagged_: type) type { .decls = &.{}, } }); - pub fn fromTagged(tagged: Tagged_) Self { - switch (@as(Tag, tagged)) { - inline else => |t| return .{ + pub inline fn fromTagged(tagged: Tagged_) Self { + switch (tagged) { + inline else => |data, tag| return .{ .tag = tagged, .data = @unionInit( Data, - @tagName(t), - @field(tagged, @tagName(t)), + @tagName(tag), + data, ), }, } } - pub fn toTagged(self: Self) Tagged_ { + pub inline fn toTagged(self: Self) Tagged_ { switch (self.tag) { inline else => |t| return @unionInit( Tagged_,