mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

commit a293f5b6aea00d2a3465b0aa4a513953b60f4bfa
parent a274d6f4377b2e6de289fcb2c7b2aa0f42d99903
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Fri, 16 May 2025 05:37:57 +0800

v-1

Diffstat:
M src/AstGen.zig | 216 ++++++++++++++++++++++++++++++++-----------------------------------------------
1 file changed, 87 insertions(+), 129 deletions(-)

diff --git a/src/AstGen.zig b/src/AstGen.zig @@ -10,7 +10,6 @@ const Node = Ast.Node; const Error = Ast.Error; input_base: [*]u8, -input: []u8, nodes: std.ArrayListUnmanaged(Node), errors: std.ArrayListUnmanaged(Error), extra: std.ArrayListUnmanaged(u32), @@ -46,20 +45,14 @@ pub fn deinit(self: *AstGen, gpa: Allocator) void { } pub fn parse(gpa: Allocator, output_gpa: ?Allocator, input: []const u8) error{ InputTooLarge, OutOfMemory }!Ast { - if (input.len > std.math.maxInt(u32) - 1) { + if (input.len > std.math.maxInt(u32)) { return error.InputTooLarge; } - // const input_copy = input; - // const input_copy = try gpa.dupe(u8, input); - // defer gpa.free(input_copy); - var input_copy_arraylist: std.ArrayListUnmanaged(u8) = .empty; - defer input_copy_arraylist.deinit(gpa); - try input_copy_arraylist.ensureTotalCapacityPrecise(gpa, input.len + 2); - + const input_copy = try gpa.dupe(u8, input); + defer gpa.free(input_copy); var ast: AstGen = .{ - .input_base = input_copy_arraylist.items.ptr, - .input = undefined, + .input_base = input_copy.ptr, .nodes = .empty, .errors = .empty, .extra = .empty, @@ -67,30 +60,15 @@ pub fn parse(gpa: Allocator, output_gpa: ?Allocator, input: []const u8) error{ I defer ast.deinit(gpa); const root = try ast.appendNode(gpa, .{ .document = .{} }); - var lines: std.ArrayListUnmanaged(Ast.StrOffset) = .empty; + var lines: std.ArrayListUnmanaged([]u8) = .empty; defer lines.deinit(gpa); - // var lines: std.ArrayListUnmanaged([]u8) = .empty; - // defer lines.deinit(gpa); - var lines_it = std.mem.splitScalar(u8, input, '\n'); + var lines_it = std.mem.splitScalar(u8, input_copy, '\n'); var maybe_line: ?[]u8 = @constCast(lines_it.first()); while (maybe_line) |line| : (maybe_line = @constCast(lines_it.next())) { - if (str.lastIndexOfNone(line, " \t\r\n")) |idx| { - const old_len = input_copy_arraylist.items.len; - try lines.append(gpa, @intCast(old_len)); - input_copy_arraylist.appendSliceAssumeCapacity(line); - 
input_copy_arraylist.appendAssumeCapacity('\n'); - input_copy_arraylist.items[old_len + idx + 1] = '\n'; - // try lines.append(gpa, input_copy_arraylist.items[old_len .. old_len + idx + 1]); - } else { - try lines.append(gpa, @intCast(input_copy_arraylist.items.len)); - input_copy_arraylist.appendAssumeCapacity('\n'); - // try lines.append(gpa, &.{}); - } + try lines.append(gpa, line); } - input_copy_arraylist.appendAssumeCapacity('\n'); - ast.input = input_copy_arraylist.items; - // stripTrailingWhitespace(&lines.items); + stripTrailingWhitespace(&lines.items); try ast.parseColumn(gpa, lines.items, root); @@ -125,51 +103,47 @@ fn calcOffset(self: *AstGen, c: *u8) u32 { return @intCast(c - self.input_base); } -fn findIndentedColumn(self: *AstGen, gpa: Allocator, lines_: []u32, node_idx: Node.Idx) ![]u32 { +fn findIndentedColumn(self: *AstGen, gpa: Allocator, lines_: [][]u8, node_idx: Node.Idx) ![][]u8 { var lines = lines_; // empty lines at the start of the inline block are fine, just skip these // special case: the first line consist of only whitespace // because they may have been introduced via marker replacement - if (lines.len > 0) - if (str.indexOfNone(self.input[lines[0]..], " \t\r")) |idx| - if (self.input[lines[0] + idx] == '\n') { - lines = lines[1..]; - while (true) : (lines = lines[1..]) { - if (lines.len == 0) return &.{}; - if (self.input[lines[0]] != '\n') break; - } - }; - if (lines.len == 0) return &.{}; + if (lines.len > 0 and str.indexOfNone(lines[0], " \t\r\n") == null) lines = lines[1..]; + while (true) : (lines = lines[1..]) { + if (lines.len == 0) return &.{}; + if (lines[0].len != 0) break; + } // determine indentation - const indentation_idx = str.indexOfNone(self.input[lines[0]..], " \t\r") orelse unreachable; + const indentation_idx = str.indexOfNone(lines[0], " \t\r\n") orelse unreachable; if (indentation_idx == 0) return &.{}; - const indentation = self.input[lines[0] .. 
lines[0] + indentation_idx]; + + const indentation = lines[0][0..indentation_idx]; // strip all lines of their indentation - lines[0] += @truncate(indentation.len); + lines[0] = lines[0][indentation.len..]; for (lines[1..]) |*line| { - if (self.input[line.*] == '\n') continue; + if (line.len == 0) continue; - const diff_idx = std.mem.indexOfDiff(u8, self.input[line.*..], indentation) orelse unreachable; - // std.debug.assert(diff_idx != line.len); + const diff_idx = std.mem.indexOfDiff(u8, line.*, indentation) orelse unreachable; + std.debug.assert(diff_idx != line.len); if (diff_idx != indentation.len) { try self.errors.append(gpa, .fromTagged(.{ - .inconsistent_indentation = .{ .idx = node_idx, .off = line.* }, + .inconsistent_indentation = .{ .idx = node_idx, .off = self.calcOffset(&line.*[0]) }, })); // Recover by stripping all whitespace on this line - const recover_indentation_idx = std.mem.indexOfNone(u8, self.input[line.*..], " \t\r") orelse unreachable; - line.* += @truncate(recover_indentation_idx); + const recover_indentation_idx = std.mem.indexOfNone(u8, line.*, " \t\r\n") orelse unreachable; + line.* = line.*[recover_indentation_idx..]; } else { - line.* += @truncate(indentation.len); + line.* = line.*[indentation.len..]; } } return lines; } -fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, parent_idx: Node.Idx) !void { +fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: Node.Idx) !void { var lines = lines_; var empty_line_off: ?u32 = null; @@ -177,46 +151,39 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, pare // empty lines at the start of the inline block are fine, just skip these // special case: the first line consist of only whitespace // because they may have been introduced via marker replacement - if (lines.len > 0) - if (str.indexOfNone(self.input[lines[0]..], " \t\r")) |idx| - if (self.input[lines[0] + idx] == '\n') { - lines = lines[1..]; - while (true) : 
(lines = lines[1..]) { - if (lines.len == 0) break :outer; - if (self.input[lines[0]] != '\n') break; - } - }; - if (lines.len == 0) break :outer; + if (lines.len > 0 and str.indexOfNone(lines[0], " \t\r\n") == null) lines = lines[1..]; + while (true) : (lines = lines[1..]) { + if (lines.len == 0) break :outer; + if (lines[0].len != 0) break; + } self.getNode(parent_idx).incrementNumChildren(); // determine indentation - const indentation_idx = str.indexOfNone(self.input[lines[0]..], " \t\r") orelse unreachable; - const indentation = self.input[lines[0] .. lines[0] + indentation_idx]; + const indentation_idx = str.indexOfNone(lines[0], " \t\r\n") orelse unreachable; + const indentation = lines[0][0..indentation_idx]; - lines[0] += @truncate(indentation.len); - // lines[0] = lines[0][indentation.len..]; + lines[0] = lines[0][indentation.len..]; - var len = str.indexOfChar(self.input[lines[0]..], '\n') orelse unreachable; - if (len <= std.math.maxInt(Ast.StrLen)) { + if (lines[0].len <= std.math.maxInt(Ast.StrLen)) { _ = try self.appendNode(gpa, .{ .text = .{ - .off = lines[0], - .len = @truncate(len), + .off = self.calcOffset(&lines[0][0]), + .len = @intCast(lines[0].len), }, }); } else { @branchHint(.cold); - while (len > 0) { - const chunk_len = @min(len, std.math.maxInt(Ast.StrLen)); + var line = lines[0]; + while (line.len > 0) { + const len = @min(line.len, std.math.maxInt(Ast.StrLen)); _ = try self.appendNode(gpa, .{ .text = .{ - .off = lines[0], - .len = chunk_len, + .off = self.calcOffset(&lines[0][0]), + .len = @intCast(len), }, }); - lines[0] += chunk_len; - len -= chunk_len; + line = line[len..]; } } lines = lines[1..]; @@ -225,9 +192,9 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, pare // Skip and error on empty lines while (true) : (lines = lines[1..]) { if (lines.len == 0) break :outer; - if (self.input[lines[0]] != '\n') break; + if (lines[0].len != 0) break; // empty line detected - empty_line_off = lines[0]; + 
empty_line_off = self.calcOffset(@ptrCast(lines[0].ptr)); } if (empty_line_off) |off| { @@ -236,49 +203,47 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, pare })); } - const diff_idx = std.mem.indexOfDiff(u8, self.input[lines[0]..], indentation) orelse unreachable; - // std.debug.assert(diff_idx != lines[0].len); + const diff_idx = std.mem.indexOfDiff(u8, lines[0], indentation) orelse unreachable; + std.debug.assert(diff_idx != lines[0].len); if (diff_idx != indentation.len) { try self.errors.append(gpa, .fromTagged(.{ - .inconsistent_indentation = .{ .idx = self.nextNodeIdx(), .off = lines[0] }, + .inconsistent_indentation = .{ .idx = self.nextNodeIdx(), .off = self.calcOffset(&lines[0][0]) }, })); // Recover by stripping all whitespace on this line - const recover_indentation_idx = std.mem.indexOfNone(u8, self.input[lines[0]..], " \t\r\n") orelse unreachable; - lines[0] += @truncate(recover_indentation_idx); + const recover_indentation_idx = std.mem.indexOfNone(u8, lines[0], " \t\r\n") orelse unreachable; + lines[0] = lines[0][recover_indentation_idx..]; } else { - lines[0] += @truncate(indentation.len); + lines[0] = lines[0][indentation.len..]; } self.getNode(parent_idx).incrementNumChildren(); - var len2 = str.indexOfChar(self.input[lines[0]..], '\n') orelse unreachable; - if (len2 <= std.math.maxInt(Ast.StrLen)) { + if (lines[0].len <= std.math.maxInt(Ast.StrLen)) { _ = try self.appendNode(gpa, .{ .space_text = .{ - .off = lines[0], - .len = @truncate(len2), + .off = self.calcOffset(&lines[0][0]), + .len = @intCast(lines[0].len), }, }); } else { @branchHint(.cold); + var line = lines[0]; _ = try self.appendNode(gpa, .{ .space_text = .{ - .off = lines[0], - .len = std.math.maxInt(Ast.StrLen), + .off = self.calcOffset(&lines[0][0]), + .len = @intCast(std.math.maxInt(Ast.StrLen)), }, }); - len2 -= std.math.maxInt(Ast.StrLen); - lines[0] += std.math.maxInt(Ast.StrLen); - while (len2 > 0) { - const chunk_len = @min(len2, 
std.math.maxInt(Ast.StrLen)); + line = line[std.math.maxInt(Ast.StrLen)..]; + while (line.len > 0) { + const len = @min(line.len, std.math.maxInt(Ast.StrLen)); _ = try self.appendNode(gpa, .{ .text = .{ - .off = lines[0], - .len = chunk_len, + .off = self.calcOffset(&lines[0][0]), + .len = @intCast(len), }, }); - lines[0] += chunk_len; - len2 -= chunk_len; + line = line[len..]; } } lines = lines[1..]; @@ -286,24 +251,17 @@ fn parseInlineBlock(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, pare } } -fn parseColumn(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, parent_idx: Node.Idx) !void { +fn parseColumn(self: *AstGen, gpa: Allocator, lines_: [][]u8, parent_idx: Node.Idx) !void { var lines = lines_; outer: while (true) { // Skip empty lines // special case: the first line consist of only whitespace // because they may have been introduced via marker replacement - if (lines.len > 0) { - if (str.indexOfNone(self.input[lines[0]..], " \t\r")) |idx| { - if (self.input[lines[0] + idx] == '\n') { - lines = lines[1..]; - while (true) : (lines = lines[1..]) { - if (lines.len == 0) break :outer; - if (self.input[lines[0]] != '\n') break; - } - } - } + if (lines.len > 0 and str.indexOfNone(lines[0], " \t\r\n") == null) lines = lines[1..]; + while (true) : (lines = lines[1..]) { + if (lines.len == 0) break :outer; + if (lines[0].len != 0) break; } - if (lines.len == 0) break :outer; // Use first character to determine marker const mode, const child = try self.parseBlockStart(gpa, lines[0]); @@ -314,8 +272,8 @@ fn parseColumn(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, parent_id // take indented or non-block-marker lines var num_lines: usize = 1; for (lines[1..]) |line| { - if (self.input[line] == '\n') break; - if (block_specs[self.input[line]] != null) break; + if (line.len == 0) break; + if (block_specs[line[0]] != null) break; num_lines += 1; } @@ -327,7 +285,7 @@ fn parseColumn(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, parent_id 
// take indented or empty lines var num_lines: usize = 1; for (lines[1..]) |line| { - if (str.isNoneOf(self.input[line], " \t\r\n")) break; + if (line.len != 0 and str.isNoneOf(line[0], " \t\r\n")) break; num_lines += 1; } @@ -339,7 +297,7 @@ fn parseColumn(self: *AstGen, gpa: Allocator, lines_: []Ast.StrOffset, parent_id // take indented or empty lines var num_lines: usize = 1; for (lines[1..]) |line| { - if (str.isNoneOf(self.input[line], " \t\r\n")) break; + if (line.len != 0 and std.mem.indexOfScalar(u8, " \t\r\n", line[0]) == null) break; num_lines += 1; } @@ -459,26 +417,26 @@ const block_specs = blockSpecs(struct { /// Appends the suitable block node to the ast, /// then returns how parsing should proceed for the children of this block. /// Also returns the idx of the container node created. -fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { ParseMode, Node.Idx } { - switch (self.input[line]) { +fn parseBlockStart(self: *AstGen, gpa: Allocator, line: []u8) !struct { ParseMode, Node.Idx } { + switch (line[0]) { inline else => |c| { const spec_or_null = block_specs[c]; if (spec_or_null) |spec| { inline for (spec) |case| { switch (case.marker) { .exact, .starts_with => |marker| { - if (std.mem.startsWith(u8, self.input[line..], marker)) { + if (std.mem.startsWith(u8, line, marker)) { const node = if (case.mode == .no_children) try self.appendNode(gpa, @unionInit(Node.Tagged, @tagName(case.tag), @as(Node.Tagged.Leaf, .{ - .off = line, + .off = self.calcOffset(&line[0]), .len = marker.len, }))) else try self.appendNode(gpa, @unionInit(Node.Tagged, @tagName(case.tag), @as(Node.Tagged.Container, .{ - .off = line, + .off = self.calcOffset(&line[0]), .num_children = if (case.store_marker_child == .store) 1 else 0, }))); - @memset(self.input[line .. 
line + marker.len], ' '); + @memset(line[0..marker.len], ' '); if (case.store_marker_child == .store) { _ = try self.appendNode(gpa, .{ .marker = .{ - .off = line, + .off = self.calcOffset(&line[0]), .len = case.marker.len, } }); } @@ -486,14 +444,14 @@ fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { } }, .starts_with_multi => |marker_spec| { - var marker_len = str.indexOfNotChar(self.input[line..], marker_spec.marker_char) orelse str.indexOfChar(self.input[line..], '\n') orelse unreachable; + var marker_len = str.indexOfNotChar(line, marker_spec.marker_char) orelse line.len; inline for (marker_spec.extra) |extra| { - if (std.mem.startsWith(u8, self.input[line + marker_len ..], extra)) { + if (std.mem.startsWith(u8, line[marker_len..], extra)) { marker_len += extra.len; const node = try self.appendNode(gpa, @unionInit(Node.Tagged, @tagName(case.tag), @as(Node.Tagged.Container, .{ - .off = line, + .off = self.calcOffset(&line[0]), .num_children = if (case.store_marker_child == .store) 1 else 0, }))); @@ -508,10 +466,10 @@ fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { }, })); - @memset(self.input[line .. 
line + marker_len], ' '); + @memset(line[0..marker_len], ' '); if (case.store_marker_child == .store) { _ = try self.appendNode(gpa, .{ .marker = .{ - .off = line, + .off = self.calcOffset(&line[0]), .len = utils.safeIntCast(Ast.StrLen, marker_len), } }); } @@ -527,7 +485,7 @@ fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { .paragraph, try self.appendNode(gpa, .{ .paragraph = .{ - .off = line, + .off = self.calcOffset(&line[0]), }, }), }; @@ -540,7 +498,7 @@ fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { try self.errors.append(gpa, .fromTagged(.{ .invalid_marker = .{ .idx = self.nextNodeIdx(), - .off = line, + .off = self.calcOffset(&line[0]), }, })); @@ -548,7 +506,7 @@ fn parseBlockStart(self: *AstGen, gpa: Allocator, line: Ast.StrOffset) !struct { .paragraph, try self.appendNode(gpa, .{ .paragraph = .{ - .off = line, + .off = self.calcOffset(&line[0]), }, }), };