commit 94b5af2e6db938f724637a23e0ed1d81f2294741
parent de923c7630cdc9d438a72ba2e423e49b1f041e01
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Sun, 18 May 2025 22:33:09 +0800
didn't fix all bugs
Diffstat:
4 files changed, 184 insertions(+), 37 deletions(-)
diff --git a/src/Ast.zig b/src/Ast.zig
@@ -271,6 +271,7 @@ fn AstRenderer(Writer: type) type {
try str.escapeStringForDoubleQuotedString(
renderer.writer,
renderer.input[data.off .. data.off + data.len],
+ .padded,
);
try renderer.writer.writeByte('"');
try renderer.writer.writeByte('\n');
diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -175,6 +175,8 @@ pub fn parse(
const ParsingContext = enum { block_context, inline_context };
fn parseRoot(self: *AstGen) !void {
+ const tracy_frame = tracy.trace(@src());
+ defer tracy_frame.end();
const root = try self.appendNode(.{ .document = .{} });
assert(root == .root);
assert(self.input.ptr == self.cursor.ptr);
@@ -211,6 +213,8 @@ fn parseColumn(
OutOfErrorIdx,
OutOfMemory,
}!Column {
+ const tracy_frame = tracy.trace(@src());
+ defer tracy_frame.end();
return self.parseColumnInline(parent_idx, parent_col, cursor_col, parsing_context);
}
@@ -245,8 +249,6 @@ inline fn parseColumnInline(
cursor_col: Column,
comptime parsing_context: ParsingContext,
) !Column {
- const tracy_frame = tracy.trace(@src());
- defer tracy_frame.end();
assert(cursor_col == 0 or parent_col < cursor_col);
// Used for "indentation correction".
@@ -299,7 +301,8 @@ inline fn parseColumnInline(
// but I don't have anything better. (Btw, the original AstGen.zig uses the same recovery logic.)
parse_another_block: while (true) {
- assert(self.cursor.len > 0 and str.isNoneOf(self.cursor[0], " \t\r\n"));
+ assert(self.cursor.len > 0);
+ assert(str.isNoneOf(self.cursor[0], " \t\r\n"));
// Will be set by the recursive call (if any),
// to indicate how much indentation was previously checked.
@@ -311,7 +314,9 @@ inline fn parseColumnInline(
inline '-', '.', ':', '#' => |m| {
const marker_len = try self.findMarkerEnd(m);
if (m == '-') {
- const potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable;
+ var potential_task_item = str.indexOfNone(self.cursor[marker_len..], "[ ]xX") orelse unreachable;
+ while (potential_task_item >= 3 and self.cursor[marker_len + potential_task_item - 1] == ' ')
+ potential_task_item -= 1;
if (potential_task_item >= 3 and
self.cursor[marker_len + potential_task_item - 1] == ']' and
self.cursor[marker_len + potential_task_item - 3] == '[' and
@@ -405,16 +410,35 @@ inline fn parseColumnInline(
},
'*' => {
- if (std.mem.eql(u8, self.cursor[0..3], "***") and
- self.cursor[
- 3 + (str.indexOfNone(
- self.cursor[3..],
- " \t",
- ) orelse unreachable)
- ] == '\n')
- {
- _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3);
- break :finish_parsing_block;
+ if (std.mem.eql(u8, self.cursor[0..3], "***")) {
+ const after_stars = self.cursor[3..];
+ const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable;
+ if (after_stars[skip_whitespace_idx] == '\n') {
+ _ = try self.appendLeafNodeAtCursor(parent_idx, .thematic_break, 3);
+ self.advanceCursor(3 + skip_whitespace_idx + 1);
+ while (true) {
+ if (self.cursor.len == 0) return 0;
+
+ const next_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
+ if (self.cursor[next_idx] == '\n') {
+ self.advanceCursor(next_idx + 1);
+ continue;
+ }
+
+ const verified_indentation_idx = std.mem.indexOfDiff(
+ u8,
+ self.cursor,
+ self.indentation[0..block_col],
+ ) orelse unreachable;
+ if (verified_indentation_idx == block_col) {
+ indentation_idx = @intCast(next_idx);
+ break :finish_parsing_block;
+ } else {
+ indentation_idx = @intCast(verified_indentation_idx);
+ break :finish_parsing_block;
+ }
+ }
+ }
}
},
@@ -474,19 +498,6 @@ inline fn parseColumnInline(
if (self.cursor.len == 0) return 0;
assert(self.cursor[indentation_idx] != '\n');
- // if (self.cursor[indentation_idx] == '\n') {
- // std.debug.assert(false); // TODO: Check this logic
- // // Empty line
- // self.advanceCursor(indentation_idx + 1);
- // // NOTE: null is impossible because input is guaranteed to end in newlines.
- // indentation_idx = if (str.indexOfNone(self.cursor, " \t") orelse unreachable) |idx|
- // // Explicitly check for indentation length because malicious input is possible
- // if (idx > std.math.maxInt(Column))
- // return error.IndentationTooLong
- // else
- // @intCast(idx);
- // continue :finding_block;
- // } else
if (indentation_idx > cursor_col) {
// Matches us but there's too much whitespace.
// Fix the indentation.
@@ -738,8 +749,8 @@ fn parseParagraph(
if (self.cursor.len == 0) return 0;
const indentation_idx = str.indexOfNone(self.cursor, " \t") orelse unreachable;
- // block line found, exit
if (str.isAnyOf(self.cursor[indentation_idx], "-.:+>#;")) {
+ // block line found, exit
const verified_indentation_idx = std.mem.indexOfDiff(
u8,
self.cursor,
@@ -751,6 +762,23 @@ fn parseParagraph(
return @intCast(verified_indentation_idx);
}
}
+ if (self.cursor[indentation_idx] == '*') {
+ const after_stars = self.cursor[3..];
+ const skip_whitespace_idx = str.indexOfNone(after_stars, " \t") orelse unreachable;
+ if (after_stars[skip_whitespace_idx] == '\n') {
+ // block line found, exit
+ const verified_indentation_idx = std.mem.indexOfDiff(
+ u8,
+ self.cursor,
+ self.indentation[0..block_col],
+ ) orelse unreachable;
+ if (verified_indentation_idx == block_col) {
+ return @intCast(indentation_idx);
+ } else {
+ return @intCast(verified_indentation_idx);
+ }
+ }
+ }
// empty line found, consume to next nonwhitespace and exit
if (self.cursor[indentation_idx] == '\n') {
self.advanceCursor(indentation_idx + 1);
diff --git a/src/str.zig b/src/str.zig
@@ -97,25 +97,49 @@ pub fn lastIndexOfNone(s: Str, cs: Charset) ?usize {
return mem.lastIndexOfNone(u8, s, cs);
}
+pub const PaddingOption = enum { padded, not_padded };
+
pub fn escapeStringForDoubleQuotedString(
writer: anytype,
slice: []const u8,
+ comptime has_padding: PaddingOption,
) !void {
- return escapeString(writer, slice, .double_quoted_string);
+ return escapeString(writer, slice, has_padding, .double_quoted_string);
}
pub fn escapeStringForSingleQuotedString(
writer: anytype,
slice: []const u8,
+ comptime has_padding: PaddingOption,
) !void {
- return escapeString(writer, slice, .double_quoted_string);
+ return escapeString(writer, slice, has_padding, .single_quoted_string); // single-quote variant must escape `'`, not `"`
}
-pub fn fmtEscapes(bytes: []const u8) std.fmt.Formatter(stringEscapeFormatter) {
+pub fn fmtEscapes(
+ bytes: []const u8,
+ comptime has_padding: PaddingOption,
+) std.fmt.Formatter(if (has_padding == .padded)
+ stringEscapeFormatterWithPadding
+else
+ stringEscapeFormatterWithoutPadding) {
return .{ .data = bytes };
}
-pub fn stringEscapeFormatter(
+pub fn stringEscapeFormatterWithPadding(
+ bytes: []const u8,
+ comptime f: []const u8,
+ options: std.fmt.FormatOptions,
+ writer: anytype,
+) !void {
+ _ = options;
+ if (f.len == 1 and f[0] == '\'') {
+ try escapeString(writer, bytes, .padded, .single_quoted_string);
+ } else {
+ try escapeString(writer, bytes, .padded, .double_quoted_string);
+ }
+}
+
+pub fn stringEscapeFormatterWithoutPadding(
bytes: []const u8,
comptime f: []const u8,
options: std.fmt.FormatOptions,
@@ -123,15 +147,16 @@ pub fn stringEscapeFormatter(
) !void {
_ = options;
if (f.len == 1 and f[0] == '\'') {
- try escapeString(writer, bytes, .single_quoted_string);
+ try escapeString(writer, bytes, .not_padded, .single_quoted_string);
} else {
- try escapeString(writer, bytes, .double_quoted_string);
+ try escapeString(writer, bytes, .not_padded, .double_quoted_string);
}
}
pub fn escapeString(
writer: anytype,
slice: []const u8,
+ comptime has_padding: PaddingOption,
comptime escape_for: enum { double_quoted_string, single_quoted_string },
) !void {
const tracy_frame = tracy.trace(@src());
@@ -218,6 +243,87 @@ pub fn escapeString(
i += block_len;
}
}
+
+ if (has_padding == .padded) {
+ if (i == slice.len) return;
+ if (slice.len - i >= block_len) {
+ const load: Block = slice[i .. i + block_len][0..block_len].*;
+
+ const has_low_ctrl = load < @as(Block, @splat(0x20));
+ const has_high_ctrl = load >= @as(Block, @splat(0x7f));
+ const has_quote = load == @as(Block, @splat(quote));
+ const has_backslash = load == @as(Block, @splat('\\'));
+
+ // If any character is escaped, do slow path
+ if (@reduce(.Or, has_low_ctrl) or
+ @reduce(.Or, has_high_ctrl) or
+ @reduce(.Or, has_quote) or
+ @reduce(.Or, has_backslash))
+ {
+ // uncomment if you want to inspect the assembly, not that it helps much
+ // @branchHint(.cold);
+ // adapted from std.zig.stringEscape
+ for (slice[i .. i + block_len]) |byte| {
+ switch (byte) {
+ '\t' => try writer.writeAll("\\t"),
+ '\r' => try writer.writeAll("\\r"),
+ '\n' => try writer.writeAll("\\n"),
+ quote => try writer.writeAll(escaped_quote),
+ '\\' => try writer.writeAll("\\\\"),
+ else => if (byte < 0x20 or byte >= 0x7f) {
+ try writer.writeAll("\\x");
+ try std.fmt.formatInt(
+ byte,
+ 16,
+ .lower,
+ .{ .width = 2, .fill = '0' },
+ writer,
+ );
+ } else try writer.writeByte(byte),
+ }
+ }
+ } else {
+ try writer.writeAll(slice[i .. i + block_len]);
+ }
+
+ i += block_len;
+ }
+
+ if (i == slice.len) return;
+
+ {
+ const load_masks: [block_len]Block = comptime blk: {
+ var masks: []const Block = &.{};
+ var mask: [block_len]u8 = @splat(0x00);
+ for (0..block_len) |mask_i| {
+ mask[mask_i] = 0xff;
+ masks = masks ++ .{@as(Block, mask)};
+ }
+ break :blk masks[0..block_len].*;
+ };
+
+ const load: Block =
+ (slice.ptr[i .. i + block_len][0..block_len].* -%
+ @as(Block, @splat(0x20))) &
+ load_masks[slice.len - 1 - i];
+
+ const has_ctrl = load >= @as(Block, @splat(0x7f - 0x20));
+ const has_quote = load == @as(Block, @splat(quote - 0x20));
+ const has_backslash = load == @as(Block, @splat('\\' - 0x20));
+
+ // If any character is escaped, do slow path
+ if (@reduce(.Or, has_ctrl) or
+ @reduce(.Or, has_quote) or
+ @reduce(.Or, has_backslash))
+ {
+ // Fall through to the non-simd for loop
+ } else {
+ // Fast path, just write the thing
+ try writer.writeAll(slice[i..]);
+ return;
+ }
+ }
+ }
}
}
diff --git a/src/test/test.zig b/src/test/test.zig
@@ -1,5 +1,6 @@
const std = @import("std");
const parse = @import("../AstGen.zig").parse;
+const parse2 = @import("../AstGen2.zig").parse;
const Ast = @import("../Ast.zig");
const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{});
@@ -8,11 +9,22 @@ const ArenaAllocator = std.heap.ArenaAllocator;
fn testParse(input: []const u8, expected: []const u8) !void {
var arena: ArenaAllocator = .init(std.testing.allocator);
defer arena.deinit();
- const ast = try parse(std.testing.allocator, arena.allocator(), input);
+
+ const safe_input = try arena.allocator().alloc(u8, input.len + 128);
+ @memcpy(safe_input[0..input.len], input);
+ @memset(safe_input[input.len..], '\n');
+
+ const ast = try parse(std.testing.allocator, arena.allocator(), safe_input);
var ast_render: std.ArrayListUnmanaged(u8) = .empty;
defer ast_render.deinit(std.testing.allocator);
- try ast.renderAst(ast_render.writer(std.testing.allocator), input);
+ try ast.renderAst(ast_render.writer(std.testing.allocator), safe_input);
try std.testing.expectEqualStrings(expected, ast_render.items);
+
+ const ast2 = try parse2(std.testing.allocator, arena.allocator(), safe_input);
+ var ast2_render: std.ArrayListUnmanaged(u8) = .empty;
+ defer ast2_render.deinit(std.testing.allocator);
+ try ast2.renderAst(ast2_render.writer(std.testing.allocator), safe_input);
+ try std.testing.expectEqualStrings(expected, ast2_render.items);
}
test "Empty" {
@@ -362,7 +374,7 @@ test "Empty line in heading" {
\\ .text
\\ "heading"
\\ .space_text
- \\ error .unexpected_block_in_inline_context
+ \\ .error .unexpected_block_in_inline_context
\\ "text"
\\ .paragraph
\\ .text