commit 627faa642d6d6a91925ad705440d210d25e6c897
parent 6a848e2833660bccc8ac9343e66765825151cb93
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Wed, 21 May 2025 01:43:09 +0800
Make AstGen2 and AstGen3 faster and now they're the same speed
Diffstat:
4 files changed, 90 insertions(+), 46 deletions(-)
diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -55,7 +55,18 @@ fn nextNodeIdx(self: AstGen) Node.Idx {
@setRuntimeSafety(true);
return @enumFromInt(self.nodes.items.len);
}
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these is worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
if (self.nodes.items.len > std.math.maxInt(
@typeInfo(Node.Idx).@"enum".tag_type,
)) return error.OutOfNodeIdx;
@@ -63,7 +74,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
try self.nodes.append(self.gpa, .fromTagged(node));
return @enumFromInt(idx);
}
-fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
+inline fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -73,7 +84,7 @@ fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime con
),
);
}
-fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
+inline fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -83,13 +94,13 @@ fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_nod
),
);
}
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
if (self.errors.items.len > std.math.maxInt(
@typeInfo(Error.Idx).@"enum".tag_type,
)) return error.OutOfErrorIdx;
try self.errors.append(self.gpa, .fromTagged(err));
}
-fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -98,7 +109,7 @@ fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Id
),
);
}
-fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -180,7 +191,7 @@ pub fn parse(
const ParsingContext = enum { block_context, inline_context };
-noinline fn parseRoot(self: *AstGen) !void {
+fn parseRoot(self: *AstGen) !void {
const tracy_frame = tracy.trace(@src());
defer tracy_frame.end();
const root = try self.appendNode(.{ .document = .{} });
diff --git a/src/AstGen3.zig b/src/AstGen3.zig
@@ -45,7 +45,18 @@ fn nextNodeIdx(self: *const AstGen) Node.Idx {
@setRuntimeSafety(true);
return @enumFromInt(self.nodes.items.len);
}
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these is worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
if (self.nodes.items.len > std.math.maxInt(
@typeInfo(Node.Idx).@"enum".tag_type,
)) return error.OutOfNodeIdx;
@@ -53,7 +64,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
try self.nodes.append(self.gpa, .fromTagged(node));
return @enumFromInt(idx);
}
-fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
+inline fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -63,7 +74,7 @@ fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_n
),
);
}
-fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
+inline fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -73,13 +84,13 @@ fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: N
),
);
}
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
if (self.errors.items.len > std.math.maxInt(
@typeInfo(Error.Idx).@"enum".tag_type,
)) return error.OutOfErrorIdx;
try self.errors.append(self.gpa, .fromTagged(err));
}
-fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
+inline fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -88,7 +99,7 @@ fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr:
),
);
}
-fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -200,7 +211,7 @@ fn parseColumn(
/// unmarked paragraphs are interrupted by block elements and
/// do not consume the whole column, whereas here, we ignore
/// block special characters.
-inline fn parseInlineColumn(
+fn parseInlineColumn(
self: *AstGen,
parent_idx: Node.Idx,
) !void {
diff --git a/src/main.zig b/src/main.zig
@@ -3,7 +3,6 @@ const std = @import("std");
const tracy = @import("tracy");
const mymarkdown = @import("mymarkdown");
-const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{});
const ArenaAllocator = std.heap.ArenaAllocator;
fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8) {
@@ -25,12 +24,21 @@ fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8
return al;
}
+var debug_allocator: std.heap.DebugAllocator(.{}) = .init;
+
pub fn main() !void {
- var gpa: GeneralPurposeAllocator = .{};
- var arena: ArenaAllocator = .init(gpa.allocator());
- defer arena.deinit();
+ const gpa, const is_debug = gpa: {
+ break :gpa switch (@import("builtin").mode) {
+ .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
+ .ReleaseFast, .ReleaseSmall => .{ std.heap.page_allocator, false },
+ };
+ };
+ defer _ = if (is_debug) debug_allocator.deinit();
+ var arena_instance: ArenaAllocator = .init(gpa);
+ defer arena_instance.deinit();
+ const arena = arena_instance.allocator();
- const args = try std.process.argsAlloc(arena.allocator());
+ const args = try std.process.argsAlloc(arena);
const run1, const run2, const run3 = blk: {
var run1, var run2, var run3 = .{ false, false, false };
for (args) |arg| {
@@ -44,7 +52,7 @@ pub fn main() !void {
break :blk .{ run1, run2, run3 };
};
- var input_arraylist = try readInput(gpa.allocator(), arena.allocator());
+ var input_arraylist = try readInput(gpa, arena);
defer input_arraylist.deinit();
const input = input_arraylist.items;
@@ -54,8 +62,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 1");
defer tracy_frame.end();
break :blk try mymarkdown.parse(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -64,8 +72,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 2");
defer tracy_frame.end();
break :blk try mymarkdown.parse2(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -74,8 +82,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 3");
defer tracy_frame.end();
break :blk try mymarkdown.parse3(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -86,8 +94,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 1");
defer tracy_frame.end();
break :blk try mymarkdown.parse(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
@@ -95,8 +103,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 2");
defer tracy_frame.end();
break :blk try mymarkdown.parse2(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
@@ -104,13 +112,13 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 3");
defer tracy_frame.end();
break :blk try mymarkdown.parse3(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
- var render_arraylist1: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist1: std.ArrayList(u8) = .init(gpa);
defer render_arraylist1.deinit();
{
std.debug.print("Rendering 1\n", .{});
@@ -118,7 +126,7 @@ pub fn main() !void {
defer tracy_frame.end();
_ = try ast.renderAst(render_arraylist1.writer(), input);
}
- var render_arraylist2: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist2: std.ArrayList(u8) = .init(gpa);
defer render_arraylist2.deinit();
{
std.debug.print("Rendering 2\n", .{});
@@ -126,7 +134,7 @@ pub fn main() !void {
defer tracy_frame.end();
_ = try ast2.renderAst(render_arraylist2.writer(), input);
}
- var render_arraylist3: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist3: std.ArrayList(u8) = .init(gpa);
defer render_arraylist3.deinit();
{
std.debug.print("Rendering 3\n", .{});
diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig
@@ -129,22 +129,30 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
return other._ptr - ptr._ptr;
}
- fn simd_maybe(
+ const SimdOptions = struct {
+ /// Whether to disable SIMD entirely
+ disable: bool = false,
/// How many loads to do in parallel in 1 loop iteration.
/// Consider increasing if your input distribution contains long strings.
/// Note that this may not have an effect if PADDING is not large enough.
- comptime MAX_UNROLL: usize,
- ) ?type {
+ max_unroll: ?usize = null,
+ /// How many bytes to read in 1 loop iteration.
+ /// Gives more fine grained control than max_unroll.
+ max_block_len: ?usize = null,
+ };
+
+ fn simd_maybe(comptime options: SimdOptions) ?type {
if (switch (@import("builtin").zig_backend) {
.stage2_llvm, .stage2_c => true,
else => false,
} and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
- ) {
+ and !options.disable) {
if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
return struct {
const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
const BLOCK_LEN = @min(
- MAX_UNROLL * BLOCK_LEN__,
+ options.max_block_len orelse PADDING,
+ (options.max_unroll orelse 1) * BLOCK_LEN__,
@divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
);
const CharBlock = @Vector(BLOCK_LEN, u8);
@@ -170,17 +178,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
@compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
- return ptr.indexOfNoneUnsafe(charset, .{ .max_unroll = 3 });
+ return ptr.indexOfNoneUnsafe(
+ charset,
+ .{ .disable = true },
+ );
}
pub fn indexOfNoneUnsafe(
ptr: @This(),
comptime charset: []const u8,
- comptime opts: struct { max_unroll: usize = 1 },
+ comptime opts: SimdOptions,
) usize {
var it = ptr._ptr;
if (!@inComptime()) {
- if (simd_maybe(opts.max_unroll)) |simd| {
+ if (simd_maybe(opts)) |simd| {
const splats: [charset.len]simd.CharBlock = comptime blk: {
var splats_: []const simd.CharBlock = &.{};
for (charset) |c| {
@@ -222,17 +233,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
@compileError("charset must contain newline, otherwise this op is potentially unsafe.");
- return ptr.indexOfAnyUnsafe(charset, .{ .max_unroll = 1 });
+ return ptr.indexOfAnyUnsafe(
+ charset,
+ .{ .max_unroll = 2 },
+ );
}
pub fn indexOfAnyUnsafe(
ptr: @This(),
comptime charset: []const u8,
- comptime opts: struct { max_unroll: usize = 1 },
+ comptime opts: SimdOptions,
) usize {
var it = ptr._ptr;
if (!@inComptime()) {
- if (simd_maybe(opts.max_unroll)) |simd| {
+ if (simd_maybe(opts)) |simd| {
const splats: [charset.len]simd.CharBlock = comptime blk: {
var splats_: []const simd.CharBlock = &.{};
for (charset) |c| {