mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown

commit 627faa642d6d6a91925ad705440d210d25e6c897
parent 6a848e2833660bccc8ac9343e66765825151cb93
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Wed, 21 May 2025 01:43:09 +0800

Make AstGen2 and AstGen3 faster and now they're the same speed

Diffstat:
 M src/AstGen2.zig         | 25 ++++++++++++++++++-------
 M src/AstGen3.zig         | 25 ++++++++++++++++++-------
 M src/main.zig            | 50 +++++++++++++++++++++++++++++---------------------
 M src/padded_str_impl.zig | 36 +++++++++++++++++++++++++-----------
4 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -55,7 +55,18 @@ fn nextNodeIdx(self: AstGen) Node.Idx {
     @setRuntimeSafety(true);
     return @enumFromInt(self.nodes.items.len);
 }
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these are worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
     if (self.nodes.items.len > std.math.maxInt(
         @typeInfo(Node.Idx).@"enum".tag_type,
     )) return error.OutOfNodeIdx;
@@ -63,7 +74,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
     try self.nodes.append(self.gpa, .fromTagged(node));
     return @enumFromInt(idx);
 }
-fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
+inline fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
     self.getNode(parent_idx).incrementNumChildren();
     return try self.appendNode(
         @unionInit(
@@ -73,7 +84,7 @@ fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime con
         ),
     );
 }
-fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
+inline fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
     self.getNode(parent_idx).incrementNumChildren();
     return try self.appendNode(
         @unionInit(
@@ -83,13 +94,13 @@ fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_nod
         ),
     );
 }
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
     if (self.errors.items.len > std.math.maxInt(
         @typeInfo(Error.Idx).@"enum".tag_type,
     )) return error.OutOfErrorIdx;
     try self.errors.append(self.gpa, .fromTagged(err));
 }
-fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
     try self.appendError(
         @unionInit(
             Error.Tagged,
@@ -98,7 +109,7 @@ fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Id
         ),
     );
 }
-fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
     try self.appendError(
         @unionInit(
             Error.Tagged,
@@ -180,7 +191,7 @@ pub fn parse(
 
 const ParsingContext = enum { block_context, inline_context };
 
-noinline fn parseRoot(self: *AstGen) !void {
+fn parseRoot(self: *AstGen) !void {
     const tracy_frame = tracy.trace(@src());
     defer tracy_frame.end();
     const root = try self.appendNode(.{ .document = .{} });
diff --git a/src/AstGen3.zig b/src/AstGen3.zig
@@ -45,7 +45,18 @@ fn nextNodeIdx(self: *const AstGen) Node.Idx {
     @setRuntimeSafety(true);
     return @enumFromInt(self.nodes.items.len);
 }
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these are worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
     if (self.nodes.items.len > std.math.maxInt(
         @typeInfo(Node.Idx).@"enum".tag_type,
     )) return error.OutOfNodeIdx;
@@ -53,7 +64,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
     try self.nodes.append(self.gpa, .fromTagged(node));
     return @enumFromInt(idx);
 }
-fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
+inline fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
     self.getNode(parent_idx).incrementNumChildren();
     return try self.appendNode(
         @unionInit(
@@ -63,7 +74,7 @@ fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_n
         ),
     );
 }
-fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
+inline fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
     self.getNode(parent_idx).incrementNumChildren();
     return try self.appendNode(
         @unionInit(
@@ -73,13 +84,13 @@ fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: N
         ),
     );
 }
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
     if (self.errors.items.len > std.math.maxInt(
         @typeInfo(Error.Idx).@"enum".tag_type,
     )) return error.OutOfErrorIdx;
     try self.errors.append(self.gpa, .fromTagged(err));
 }
-fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
+inline fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
     try self.appendError(
         @unionInit(
             Error.Tagged,
@@ -88,7 +99,7 @@ fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr:
         ),
     );
 }
-fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
     try self.appendError(
         @unionInit(
             Error.Tagged,
@@ -200,7 +211,7 @@ fn parseColumn(
 /// unmarked paragraphs are interrupted by block elements and
 /// do not consume the whole column, whereas here, we ignore
 /// block special characters.
-inline fn parseInlineColumn(
+fn parseInlineColumn(
     self: *AstGen,
     parent_idx: Node.Idx,
 ) !void {
diff --git a/src/main.zig b/src/main.zig
@@ -3,7 +3,6 @@ const std = @import("std");
 const tracy = @import("tracy");
 const mymarkdown = @import("mymarkdown");
 
-const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{});
 const ArenaAllocator = std.heap.ArenaAllocator;
 
 fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8) {
@@ -25,12 +24,21 @@ fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8
     return al;
 }
 
+var debug_allocator: std.heap.DebugAllocator(.{}) = .init;
+
 pub fn main() !void {
-    var gpa: GeneralPurposeAllocator = .{};
-    var arena: ArenaAllocator = .init(gpa.allocator());
-    defer arena.deinit();
+    const gpa, const is_debug = gpa: {
+        break :gpa switch (@import("builtin").mode) {
+            .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
+            .ReleaseFast, .ReleaseSmall => .{ std.heap.page_allocator, false },
+        };
+    };
+    defer _ = if (is_debug) debug_allocator.deinit();
+    var arena_instance: ArenaAllocator = .init(gpa);
+    defer arena_instance.deinit();
+    const arena = arena_instance.allocator();
 
-    const args = try std.process.argsAlloc(arena.allocator());
+    const args = try std.process.argsAlloc(arena);
     const run1, const run2, const run3 = blk: {
         var run1, var run2, var run3 = .{ false, false, false };
         for (args) |arg| {
@@ -44,7 +52,7 @@ pub fn main() !void {
         break :blk .{ run1, run2, run3 };
     };
 
-    var input_arraylist = try readInput(gpa.allocator(), arena.allocator());
+    var input_arraylist = try readInput(gpa, arena);
     defer input_arraylist.deinit();
     const input = input_arraylist.items;
 
@@ -54,8 +62,8 @@ pub fn main() !void {
             const tracy_frame = tracy.namedFrame("parse 1");
             defer tracy_frame.end();
             break :blk try mymarkdown.parse(
-                gpa.allocator(),
-                arena.allocator(),
+                gpa,
+                arena,
                 input,
             );
         });
@@ -64,8 +72,8 @@ pub fn main() !void {
            const tracy_frame = tracy.namedFrame("parse 2");
            defer tracy_frame.end();
            break :blk try mymarkdown.parse2(
-                gpa.allocator(),
-                arena.allocator(),
+                gpa,
+                arena,
                 input,
             );
         });
@@ -74,8 +82,8 @@ pub fn main() !void {
             const tracy_frame = tracy.namedFrame("parse 3");
             defer tracy_frame.end();
             break :blk try mymarkdown.parse3(
-                gpa.allocator(),
-                arena.allocator(),
+                gpa,
+                arena,
                 input,
             );
         });
@@ -86,8 +94,8 @@ pub fn main() !void {
         const tracy_frame = tracy.namedFrame("parse 1");
         defer tracy_frame.end();
         break :blk try mymarkdown.parse(
-            gpa.allocator(),
-            arena.allocator(),
+            gpa,
+            arena,
             input,
         );
     };
@@ -95,8 +103,8 @@ pub fn main() !void {
         const tracy_frame = tracy.namedFrame("parse 2");
        defer tracy_frame.end();
         break :blk try mymarkdown.parse2(
-            gpa.allocator(),
-            arena.allocator(),
+            gpa,
+            arena,
             input,
         );
     };
@@ -104,13 +112,13 @@ pub fn main() !void {
         const tracy_frame = tracy.namedFrame("parse 3");
         defer tracy_frame.end();
         break :blk try mymarkdown.parse3(
-            gpa.allocator(),
-            arena.allocator(),
+            gpa,
+            arena,
             input,
         );
     };
 
-    var render_arraylist1: std.ArrayList(u8) = .init(gpa.allocator());
+    var render_arraylist1: std.ArrayList(u8) = .init(gpa);
     defer render_arraylist1.deinit();
     {
         std.debug.print("Rendering 1\n", .{});
@@ -118,7 +126,7 @@ pub fn main() !void {
         defer tracy_frame.end();
         _ = try ast.renderAst(render_arraylist1.writer(), input);
     }
-    var render_arraylist2: std.ArrayList(u8) = .init(gpa.allocator());
+    var render_arraylist2: std.ArrayList(u8) = .init(gpa);
     defer render_arraylist2.deinit();
     {
         std.debug.print("Rendering 2\n", .{});
@@ -126,7 +134,7 @@ pub fn main() !void {
         defer tracy_frame.end();
         _ = try ast2.renderAst(render_arraylist2.writer(), input);
     }
-    var render_arraylist3: std.ArrayList(u8) = .init(gpa.allocator());
+    var render_arraylist3: std.ArrayList(u8) = .init(gpa);
     defer render_arraylist3.deinit();
     {
         std.debug.print("Rendering 3\n", .{});
diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig
@@ -129,22 +129,30 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
             return other._ptr - ptr._ptr;
         }
 
-        fn simd_maybe(
+        const SimdOptions = struct {
+            /// Whether to disable SIMD entirely
+            disable: bool = false,
             /// How many loads to do in parallel in 1 loop iteration.
             /// Consider increasing if your input distribution contains long strings.
             /// Note that this may not have an effect if PADDING is not large enough.
-            comptime MAX_UNROLL: usize,
-        ) ?type {
+            max_unroll: ?usize = null,
+            /// How many bytes to read in 1 loop iteration.
+            /// Gives more fine grained control than max_unroll.
+            max_block_len: ?usize = null,
+        };
+
+        fn simd_maybe(comptime options: SimdOptions) ?type {
             if (switch (@import("builtin").zig_backend) {
                 .stage2_llvm, .stage2_c => true,
                 else => false,
             } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
-            ) {
+            and !options.disable) {
                 if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
                     return struct {
                         const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
                         const BLOCK_LEN = @min(
-                            MAX_UNROLL * BLOCK_LEN__,
+                            options.max_block_len orelse PADDING,
+                            (options.max_unroll orelse 1) * BLOCK_LEN__,
                             @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
                         );
                         const CharBlock = @Vector(BLOCK_LEN, u8);
@@ -170,17 +178,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
         pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
             if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
                 @compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
-            return ptr.indexOfNoneUnsafe(charset, .{ .max_unroll = 3 });
+            return ptr.indexOfNoneUnsafe(
+                charset,
+                .{ .disable = true },
+            );
         }
 
         pub fn indexOfNoneUnsafe(
             ptr: @This(),
             comptime charset: []const u8,
-            comptime opts: struct { max_unroll: usize = 1 },
+            comptime opts: SimdOptions,
         ) usize {
             var it = ptr._ptr;
             if (!@inComptime()) {
-                if (simd_maybe(opts.max_unroll)) |simd| {
+                if (simd_maybe(opts)) |simd| {
                     const splats: [charset.len]simd.CharBlock = comptime blk: {
                         var splats_: []const simd.CharBlock = &.{};
                         for (charset) |c| {
@@ -222,17 +233,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
         pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
             if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
                 @compileError("charset must contain newline, otherwise this op is potentially unsafe.");
-            return ptr.indexOfAnyUnsafe(charset, .{ .max_unroll = 1 });
+            return ptr.indexOfAnyUnsafe(
+                charset,
+                .{ .max_unroll = 2 },
+            );
        }
 
         pub fn indexOfAnyUnsafe(
             ptr: @This(),
             comptime charset: []const u8,
-            comptime opts: struct { max_unroll: usize = 1 },
+            comptime opts: SimdOptions,
         ) usize {
             var it = ptr._ptr;
             if (!@inComptime()) {
-                if (simd_maybe(opts.max_unroll)) |simd| {
+                if (simd_maybe(opts)) |simd| {
                     const splats: [charset.len]simd.CharBlock = comptime blk: {
                         var splats_: []const simd.CharBlock = &.{};
                         for (charset) |c| {
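
A minimal, self-contained sketch of the pattern the new comments describe: Node.Tagged is an ordinary tagged union, the stored Node is a compact packed form, and the append helper is marked inline so that fromTagged can construct the packed value directly in the list's new slot rather than copying through a temporary. The Node / Node.Tagged layouts and the free-standing appendNode signature below are invented placeholders for illustration, not the repository's actual definitions; the allocator setup simply reuses the DebugAllocator pattern this commit adds to main.zig.

const std = @import("std");

// Hypothetical stand-ins for the repository's Node / Node.Tagged pair:
// the tagged form is a normal tagged union, the stored form is a packed
// struct, so fromTagged has to repack the payload on every append.
const Node = packed struct {
    tag: u8,
    payload: u24,

    pub const Tagged = union(enum) {
        document: void,
        text: u24,
    };

    pub fn fromTagged(t: Tagged) Node {
        return switch (t) {
            .document => .{ .tag = 0, .payload = 0 },
            .text => |len| .{ .tag = 1, .payload = len },
        };
    }
};

// Marking the helper (and, in the real code, everything up to the callsite)
// inline is what lets the compiler build the packed Node directly in the
// slot appended to the list instead of materializing and copying a temporary.
inline fn appendNode(
    gpa: std.mem.Allocator,
    nodes: *std.ArrayListUnmanaged(Node),
    node: Node.Tagged,
) !usize {
    const idx = nodes.items.len;
    try nodes.append(gpa, Node.fromTagged(node));
    return idx;
}

pub fn main() !void {
    // Same DebugAllocator setup as the new main.zig, reduced to the
    // debug-only case for brevity.
    var debug_allocator: std.heap.DebugAllocator(.{}) = .init;
    defer _ = debug_allocator.deinit();
    const gpa = debug_allocator.allocator();

    var nodes: std.ArrayListUnmanaged(Node) = .{};
    defer nodes.deinit(gpa);

    _ = try appendNode(gpa, &nodes, .{ .document = {} });
    _ = try appendNode(gpa, &nodes, .{ .text = 42 });
    std.debug.print("{d} nodes appended\n", .{nodes.items.len});
}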