commit 627faa642d6d6a91925ad705440d210d25e6c897
parent 6a848e2833660bccc8ac9343e66765825151cb93
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Wed, 21 May 2025 01:43:09 +0800
Make AstGen2 and AstGen3 faster and now they're the same speed
Diffstat:
4 files changed, 90 insertions(+), 46 deletions(-)
diff --git a/src/AstGen2.zig b/src/AstGen2.zig
@@ -55,7 +55,18 @@ fn nextNodeIdx(self: AstGen) Node.Idx {
@setRuntimeSafety(true);
return @enumFromInt(self.nodes.items.len);
}
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these is worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
if (self.nodes.items.len > std.math.maxInt(
@typeInfo(Node.Idx).@"enum".tag_type,
)) return error.OutOfNodeIdx;
@@ -63,7 +74,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
try self.nodes.append(self.gpa, .fromTagged(node));
return @enumFromInt(idx);
}
-fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
+inline fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -73,7 +84,7 @@ fn appendContainerNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime con
),
);
}
-fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
+inline fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, len: StrLen) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -83,13 +94,13 @@ fn appendLeafNodeAtCursor(self: *AstGen, parent_idx: Node.Idx, comptime leaf_nod
),
);
}
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
if (self.errors.items.len > std.math.maxInt(
@typeInfo(Error.Idx).@"enum".tag_type,
)) return error.OutOfErrorIdx;
try self.errors.append(self.gpa, .fromTagged(err));
}
-fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -98,7 +109,7 @@ fn appendPointErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Id
),
);
}
-fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeErrorAtCursor(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -180,7 +191,7 @@ pub fn parse(
const ParsingContext = enum { block_context, inline_context };
-noinline fn parseRoot(self: *AstGen) !void {
+fn parseRoot(self: *AstGen) !void {
const tracy_frame = tracy.trace(@src());
defer tracy_frame.end();
const root = try self.appendNode(.{ .document = .{} });
diff --git a/src/AstGen3.zig b/src/AstGen3.zig
@@ -45,7 +45,18 @@ fn nextNodeIdx(self: *const AstGen) Node.Idx {
@setRuntimeSafety(true);
return @enumFromInt(self.nodes.items.len);
}
-fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
+
+// These need manual inlining for some reason.
+//
+// LLVM doesn't seem to think that inlining these is worth it, but LLVM is wrong.
+// Because this constructs the Node using .fromTagged, and Node.Tagged and Node
+// have different memory representations, the only way to construct in place and
+// elide copies is for every function from this one to the callsite to be inlined.
+//
+// The same applies for Error / Error.Tagged below, but the impact is less severe
+// as appending Errors is a much less common operation. Nevertheless, we inline it
+// despite not having any data to back it up, because I have vibes that it should be faster.
+inline fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
if (self.nodes.items.len > std.math.maxInt(
@typeInfo(Node.Idx).@"enum".tag_type,
)) return error.OutOfNodeIdx;
@@ -53,7 +64,7 @@ fn appendNode(self: *AstGen, node: Node.Tagged) !Node.Idx {
try self.nodes.append(self.gpa, .fromTagged(node));
return @enumFromInt(idx);
}
-fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
+inline fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_node_tag: Node.Tag, ptr: PaddedMany) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -63,7 +74,7 @@ fn appendContainerNode(self: *AstGen, parent_idx: Node.Idx, comptime container_n
),
);
}
-fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
+inline fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) !Node.Idx {
self.getNode(parent_idx).incrementNumChildren();
return try self.appendNode(
@unionInit(
@@ -73,13 +84,13 @@ fn appendLeafNode(self: *AstGen, parent_idx: Node.Idx, comptime leaf_node_tag: N
),
);
}
-fn appendError(self: *AstGen, err: Error.Tagged) !void {
+inline fn appendError(self: *AstGen, err: Error.Tagged) !void {
if (self.errors.items.len > std.math.maxInt(
@typeInfo(Error.Idx).@"enum".tag_type,
)) return error.OutOfErrorIdx;
try self.errors.append(self.gpa, .fromTagged(err));
}
-fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
+inline fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr: PaddedMany) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -88,7 +99,7 @@ fn appendPointError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx, ptr:
),
);
}
-fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
+inline fn appendNodeError(self: *AstGen, comptime tag: Error.Tag, idx: Node.Idx) !void {
try self.appendError(
@unionInit(
Error.Tagged,
@@ -200,7 +211,7 @@ fn parseColumn(
/// unmarked paragraphs are interrupted by block elements and
/// do not consume the whole column, whereas here, we ignore
/// block special characters.
-inline fn parseInlineColumn(
+fn parseInlineColumn(
self: *AstGen,
parent_idx: Node.Idx,
) !void {
diff --git a/src/main.zig b/src/main.zig
@@ -3,7 +3,6 @@ const std = @import("std");
const tracy = @import("tracy");
const mymarkdown = @import("mymarkdown");
-const GeneralPurposeAllocator = std.heap.GeneralPurposeAllocator(.{});
const ArenaAllocator = std.heap.ArenaAllocator;
fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8) {
@@ -25,12 +24,21 @@ fn readInput(gpa: std.mem.Allocator, arena: std.mem.Allocator) !std.ArrayList(u8
return al;
}
+var debug_allocator: std.heap.DebugAllocator(.{}) = .init;
+
pub fn main() !void {
- var gpa: GeneralPurposeAllocator = .{};
- var arena: ArenaAllocator = .init(gpa.allocator());
- defer arena.deinit();
+ const gpa, const is_debug = gpa: {
+ break :gpa switch (@import("builtin").mode) {
+ .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
+ .ReleaseFast, .ReleaseSmall => .{ std.heap.page_allocator, false },
+ };
+ };
+ defer _ = if (is_debug) debug_allocator.deinit();
+ var arena_instance: ArenaAllocator = .init(gpa);
+ defer arena_instance.deinit();
+ const arena = arena_instance.allocator();
- const args = try std.process.argsAlloc(arena.allocator());
+ const args = try std.process.argsAlloc(arena);
const run1, const run2, const run3 = blk: {
var run1, var run2, var run3 = .{ false, false, false };
for (args) |arg| {
@@ -44,7 +52,7 @@ pub fn main() !void {
break :blk .{ run1, run2, run3 };
};
- var input_arraylist = try readInput(gpa.allocator(), arena.allocator());
+ var input_arraylist = try readInput(gpa, arena);
defer input_arraylist.deinit();
const input = input_arraylist.items;
@@ -54,8 +62,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 1");
defer tracy_frame.end();
break :blk try mymarkdown.parse(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -64,8 +72,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 2");
defer tracy_frame.end();
break :blk try mymarkdown.parse2(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -74,8 +82,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 3");
defer tracy_frame.end();
break :blk try mymarkdown.parse3(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
});
@@ -86,8 +94,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 1");
defer tracy_frame.end();
break :blk try mymarkdown.parse(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
@@ -95,8 +103,8 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 2");
defer tracy_frame.end();
break :blk try mymarkdown.parse2(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
@@ -104,13 +112,13 @@ pub fn main() !void {
const tracy_frame = tracy.namedFrame("parse 3");
defer tracy_frame.end();
break :blk try mymarkdown.parse3(
- gpa.allocator(),
- arena.allocator(),
+ gpa,
+ arena,
input,
);
};
- var render_arraylist1: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist1: std.ArrayList(u8) = .init(gpa);
defer render_arraylist1.deinit();
{
std.debug.print("Rendering 1\n", .{});
@@ -118,7 +126,7 @@ pub fn main() !void {
defer tracy_frame.end();
_ = try ast.renderAst(render_arraylist1.writer(), input);
}
- var render_arraylist2: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist2: std.ArrayList(u8) = .init(gpa);
defer render_arraylist2.deinit();
{
std.debug.print("Rendering 2\n", .{});
@@ -126,7 +134,7 @@ pub fn main() !void {
defer tracy_frame.end();
_ = try ast2.renderAst(render_arraylist2.writer(), input);
}
- var render_arraylist3: std.ArrayList(u8) = .init(gpa.allocator());
+ var render_arraylist3: std.ArrayList(u8) = .init(gpa);
defer render_arraylist3.deinit();
{
std.debug.print("Rendering 3\n", .{});
diff --git a/src/padded_str_impl.zig b/src/padded_str_impl.zig
@@ -129,22 +129,30 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
return other._ptr - ptr._ptr;
}
- fn simd_maybe(
+ const SimdOptions = struct {
+ /// Whether to disable SIMD entirely
+ disable: bool = false,
/// How many loads to do in parallel in 1 loop iteration.
/// Consider increasing if your input distribution contains long strings.
/// Note that this may not have an effect if PADDING is not large enough.
- comptime MAX_UNROLL: usize,
- ) ?type {
+ max_unroll: ?usize = null,
+ /// How many bytes to read in 1 loop iteration.
+ /// Gives more fine grained control than max_unroll.
+ max_block_len: ?usize = null,
+ };
+
+ fn simd_maybe(comptime options: SimdOptions) ?type {
if (switch (@import("builtin").zig_backend) {
.stage2_llvm, .stage2_c => true,
else => false,
} and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
- ) {
+ and !options.disable) {
if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
return struct {
const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
const BLOCK_LEN = @min(
- MAX_UNROLL * BLOCK_LEN__,
+ options.max_block_len orelse PADDING,
+ (options.max_unroll orelse 1) * BLOCK_LEN__,
@divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
);
const CharBlock = @Vector(BLOCK_LEN, u8);
@@ -170,17 +178,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
@compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
- return ptr.indexOfNoneUnsafe(charset, .{ .max_unroll = 3 });
+ return ptr.indexOfNoneUnsafe(
+ charset,
+ .{ .disable = true },
+ );
}
pub fn indexOfNoneUnsafe(
ptr: @This(),
comptime charset: []const u8,
- comptime opts: struct { max_unroll: usize = 1 },
+ comptime opts: SimdOptions,
) usize {
var it = ptr._ptr;
if (!@inComptime()) {
- if (simd_maybe(opts.max_unroll)) |simd| {
+ if (simd_maybe(opts)) |simd| {
const splats: [charset.len]simd.CharBlock = comptime blk: {
var splats_: []const simd.CharBlock = &.{};
for (charset) |c| {
@@ -222,17 +233,20 @@ pub fn PaddedMany(comptime PADDING_: usize) type {
pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
@compileError("charset must contain newline, otherwise this op is potentially unsafe.");
- return ptr.indexOfAnyUnsafe(charset, .{ .max_unroll = 1 });
+ return ptr.indexOfAnyUnsafe(
+ charset,
+ .{ .max_unroll = 2 },
+ );
}
pub fn indexOfAnyUnsafe(
ptr: @This(),
comptime charset: []const u8,
- comptime opts: struct { max_unroll: usize = 1 },
+ comptime opts: SimdOptions,
) usize {
var it = ptr._ptr;
if (!@inComptime()) {
- if (simd_maybe(opts.max_unroll)) |simd| {
+ if (simd_maybe(opts)) |simd| {
const splats: [charset.len]simd.CharBlock = comptime blk: {
var splats_: []const simd.CharBlock = &.{};
for (charset) |c| {