AstGen.zig (22728B)
1 const std = @import("std"); 2 const assert = std.debug.assert; 3 4 const tracy = @import("tracy"); 5 6 const Ast = @import("Ast.zig"); 7 const StrOffset = Ast.StrOffset; 8 const StrLen = Ast.StrLen; 9 const Node = Ast.Node; 10 const Error = Ast.Error; 11 const IndentationScanner = @import("IndentationScanner.zig"); 12 const IndentAlignment = IndentationScanner.IndentAlignment; 13 const IndentOptions = IndentationScanner.IndentOptions; 14 const PaddedMany = @import("padded_str.zig").PaddedMany; 15 const PaddedSlice = @import("padded_str.zig").PaddedSlice; 16 const str = @import("str.zig"); 17 const utils = @import("utils.zig"); 18 19 const AstGen = @This(); 20 21 gpa: std.mem.Allocator, 22 output_gpa: std.mem.Allocator, 23 output_gpa_same_as_gpa: bool, 24 input: PaddedSlice, 25 scanner: IndentationScanner, 26 nodes: std.ArrayListUnmanaged(Node), 27 errors: std.ArrayListUnmanaged(Error), 28 29 num_node_allocs: if (PRINT_ALLOC_STATS) usize else void = if (PRINT_ALLOC_STATS) 0, 30 num_error_allocs: if (PRINT_ALLOC_STATS) usize else void = if (PRINT_ALLOC_STATS) 0, 31 32 fn getNode(self: *const AstGen, idx: Node.Idx) *Node { 33 @setRuntimeSafety(true); 34 return &self.nodes.items[@intFromEnum(idx)]; 35 } 36 fn lastNodeIdx(self: *const AstGen) Node.Idx { 37 @setRuntimeSafety(true); 38 return @enumFromInt(self.nodes.items.len - 1); 39 } 40 fn nextNodeIdx(self: *const AstGen) Node.Idx { 41 @setRuntimeSafety(true); 42 return @enumFromInt(self.nodes.items.len); 43 } 44 45 const PRINT_ALLOC_STATS = false; 46 47 fn appendChildNode(self: *AstGen, parent_idx: Node.Idx, node: Node) !Node.Idx { 48 try self.ensureUnusedCapacityNode(1); 49 return self.appendAssumeCapacityChildNode(parent_idx, node); 50 } 51 fn appendNode(self: *AstGen, node: Node) !Node.Idx { 52 try self.ensureUnusedCapacityNode(1); 53 return self.appendAssumeCapacityNode(node); 54 } 55 fn ensureUnusedCapacityNode(self: *AstGen, additional_count: usize) !void { 56 if (self.nodes.items.len > (1 << @bitSizeOf( 57 @typeInfo(Node.Idx).@"enum".tag_type, 58 )) - additional_count) return error.OutOfNodeIdx; 59 return self.nodes.ensureUnusedCapacity(self.gpa, additional_count); 60 } 61 62 fn appendAssumeCapacityChildNode(self: *AstGen, parent_idx: Node.Idx, node: Node) Node.Idx { 63 self.getNode(parent_idx).numChildrenMut().?.* += 1; 64 return self.appendAssumeCapacityNode(node); 65 } 66 fn appendAssumeCapacityNode(self: *AstGen, node: Node) Node.Idx { 67 const idx = self.nodes.items.len; 68 const cap = if (PRINT_ALLOC_STATS) self.nodes.capacity; 69 self.nodes.appendAssumeCapacity(node); 70 if (PRINT_ALLOC_STATS and cap != self.nodes.capacity) { 71 self.num_node_allocs += 1; 72 } 73 return @enumFromInt(idx); 74 } 75 76 fn makeContainerNode(self: *AstGen, comptime container_node_tag: Node.Tag, ptr: PaddedMany) Node { 77 return .initContainerNode(container_node_tag, @intCast(self.input.ptr.calcOffsetTo(ptr)), 0); 78 } 79 fn makeContainerNodeRuntime(self: *AstGen, container_node_tag: Node.Tag, ptr: PaddedMany) ?Node { 80 return .initContainerNodeRuntime(container_node_tag, @intCast(self.input.ptr.calcOffsetTo(ptr)), 0); 81 } 82 fn makeLeafNode(self: *AstGen, comptime leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) Node { 83 return .initLeafNode(leaf_node_tag, @intCast(self.input.ptr.calcOffsetTo(ptr)), len); 84 } 85 fn makeLeafNodeRuntime(self: *AstGen, leaf_node_tag: Node.Tag, ptr: PaddedMany, len: StrLen) ?Node { 86 return .initLeafNodeRuntime(leaf_node_tag, @intCast(self.input.ptr.calcOffsetTo(ptr)), len); 87 } 88 89 fn appendError(self: *AstGen, err: Error) !void { 90 if (self.errors.items.len > std.math.maxInt( 91 @typeInfo(Error.Idx).@"enum".tag_type, 92 )) return error.OutOfErrorIdx; 93 const cap = if (PRINT_ALLOC_STATS) self.errors.capacity; 94 try self.errors.append(self.gpa, err); 95 if (PRINT_ALLOC_STATS and cap != self.errors.capacity) { 96 self.num_error_allocs += 1; 97 } 98 } 99 fn makePointError(self: *AstGen, comptime point_error_tag: Error.Tag, node_idx: Node.Idx, ptr: PaddedMany) Error { 100 return .initPointError(point_error_tag, node_idx, @intCast(self.input.ptr.calcOffsetTo(ptr))); 101 } 102 fn makePointErrorRuntime(self: *AstGen, point_error_tag: Error.Tag, node_idx: Node.Idx, ptr: PaddedMany) ?Error { 103 return .initPointErrorRuntime(point_error_tag, node_idx, @intCast(self.input.ptr.calcOffsetTo(ptr))); 104 } 105 106 fn castStrLen(len: usize, comptime err: anytype) @TypeOf(err)!StrLen { 107 return if (len <= std.math.maxInt(StrLen)) @intCast(len) else return err; 108 } 109 110 pub fn deinit(self: *AstGen) void { 111 self.nodes.deinit(self.gpa); 112 self.errors.deinit(self.gpa); 113 } 114 115 /// Parses mymarkdown 116 /// 117 /// : `gpa` 118 /// + A suitable allocator for scratch allocations that supports free and ideally remap. 119 /// : `output_gpa` 120 /// + If passed, no scratch allocations will outlive this function, 121 /// and any allocations returned will be allocated on this. 122 /// : `input` 123 /// + The input slice to be parsed. Must end in at least 1024 \n characters. 124 /// 125 /// Errors: 126 /// : `IndentationTooLong` 127 /// + This implementation of mymarkdown supports up to 1023 characters of indentation. 128 pub fn parse( 129 gpa: std.mem.Allocator, 130 output_gpa: ?std.mem.Allocator, 131 input_: []const u8, 132 ) !Ast { 133 const tracy_frame = tracy.trace(@src()); 134 defer tracy_frame.end(); 135 136 const input: PaddedSlice = try .init(input_); 137 if (input.len >= std.math.maxInt(u32)) 138 return error.InputTooLarge; 139 140 var ast: AstGen = .{ 141 .gpa = gpa, 142 .output_gpa = output_gpa orelse gpa, 143 .output_gpa_same_as_gpa = output_gpa == null, 144 .input = input, 145 .scanner = .init(input), 146 .nodes = .empty, 147 .errors = .empty, 148 }; 149 defer ast.deinit(); 150 151 { 152 const tracy_frame2 = tracy.traceNamed(@src(), "allocating"); 153 defer tracy_frame2.end(); 154 155 // Based on sample.my, there is around 1 node for every 26 bytes in the input. 156 // Divide the # of bytes by 32 as a heuristic. 157 // TODO: Review this when inline parsing is implemented 158 if (PRINT_ALLOC_STATS) ast.num_node_allocs += 1; 159 try ast.nodes.ensureTotalCapacity(gpa, input.len / 32); 160 161 // Expect there to be no errors 162 // try ast.errors.ensureTotalCapacity(gpa, ...); 163 } 164 165 try ast.parseRoot(); 166 167 std.sort.pdq(Error, ast.errors.items, {}, struct { 168 fn func(_: void, lhs: Error, rhs: Error) bool { 169 return @intFromEnum(lhs.node_idx) < @intFromEnum(rhs.node_idx); 170 } 171 }.func); 172 173 // If you want to figure out better parameters for allocation 174 if (PRINT_ALLOC_STATS) { 175 const num_newlines = blk: { 176 const tracy_frame2 = tracy.traceNamed(@src(), "counting newlines"); 177 defer tracy_frame2.end(); 178 179 var num_newlines: usize = 0; 180 for (input.toUnpaddedSlice()) |c| { 181 if (c == '\n') num_newlines += 1; 182 } 183 break :blk num_newlines; 184 }; 185 186 std.debug.print("num_newlines = {}\n", .{num_newlines}); 187 std.debug.print("input.len = {}\n", .{input.len}); 188 std.debug.print("ast.nodes.capacity = {}\n", .{ast.nodes.capacity}); 189 std.debug.print("ast.errors.capacity = {}\n", .{ast.errors.capacity}); 190 std.debug.print("ast.nodes.items.len = {}\n", .{ast.nodes.items.len}); 191 std.debug.print("ast.errors.items.len = {}\n", .{ast.errors.items.len}); 192 std.debug.print("ast.num_node_allocs = {}\n", .{ast.num_node_allocs}); 193 std.debug.print("ast.num_error_allocs = {}\n", .{ast.num_error_allocs}); 194 } 195 196 if (output_gpa) |gpa2| { 197 const nodes = try gpa2.dupe(Node, ast.nodes.items); 198 errdefer gpa2.free(nodes); 199 const errors = try gpa2.dupe(Error, ast.errors.items); 200 errdefer gpa2.free(errors); 201 return .{ 202 .nodes = nodes, 203 .errors = errors, 204 }; 205 } else { 206 const nodes = try ast.nodes.toOwnedSlice(gpa); 207 errdefer gpa.free(nodes); 208 const errors = try ast.errors.toOwnedSlice(gpa); 209 errdefer gpa.free(errors); 210 return .{ 211 .nodes = nodes, 212 .errors = errors, 213 }; 214 } 215 } 216 217 fn parseRoot(self: *AstGen) !void { 218 const tracy_frame = tracy.trace(@src()); 219 defer tracy_frame.end(); 220 const root = try self.appendNode(self.makeContainerNode(.document, self.input.ptr)); 221 assert(root == .root); 222 if (self.scanner.peek()) |p| assert(self.input.ptr._ptr == p.@"1".ptr._ptr); 223 224 _ = try self.parseColumn(root, null); 225 assert(self.scanner.peek() == null); 226 } 227 228 /// Used in headings, semicolon paragraphs, etc. 229 /// Has different rules as regular unmarked paragraphs, because 230 /// unmarked paragraphs are interrupted by block elements and 231 /// do not consume the whole column, whereas here, we ignore 232 /// block special characters. 233 fn parseInlineColumn( 234 self: *AstGen, 235 parent_idx: Node.Idx, 236 indent_opts: IndentOptions, 237 ) !void { 238 var error_state = false; 239 try self.scanner.indent(indent_opts); 240 defer if (!error_state) self.scanner.dedent(); 241 errdefer error_state = true; 242 243 // TODO: Inline parsing 244 { 245 var peek_maybe = self.scanner.peek(); 246 if (peek_maybe == null) return; 247 const tracy_frame = tracy.trace(@src()); 248 defer tracy_frame.end(); 249 tracy_frame.setName("inline"); 250 tracy_frame.addText(peek_maybe.?.@"1".toUnpaddedSlice()); 251 var text_tag: TextTag = .text; 252 var saw_empty_line = false; 253 while (peek_maybe) |peek| { 254 const alignment, const line = peek; 255 if (line.len == 0 or line.len == line.indexOfNotSpaceOrTab()) { 256 saw_empty_line = true; 257 self.scanner.advance(); 258 peek_maybe = self.scanner.peek(); 259 continue; 260 } 261 if (alignment == .misaligned) 262 try self.appendError(self.makePointError(.inconsistent_indentation, self.nextNodeIdx(), line.ptr)); 263 if (saw_empty_line) { 264 try self.appendError(.initNodeError(.unexpected_block_in_inline_context, self.nextNodeIdx())); 265 saw_empty_line = false; 266 } 267 try self.insertTextLine(text_tag, .text, parent_idx, line); 268 self.scanner.advance(); 269 peek_maybe = self.scanner.peek(); 270 text_tag = .space_text; 271 } 272 } 273 } 274 275 fn parseColumn( 276 self: *AstGen, 277 parent_idx: Node.Idx, 278 indent_opts_maybe: ?IndentOptions, 279 ) !void { 280 var error_state = false; 281 if (indent_opts_maybe) |indent_opts| try self.scanner.indent(indent_opts); 282 defer if (indent_opts_maybe != null and !error_state) self.scanner.dedent(); 283 errdefer error_state = true; 284 285 // The type of list and the idx of the list node 286 var current_list: ?struct { Node.Tag, Node.Idx } = null; 287 // The elaboratable node idx and the required length of the elaboration marker 288 var last_elaboratable_idx: ?struct { Node.Idx, usize } = null; 289 290 parsing_blocks: while (self.scanner.peek()) |peek| { 291 const alignment, const raw_line = peek; 292 293 // Strip leading whitespace 294 const line_start = raw_line.indexOfNone(" \t"); 295 const overaligned = line_start != 0; 296 const line = raw_line.sliceOpen(line_start); 297 298 // Skip empty lines 299 if (line.len == 0) { 300 self.scanner.advance(); 301 continue :parsing_blocks; 302 } 303 304 const TRACY_INSTRUMENT_HERE = true; 305 const tracy_frame = if (TRACY_INSTRUMENT_HERE) tracy.trace(@src()); 306 defer if (TRACY_INSTRUMENT_HERE) tracy_frame.end(); 307 if (TRACY_INSTRUMENT_HERE) tracy_frame.addText(line.toUnpaddedSlice()); 308 // Parse one block and get its id so we can attach an error to it if the block was misaligned. 309 const block_idx: Node.Idx = block_idx: { 310 // Reset list if any other block type appears 311 switch (line.index(0)) { 312 '-', '.', ':', '+' => {}, 313 else => current_list = null, 314 } 315 316 switch (line.index(0)) { 317 // Headings 318 '#' => { 319 const marker_len = line.indexOfNotCharUnsafe('#'); 320 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName(@tagName(.heading)); 321 try self.ensureUnusedCapacityNode(2); 322 const block_idx = self.appendAssumeCapacityChildNode(parent_idx, self.makeContainerNode(.heading, line.ptr)); 323 _ = self.appendAssumeCapacityChildNode(block_idx, self.makeLeafNode(.marker, line.ptr, try castStrLen(marker_len, error.MarkerTooLong))); 324 last_elaboratable_idx = .{ block_idx, marker_len }; 325 try self.parseInlineColumn(block_idx, .{ .skip = line_start + marker_len }); 326 break :block_idx block_idx; 327 }, 328 329 // List markers 330 '-', '.', ':' => |m| { 331 const base_marker_len = line.indexOfNotCharUnsafe(m); 332 const marker_len, const list_type: Node.Tag = blk: { 333 if (m == '-') { 334 var potential_task_item = line.sliceOpen(base_marker_len).indexOfNone("[ ]xX"); 335 while (potential_task_item >= 3 and line.index(base_marker_len + potential_task_item - 1) == ' ') 336 potential_task_item -= 1; 337 if (potential_task_item >= 3 and 338 line.index(base_marker_len + potential_task_item - 1) == ']' and 339 line.index(base_marker_len + potential_task_item - 3) == '[' and 340 (line.index(base_marker_len + potential_task_item - 2) == ' ' or 341 line.index(base_marker_len + potential_task_item - 2) == 'x' or 342 line.index(base_marker_len + potential_task_item - 2) == 'X') and 343 std.mem.allEqual(u8, line.toUnpaddedSlice()[base_marker_len .. base_marker_len + potential_task_item - 3], ' ')) 344 { 345 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName("task_item"); 346 break :blk .{ base_marker_len + potential_task_item, .task_item }; 347 } 348 } 349 const list_type = Node.Tag.fromMarkerRuntime(m).?; 350 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName(@tagName(list_type)); 351 break :blk .{ base_marker_len, list_type }; 352 }; 353 try self.ensureUnusedCapacityNode(3); 354 const list_node_idx = blk: { 355 if (current_list) |cur| { 356 const cur_tag, const cur_node_idx = cur; 357 if (list_type == cur_tag) { 358 break :blk cur_node_idx; 359 } 360 } 361 const list_node_idx = self.appendAssumeCapacityChildNode(parent_idx, self.makeContainerNode(.list, line.ptr)); 362 current_list = .{ list_type, list_node_idx }; 363 break :blk list_node_idx; 364 }; 365 const block_idx = self.appendAssumeCapacityChildNode(list_node_idx, self.makeContainerNodeRuntime(list_type, line.ptr).?); 366 _ = self.appendAssumeCapacityChildNode(block_idx, self.makeLeafNode(.marker, line.ptr, try castStrLen(marker_len, error.MarkerTooLong))); 367 last_elaboratable_idx = .{ block_idx, base_marker_len }; 368 try self.parseInlineColumn(block_idx, .{ .skip = line_start + marker_len }); 369 break :block_idx block_idx; 370 }, 371 372 // Elaborations 373 '+' => { 374 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName(@tagName(.elaboration)); 375 const marker_len = try castStrLen(line.indexOfNotCharUnsafe('+'), error.MarkerTooLong); 376 try self.ensureUnusedCapacityNode(2); 377 const block_idx = if (last_elaboratable_idx) |elab| blk: { 378 const elaboratable_idx, const required_marker_len = elab; 379 const block_idx = self.appendAssumeCapacityChildNode(elaboratable_idx, self.makeContainerNode(.elaboration, line.ptr)); 380 if (marker_len != required_marker_len) 381 try self.appendError(.initNodeError(.incorrect_elaboration_marker, block_idx)); 382 break :blk block_idx; 383 } else blk: { 384 const block_idx = self.appendAssumeCapacityChildNode(parent_idx, self.makeContainerNode(.elaboration, line.ptr)); 385 try self.appendError(.initNodeError(.elaboration_after_unelaboratable_node, block_idx)); 386 break :blk block_idx; 387 }; 388 _ = self.appendAssumeCapacityChildNode(block_idx, self.makeLeafNode(.marker, line.ptr, marker_len)); 389 try self.parseColumn(block_idx, .{ .skip = line_start + marker_len }); 390 break :block_idx block_idx; 391 }, 392 393 // Par-like single markers 394 ';' => { 395 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName("paragraph"); 396 const block_idx = try self.appendChildNode(parent_idx, self.makeContainerNode(.paragraph, line.ptr)); 397 last_elaboratable_idx = null; 398 try self.parseInlineColumn(block_idx, .{ .skip = line_start + 1 }); 399 break :block_idx block_idx; 400 }, 401 402 // Div-like single markers 403 '>' => { 404 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName("quote"); 405 const block_idx = try self.appendChildNode(parent_idx, self.makeContainerNode(.quote, line.ptr)); 406 last_elaboratable_idx = null; 407 try self.parseColumn(block_idx, .{ .skip = line_start + 1 }); 408 break :block_idx block_idx; 409 }, 410 411 '*' => { 412 if (std.mem.eql(u8, line.toUnpaddedSlice().ptr[0..3], "***")) { 413 const after_stars = line.sliceOpen(3); 414 if (after_stars.len == after_stars.indexOfNotSpaceOrTab()) { 415 self.scanner.advance(); 416 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName(@tagName(.thematic_break)); 417 last_elaboratable_idx = null; 418 break :block_idx try self.appendChildNode(parent_idx, self.makeLeafNode(.thematic_break, line.ptr, 3)); 419 } 420 } 421 }, 422 423 else => {}, 424 } 425 426 // Parse paragraph 427 { 428 if (TRACY_INSTRUMENT_HERE) tracy_frame.setName(@tagName(.paragraph)); 429 try self.ensureUnusedCapacityNode(3); 430 const paragraph_idx = self.appendAssumeCapacityChildNode(parent_idx, self.makeContainerNode(.paragraph, line.ptr)); 431 last_elaboratable_idx = null; 432 try self.insertTextLine(.text, .text, paragraph_idx, line); 433 self.scanner.advance(); 434 435 while (self.scanner.peek()) |peek2| { 436 const alignment2, const line2 = peek2; 437 const line_load4: u32 = @bitCast(line2.ptr._ptr[0..4].*); 438 const line_load3: u32 = line_load4 & @as(u32, @bitCast("\xff\xff\xff\x00\x00"[0..4].*)); 439 const line_load2: u32 = line_load4 & @as(u32, @bitCast("\xff\xff\x00\x00\x00"[0..4].*)); 440 441 switch (line2.ptr._ptr[0]) { 442 // Special block chars / actually empty line 443 '\n', '-', '.', ':', '+', '>', '#', '@' => break, 444 // Empty line (only whitespace) 445 ' ', '\t' => if (line2.len == line2.indexOfNotSpaceOrTab()) break, 446 // string literals have extra chars so [0..4] doesn't give a sentinel ptr 447 // Thematic break 448 '*' => if (line_load3 == 449 @as(u32, @bitCast("***\x00\x00"[0..4].*)) and 450 (line2.len == 3 or 451 line2.len == 3 + line2.sliceOpen(3).indexOfNotSpaceOrTab())) break, 452 // Verbatim block 453 '=' => if (line_load2 == 454 @as(u32, @bitCast("==\x00\x00\x00"[0..4].*))) break, 455 // Math block 456 '$' => if (line_load3 == 457 @as(u32, @bitCast("$==\x00\x00"[0..4].*))) break, 458 else => {}, 459 } 460 461 if (alignment2 == .misaligned) 462 try self.appendError(self.makePointError(.inconsistent_indentation, self.nextNodeIdx(), line2.ptr)); 463 try self.insertTextLine(.space_text, .text, paragraph_idx, line2); 464 self.scanner.advance(); 465 } 466 467 break :block_idx paragraph_idx; 468 } 469 }; 470 471 if (alignment == .misaligned or overaligned) { 472 try self.appendError(self.makePointError(.inconsistent_indentation, block_idx, raw_line.ptr)); 473 } 474 } 475 } 476 477 const TextTag = enum { text, space_text }; 478 fn insertTextLine( 479 self: *AstGen, 480 first_text_tag_: TextTag, 481 rest_text_tag_: TextTag, 482 parent_idx: Node.Idx, 483 line_: PaddedSlice, 484 ) !void { 485 const first_text_tag: Node.Tag = switch (first_text_tag_) { 486 .text => .text, 487 .space_text => .space_text, 488 }; 489 const rest_text_tag: Node.Tag = switch (rest_text_tag_) { 490 .text => .text, 491 .space_text => .space_text, 492 }; 493 494 var line = line_; 495 if (line.len <= std.math.maxInt(StrLen)) { 496 _ = try self.appendChildNode(parent_idx, self.makeLeafNodeRuntime(first_text_tag, line.ptr, @intCast(line.len)).?); 497 } else { 498 @branchHint(.cold); 499 { 500 const consumed_len = @min(line.len, std.math.maxInt(StrLen)); 501 _ = try self.appendChildNode(parent_idx, self.makeLeafNodeRuntime(first_text_tag, line.ptr, @intCast(consumed_len)).?); 502 line = line.sliceOpen(consumed_len); 503 } 504 while (line.len > 0) { 505 const consumed_len = @min(line.len, std.math.maxInt(StrLen)); 506 _ = try self.appendChildNode(parent_idx, self.makeLeafNodeRuntime(rest_text_tag, line.ptr, @intCast(consumed_len)).?); 507 line = line.sliceOpen(consumed_len); 508 } 509 } 510 }