IndentationScanner.zig (21012B)
1 //! # Indentation iterator for mymarkdown 2 //! 3 //! This file focuses on parsing just the 2d structure of mymarkdown. 4 //! 5 //! An IndentationIterator maintains the following state: 6 //! - a cursor indicating what the current line is 7 //! - a stack of expected indentations 8 //! 9 //! It supports the following operations: 10 //! - `it.peek() : ?struct { IndentAlignment, PaddedSlice }` 11 //! -- no matter what, peek does not mutate state. 12 //! -- peek returns .line if the current indentation is at least to the expected indentation, 13 //! -- else .misaligned_line, if the current intendation is greater than the parent's expected indentation, 14 //! -- else .end_of_block, signalling the end of the current indented section. 15 //! - the remaining functions always return `void` and always mutate state. 16 //! - `it.advance() : void` 17 //! -- advance updates the cursor to the next line 18 //! - `it.indent(options: struct { skip: usize }) : void` 19 //! -- indent pushes a new entry onto the stack of expected indentations. 20 //! -- first, indent treats the first `skip` characters after the expected indentation as spaces. 21 //! -- then, it scans forward for the next non-whitespace character. 22 //! --- if no such character is found, the indentation level is treated as if it were "infinity" 23 //! i.e. no indented block exists. 24 //! --- otherwise, the whitespace before the non-whitespace character found is added to the expected indentation. 25 //! - `it.dedent() : void` 26 //! -- pops from the stack of expected indentations. 27 //! 28 //! ## Example API trace (happy case) 29 //! 30 //! Input: 31 //! 32 //! == 33 //! > line 1 34 //! > line 2 35 //! > line 3 36 //! line 4 37 //! line 5 38 //! line 6 39 //! == 40 //! 41 //! - .peek() -> .{ .correct, "> line 1" } 42 //! - .peek() -> .{ .correct, "> line 1" } 43 //! + (peek does not mutate state) 44 //! - .indent(1) 45 //! -- .peek() -> .{ .correct, "line 1" } 46 //! ++ (indent treats the hyphen as a space, thus removing it from 47 //! the current line.) 48 //! -- .advance() 49 //! -- .peek() -> .{ .correct, "> line 2" } 50 //! -- .indent(1) 51 //! --- .peek() -> .{ .correct, "line 2" } 52 //! --- .advance() 53 //! --- .peek() -> .{ .correct, "> line 3" } 54 //! --- .indent(1) 55 //! ---- .peek() -> .{ .correct, "line 3" } 56 //! ---- .advance() 57 //! ++++ (the cursor is now on line 4, which has 1 space of indentation, 58 //! less than the expected 6 spaces, and less than the 4 spaces 59 //! the parent expects.) 60 //! ---- .peek() -> null 61 //! --- .dedent() 62 //! --- .peek() -> null 63 //! -- .dedent() 64 //! ++ (line 4 has only 1 space of indentation, but it is still indented 65 //! relative to the parent. so it is considered part of the current 66 //! indented block, just that it is misaligned.) 67 //! -- .peek() -> .{ .misaligned, "line 4" } 68 //! -- .advance() 69 //! -- .peek() -> .{ .correct, " line 5" } 70 //! ++ (line 5 matches 2 spaces of indent. the excess spaces are simply 71 //! included in the line.) 72 //! -- .advance() 73 //! -- .peek() -> null 74 //! ++ (expected 2 spaces, parent expects 0 spaces, line 6 has 0 spaces, 75 //! matching the parent.) 76 //! - .dedent() 77 //! - .peek() -> .{ .correct, "line 6" } 78 //! - .advance() 79 //! - .peek() -> null 80 //! 81 //! ## Tabs 82 //! 83 //! Tabs are treated as non-whitespace because it is too hard to give 84 //! intuitive semantics to them. See git tag `tab-rant` to see a version 85 //! of this doc comment that tried to do so. 86 //! 87 //! ## What should happen for misaligned indented blocks 88 //! 89 //! == 90 //! > line 1 91 //! > line 2 92 //! == 93 //! 94 //! - .peek() -> .{ .correct, "> line 1" } 95 //! - .indent(1) 96 //! -- .peek() -> .{ .correct, "line 1" } 97 //! ++ (the indentation is now considered to be 4 spaces) 98 //! -- .advance() 99 //! -- .peek() -> .{ .misaligned, "> line 2" } 100 //! ++ (line 2 is misaligned as it only has 2 spaces) 101 //! -- .indent(1) 102 //! ++ (**what should happen now?**) 103 //! 104 //! ### Does not work: Simply push the new alignment on the stack 105 //! 106 //! The alignment of "line 2" is 4 spaces, so if we simply push it 107 //! onto the indentation stack, we have: 108 //! 109 //! - indentation_stack = .{ 0, 4, 4 } 110 //! 111 //! The parent indentation is the same as our indentation! That means 112 //! "error recovery" is impossible. A 3 space indent will be treated 113 //! as misaligned to the parent, and null will be emitted. 114 //! After the dedent, this 3 space indent is still considered misaligned, 115 //! but it's misaligned to the parent. 116 //! 117 //! == 118 //! > line 1 119 //! > line 2 120 //! line 3 121 //! == 122 //! 123 //! -- .indent(1) 124 //! ++ (resuming from the first example) 125 //! --- .peek() -> .{ .correct, "line 2" } 126 //! --- .advance() 127 //! --- .peek() -> null 128 //! -- .dedent() 129 //! -- .peek() -> .{ .misaligned, "line 3" } 130 //! 131 //! This is also an odd case. Without line 1, line 3 would be considered 132 //! misaligned to line 2, however because it matches up with the parent, 133 //! it actually triggers an null, followed by .line after .dedent(). 134 //! 135 //! == 136 //! > line 1 137 //! > line 2 138 //! line 3 139 //! == 140 //! 141 //! -- .indent(1) 142 //! ++ (resuming from the first example) 143 //! --- .peek() -> .{ .correct, "line 2" } 144 //! --- .advance() 145 //! --- .peek() -> null 146 //! -- .dedent() 147 //! -- .peek() -> .{ .correct, "line 3" } 148 //! 149 //! ### What we actually do: Push two elements for each added indentation level 150 //! 151 //! The idea here is that each indentation level will have its own pair of 152 //! "misaligned" and "actual" alignments. 153 //! 154 //! The indentation stack will start with .{ 0, 0 }, indicating that 155 //! the root alignment is at 0 spaces and has no "misalignment zone". 156 //! Each .indent(...) pushes 2 values onto the stack. 157 //! 158 //! Going back to this example, after the first .indent(1), the indentation 159 //! stack will contain .{ 0, 0, 0, 4 }. After the second .indent(1), 160 //! the indentation stack will contain .{ 0, 0, 0, 4, 2, 4 }, 161 //! indicating that there are 2 spaces worth of "misalignment" possible. 162 //! 163 //! == 164 //! > line 1 165 //! > line 2 166 //! line 3 167 //! == 168 169 const std = @import("std"); 170 const maxInt = std.math.maxInt; 171 const assert = std.debug.assert; 172 const testing = std.testing; 173 174 const padded_str = @import("padded_str.zig"); 175 const PaddedMany = @import("padded_str.zig").PaddedMany; 176 const PaddedSlice = @import("padded_str.zig").PaddedSlice; 177 178 const IndentationScanner = @This(); 179 180 // ================================== 181 // Fields Fields Fields Fields Fields 182 183 /// Stack of expected indentations. 184 /// The current indentation level is equal to the number of entries. 185 _expected_indents: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS), 186 187 /// The line that would be returned by peek (if indentation matches). 188 /// We always pre-scan for indentation when assigning to this field. 189 /// `null` indicates EOF. 190 _cur_line: ?struct { 191 IndentLevel, 192 NumSpaces, 193 PaddedSlice, 194 }, 195 196 /// The remaining lines of the input. 197 _rest_lines: PaddedSlice, 198 199 // ======================================= 200 // Methods Methods Methods Methods Methods 201 202 /// Create an IndentationScanner that tokenizes the given input. 203 pub fn initUnpadded(input_: []const u8) error{InputNotPadded}!IndentationScanner { 204 const input: PaddedSlice = try .init(input_); 205 const first_line, const rest_lines = input.splitOneLine(); 206 return .{ 207 ._expected_indents = while (true) { 208 var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{}; 209 out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 }); 210 break out; 211 }, 212 ._cur_line = if (first_line) |line| .{ .root, 0, line } else null, 213 ._rest_lines = rest_lines, 214 }; 215 } 216 217 /// Create an IndentationScanner that tokenizes the given input. 218 pub fn init(input: PaddedSlice) IndentationScanner { 219 const first_line, const rest_lines = input.splitOneLine(); 220 return .{ 221 ._expected_indents = while (true) { 222 var out: std.BoundedArray(ExpectedIndent, IndentLevel.NUM_INDENT_LEVELS) = .{}; 223 out.appendAssumeCapacity(.{ .misalignment_start = 0, .correct_start = 0 }); 224 break out; 225 }, 226 ._cur_line = if (first_line) |line| .{ .root, 0, line } else null, 227 ._rest_lines = rest_lines, 228 }; 229 } 230 231 /// Peek at the current line's contents, with indentation stripped. 232 /// If indentation matches the parent or less, null is emitted. 233 pub fn peek(it: *const IndentationScanner) PeekResult { 234 const level, const spaces, const content = it._cur_line orelse return null; 235 if (level.compare(.lt, it.curLevel())) return null; 236 assert(level.compare(.eq, it.curLevel())); 237 if (spaces >= it.curExpectedIndents().correct_start) 238 return .{ .correct, content } 239 else 240 return .{ .misaligned, content }; 241 } 242 243 /// Advances the cursor to the next line. 244 /// Precondition: peek() would not have returned null. 245 pub noinline fn advance(it: *IndentationScanner) void { 246 assert(it.peek() != null); 247 it._cur_line = null; 248 const line_maybe, it._rest_lines = it._rest_lines.splitOneLine(); 249 const line = line_maybe orelse return; 250 const spaces = line.indexOfNotSpace(); 251 if (spaces == line.len) { 252 // Lie to the caller -- pretend empty lines have been indented to the correct level 253 it._cur_line = .{ it.curLevel(), it.curExpectedIndents().correct_start, line.sliceOpen(spaces) }; 254 return; 255 } 256 it._cur_line = for (0..it._expected_indents.len) |rev_i| { 257 const i = it._expected_indents.len - 1 - rev_i; 258 const misalignment_start = it._expected_indents.buffer[i].misalignment_start; 259 const correct_start = it._expected_indents.buffer[i].correct_start; 260 if (spaces >= correct_start) { 261 break .{ IndentLevel.from(i) orelse unreachable, correct_start, line.sliceOpen(correct_start) }; 262 } else if (spaces >= misalignment_start) { 263 break .{ IndentLevel.from(i) orelse unreachable, spaces, line.sliceOpen(spaces) }; 264 } 265 } else unreachable; 266 } 267 268 pub const IndentOptions = struct { skip: usize }; 269 270 /// Treats the next `skip` characters as whitespace, 271 /// finds the correct level of indentation, 272 /// then pushes it onto the indentation stack. 273 /// Each `.indent(...)` call should be paired with a `.dedent()` call. 274 /// Precondition: peek() would not have returned null. 275 pub fn indent( 276 it: *IndentationScanner, 277 opts: IndentOptions, 278 ) error{TooMuchIndentation}!void { 279 const level, const spaces, const content = it._cur_line orelse unreachable; 280 assert(level.compare(.eq, it.curLevel())); 281 const additional_indent = opts.skip + content.sliceOpen(opts.skip).indexOfNotSpace(); 282 if (additional_indent == content.len) { 283 it._expected_indents.append( 284 .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES }, 285 ) catch return error.TooMuchIndentation; 286 it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) }; 287 it.advance(); 288 } else { 289 it._expected_indents.append( 290 .{ .misalignment_start = spaces + 1, .correct_start = spaces + additional_indent }, 291 ) catch return error.TooMuchIndentation; 292 it._cur_line = .{ it.curLevel(), spaces + additional_indent, content.sliceOpen(additional_indent) }; 293 } 294 } 295 296 /// Pops from the indentation stack. 297 /// Each `.indent(...)` call should be paired with a `.dedent()` call. 298 /// Precondition: peek would return null, i.e. cursor is at EOF or 299 /// current indentation level is less than expected. 300 pub fn dedent(it: *IndentationScanner) void { 301 assert(it.peek() == null); 302 assert(it._expected_indents.len > 1); 303 _ = it._expected_indents.pop(); 304 } 305 306 // ======================================= 307 // Helpers Helpers Helpers Helpers Helpers 308 309 fn curLevel(it: *const IndentationScanner) IndentLevel { 310 return IndentLevel.from(it._expected_indents.len - 1).?; 311 } 312 313 fn curExpectedIndents(it: *const IndentationScanner) ExpectedIndent { 314 return it._expected_indents.buffer[it._expected_indents.len - 1]; 315 } 316 317 // =================================== 318 // Types Types Types Types Types Types 319 320 const NumSpaces = usize; 321 const MAX_SPACES = maxInt(NumSpaces); 322 const ExpectedIndent = struct { 323 /// Indentation between misalignment_start..correct_start is considered misaligned 324 misalignment_start: NumSpaces, 325 /// Indentation between correct_start..infinity is considered correct 326 correct_start: NumSpaces, 327 }; 328 329 const IndentAlignment = enum { correct, misaligned }; 330 const IndentLevel = enum(u8) { 331 root = 0, 332 max = 254, 333 infinity = 255, 334 _, 335 336 const NUM_INDENT_LEVELS = 255; 337 338 fn from(i: usize) ?IndentLevel { 339 if (i > 254) return null; 340 return @enumFromInt(i); 341 } 342 fn pred(a: IndentLevel) IndentLevel { 343 return @enumFromInt(@intFromEnum(a) - 1); 344 } 345 fn succ(a: IndentLevel) ?IndentLevel { 346 if (a.compare(.eq, .max)) return null; 347 return @enumFromInt(@intFromEnum(a) + 1); 348 } 349 fn compare(a: IndentLevel, op: std.math.CompareOperator, b: IndentLevel) bool { 350 return std.math.compare(@intFromEnum(a), op, @intFromEnum(b)); 351 } 352 }; 353 354 pub const PeekResult = ?struct { IndentAlignment, PaddedSlice }; 355 356 // ======================================= 357 // Testing Testing Testing Testing Testing 358 inline fn expectEqualDeep(comptime T: type, expected: T, actual: T) !void { 359 try testing.expectEqualDeep(expected, actual); 360 } 361 inline fn expectEqualPeekResults(expected: PeekResult, actual: PeekResult) !void { 362 const UnpaddedPeekResult = ?struct { IndentAlignment, [:'\n']const u8 }; 363 try expectEqualDeep( 364 UnpaddedPeekResult, 365 if (expected) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null, 366 if (actual) |r| .{ r.@"0", r.@"1".toUnpaddedSlice() } else null, 367 ); 368 } 369 370 test IndentationScanner { 371 const input_ = 372 \\line 1 373 \\> 374 \\ line 3 375 \\> line 4 376 \\ 377 \\ line 5 378 \\ line 6 379 \\line 7 380 \\> line 8 381 \\ > line 9 382 \\line 10 383 \\> line 11 384 \\ > line 12 385 \\ line 13 386 \\ line 14 387 \\ line 15 388 \\ line 16 389 \\ line 17 390 \\ line 18 391 \\> line 19 392 \\ > line 20 393 \\ 394 ; 395 const input_with_padding = try PaddedSlice.pad(testing.allocator, input_); 396 defer testing.allocator.free(input_with_padding); 397 const input = try PaddedSlice.init(input_with_padding); 398 var it: IndentationScanner = .init(input); 399 400 try expectEqualDeep([]const ExpectedIndent, &.{ 401 .{ .misalignment_start = 0, .correct_start = 0 }, 402 }, it._expected_indents.slice()); 403 try expectEqualPeekResults(.{ .correct, .from("line 1") }, it.peek()); 404 405 it.advance(); 406 try expectEqualPeekResults(.{ .correct, .from(">") }, it.peek()); 407 408 try it.indent(.{ .skip = 1 }); 409 try expectEqualDeep([]const ExpectedIndent, &.{ 410 .{ .misalignment_start = 0, .correct_start = 0 }, 411 .{ .misalignment_start = MAX_SPACES, .correct_start = MAX_SPACES }, 412 }, it._expected_indents.slice()); 413 try expectEqualPeekResults(null, it.peek()); 414 415 it.dedent(); 416 try expectEqualDeep([]const ExpectedIndent, &.{ 417 .{ .misalignment_start = 0, .correct_start = 0 }, 418 }, it._expected_indents.slice()); 419 try expectEqualPeekResults(.{ .correct, .from(" line 3") }, it.peek()); 420 421 it.advance(); 422 try expectEqualPeekResults(.{ .correct, .from("> line 4") }, it.peek()); 423 424 try it.indent(.{ .skip = 1 }); 425 try expectEqualDeep([]const ExpectedIndent, &.{ 426 .{ .misalignment_start = 0, .correct_start = 0 }, 427 .{ .misalignment_start = 1, .correct_start = 2 }, 428 }, it._expected_indents.slice()); 429 try expectEqualPeekResults(.{ .correct, .from("line 4") }, it.peek()); 430 431 it.advance(); 432 try expectEqualPeekResults(.{ .correct, .from("") }, it.peek()); 433 434 it.advance(); 435 try expectEqualPeekResults(.{ .correct, .from("line 5") }, it.peek()); 436 437 it.advance(); 438 try expectEqualPeekResults(.{ .misaligned, .from("line 6") }, it.peek()); 439 440 it.advance(); 441 try expectEqualPeekResults(null, it.peek()); 442 443 it.dedent(); 444 try expectEqualDeep([]const ExpectedIndent, &.{ 445 .{ .misalignment_start = 0, .correct_start = 0 }, 446 }, it._expected_indents.slice()); 447 try expectEqualPeekResults(.{ .correct, .from("line 7") }, it.peek()); 448 449 it.advance(); 450 try expectEqualPeekResults(.{ .correct, .from("> line 8") }, it.peek()); 451 452 try it.indent(.{ .skip = 1 }); 453 try expectEqualDeep([]const ExpectedIndent, &.{ 454 .{ .misalignment_start = 0, .correct_start = 0 }, 455 .{ .misalignment_start = 1, .correct_start = 2 }, 456 }, it._expected_indents.slice()); 457 try expectEqualPeekResults(.{ .correct, .from("line 8") }, it.peek()); 458 459 it.advance(); 460 try expectEqualPeekResults(.{ .correct, .from("> line 9") }, it.peek()); 461 462 try it.indent(.{ .skip = 1 }); 463 try expectEqualDeep([]const ExpectedIndent, &.{ 464 .{ .misalignment_start = 0, .correct_start = 0 }, 465 .{ .misalignment_start = 1, .correct_start = 2 }, 466 .{ .misalignment_start = 3, .correct_start = 4 }, 467 }, it._expected_indents.slice()); 468 try expectEqualPeekResults(.{ .correct, .from("line 9") }, it.peek()); 469 470 it.advance(); 471 try expectEqualPeekResults(null, it.peek()); 472 473 it.dedent(); 474 try expectEqualDeep([]const ExpectedIndent, &.{ 475 .{ .misalignment_start = 0, .correct_start = 0 }, 476 .{ .misalignment_start = 1, .correct_start = 2 }, 477 }, it._expected_indents.slice()); 478 try expectEqualPeekResults(null, it.peek()); 479 480 it.dedent(); 481 try expectEqualDeep([]const ExpectedIndent, &.{ 482 .{ .misalignment_start = 0, .correct_start = 0 }, 483 }, it._expected_indents.slice()); 484 try expectEqualPeekResults(.{ .correct, .from("line 10") }, it.peek()); 485 486 it.advance(); 487 try expectEqualPeekResults(.{ .correct, .from("> line 11") }, it.peek()); 488 489 try it.indent(.{ .skip = 1 }); 490 try expectEqualDeep([]const ExpectedIndent, &.{ 491 .{ .misalignment_start = 0, .correct_start = 0 }, 492 .{ .misalignment_start = 1, .correct_start = 5 }, 493 }, it._expected_indents.slice()); 494 try expectEqualPeekResults(.{ .correct, .from("line 11") }, it.peek()); 495 496 it.advance(); 497 try expectEqualPeekResults(.{ .misaligned, .from("> line 12") }, it.peek()); 498 499 try it.indent(.{ .skip = 1 }); 500 try expectEqualDeep([]const ExpectedIndent, &.{ 501 .{ .misalignment_start = 0, .correct_start = 0 }, 502 .{ .misalignment_start = 1, .correct_start = 5 }, 503 .{ .misalignment_start = 3, .correct_start = 4 }, 504 }, it._expected_indents.slice()); 505 try expectEqualPeekResults(.{ .correct, .from("line 12") }, it.peek()); 506 507 it.advance(); 508 try expectEqualPeekResults(.{ .correct, .from(" line 13") }, it.peek()); 509 510 it.advance(); 511 try expectEqualPeekResults(.{ .correct, .from("line 14") }, it.peek()); 512 513 it.advance(); 514 try expectEqualPeekResults(.{ .misaligned, .from("line 15") }, it.peek()); 515 516 it.advance(); 517 try expectEqualPeekResults(null, it.peek()); 518 519 it.dedent(); 520 try expectEqualDeep([]const ExpectedIndent, &.{ 521 .{ .misalignment_start = 0, .correct_start = 0 }, 522 .{ .misalignment_start = 1, .correct_start = 5 }, 523 }, it._expected_indents.slice()); 524 try expectEqualPeekResults(.{ .misaligned, .from("line 16") }, it.peek()); 525 526 it.advance(); 527 try expectEqualPeekResults(.{ .misaligned, .from("line 17") }, it.peek()); 528 529 it.advance(); 530 try expectEqualPeekResults(.{ .correct, .from("line 18") }, it.peek()); 531 532 it.advance(); 533 try expectEqualPeekResults(null, it.peek()); 534 535 it.dedent(); 536 try expectEqualDeep([]const ExpectedIndent, &.{ 537 .{ .misalignment_start = 0, .correct_start = 0 }, 538 }, it._expected_indents.slice()); 539 try expectEqualPeekResults(.{ .correct, .from("> line 19") }, it.peek()); 540 541 try it.indent(.{ .skip = 1 }); 542 try expectEqualDeep([]const ExpectedIndent, &.{ 543 .{ .misalignment_start = 0, .correct_start = 0 }, 544 .{ .misalignment_start = 1, .correct_start = 2 }, 545 }, it._expected_indents.slice()); 546 try expectEqualPeekResults(.{ .correct, .from("line 19") }, it.peek()); 547 548 it.advance(); 549 try expectEqualPeekResults(.{ .correct, .from("> line 20") }, it.peek()); 550 551 try it.indent(.{ .skip = 1 }); 552 try expectEqualDeep([]const ExpectedIndent, &.{ 553 .{ .misalignment_start = 0, .correct_start = 0 }, 554 .{ .misalignment_start = 1, .correct_start = 2 }, 555 .{ .misalignment_start = 3, .correct_start = 4 }, 556 }, it._expected_indents.slice()); 557 try expectEqualPeekResults(.{ .correct, .from("line 20") }, it.peek()); 558 559 it.advance(); 560 try expectEqualPeekResults(null, it.peek()); 561 562 it.dedent(); 563 try expectEqualDeep([]const ExpectedIndent, &.{ 564 .{ .misalignment_start = 0, .correct_start = 0 }, 565 .{ .misalignment_start = 1, .correct_start = 2 }, 566 }, it._expected_indents.slice()); 567 try expectEqualPeekResults(null, it.peek()); 568 569 it.dedent(); 570 try expectEqualDeep([]const ExpectedIndent, &.{ 571 .{ .misalignment_start = 0, .correct_start = 0 }, 572 }, it._expected_indents.slice()); 573 try expectEqualPeekResults(null, it.peek()); 574 }