mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

padded_str_impl.zig (11636B)


      1 const std = @import("std");
      2 const assert = std.debug.assert;
      3 const panic = std.debug.panic;
      4 
      5 /// A slice pointer with a "many sentinel":
      6 /// not only must there be a newline at the end,
      7 /// the next PADDING - 1 bytes must also be readable.
      8 pub fn PaddedSlice(comptime PADDING_: usize) type {
      9     return struct {
     10         /// How many '\n' chars exists past the end of line
     11         pub const PADDING = PADDING_;
     12         ptr: PaddedMany(PADDING),
     13         len: usize,
     14 
     15         pub fn from(comptime str_lit: []const u8) @This() {
     16             return .{
     17                 .ptr = .from(str_lit),
     18                 .len = str_lit.len,
     19             };
     20         }
     21 
     22         pub fn toUnpaddedSlice(self: @This()) [:'\n']const u8 {
     23             return self.ptr._ptr[0..self.len :'\n'];
     24         }
     25 
     26         pub fn index(self: @This(), idx: usize) u8 {
     27             assert(idx < self.len);
     28             return self.ptr.index(idx);
     29         }
     30 
     31         pub fn sliceOpen(self: @This(), start: usize) @This() {
     32             assert(start <= self.len);
     33             return .{
     34                 .ptr = self.ptr.sliceOpen(start),
     35                 .len = self.len - start,
     36             };
     37         }
     38 
     39         pub fn sliceRange(self: @This(), start: usize, end: usize) @This() {
     40             assert(start <= end);
     41             assert(end <= self.len);
     42             return self.ptr.sliceRange(start, end);
     43         }
     44 
     45         pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 {
     46             const padded_input = try gpa.alloc(u8, input.len + PADDING);
     47             @memcpy(padded_input[0..input.len], input);
     48             padded_input[input.len] = '\n';
     49             return padded_input;
     50         }
     51 
     52         pub fn init(padded_input: []const u8) error{InputNotPadded}!@This() {
     53             if (padded_input.len < PADDING or padded_input[padded_input.len - PADDING] != '\n')
     54                 return error.InputNotPadded;
     55             const actual_input = padded_input[0 .. padded_input.len - PADDING :'\n'];
     56             return .{
     57                 .ptr = .{ ._ptr = actual_input.ptr },
     58                 .len = actual_input.len,
     59             };
     60         }
     61 
     62         pub fn indexOfNotSpaceOrTab(self: @This()) usize {
     63             return self.ptr.indexOfNotSpaceOrTab();
     64         }
     65 
     66         pub fn indexOfNotSpace(self: @This()) usize {
     67             return self.ptr.indexOfNotSpace();
     68         }
     69 
     70         /// Splits the first line from the string, returning a tuple of two padded slices.
     71         /// The first padded slice is `null` if there are no lines left.
     72         pub fn splitOneLine(self: @This()) struct { ?@This(), @This() } {
     73             if (self.len == 0) return .{ null, self };
     74             const line_len = self.ptr.indexOfNewline();
     75             return .{
     76                 .{
     77                     .ptr = .{
     78                         ._ptr = self.ptr._ptr,
     79                     },
     80                     .len = line_len,
     81                 },
     82                 .{
     83                     .ptr = .{
     84                         ._ptr = self.ptr._ptr + line_len + 1,
     85                     },
     86                     .len = self.len - line_len - 1,
     87                 },
     88             };
     89         }
     90 
     91         pub fn indexOfNone(self: @This(), comptime charset: []const u8) usize {
     92             return self.ptr.indexOfNone(charset);
     93         }
     94 
     95         pub fn indexOfAny(self: @This(), comptime charset: []const u8) usize {
     96             return self.ptr.indexOfAny(charset);
     97         }
     98 
     99         pub fn indexOfNotCharUnsafe(self: @This(), char: u8) usize {
    100             return self.ptr.indexOfNotCharUnsafe(char);
    101         }
    102     };
    103 }
    104 
    105 /// A many pointer with a "many sentinel":
    106 /// not only must there be a newline at the end,
    107 /// the next PADDING - 1 bytes must also be readable.
    108 pub fn PaddedMany(comptime PADDING_: usize) type {
    109     return struct {
    110         /// How many '\n' chars exists past the end of line
    111         pub const PADDING = PADDING_;
    112         _ptr: [*:'\n']const u8,
    113 
    114         pub fn index(self: @This(), idx: usize) u8 {
    115             return self._ptr[idx];
    116         }
    117 
    118         pub fn sliceOpen(ptr: @This(), start: usize) @This() {
    119             return .{ ._ptr = ptr._ptr[start..] };
    120         }
    121 
    122         pub fn sliceRange(self: @This(), start: usize, end: usize) PaddedSlice(PADDING) {
    123             return .{
    124                 .ptr = self.sliceOpen(start),
    125                 .len = end - start,
    126             };
    127         }
    128 
    129         pub fn calcOffsetTo(ptr: @This(), other: @This()) usize {
    130             return other._ptr - ptr._ptr;
    131         }
    132 
    133         const SimdOptions = struct {
    134             /// Whether to disable SIMD entirely
    135             disable: bool = false,
    136             /// How many loads to do in parallel in 1 loop iteration.
    137             /// Consider increasing if your input distribution contains long strings.
    138             /// Note that this may not have an effect if PADDING is not large enough.
    139             max_unroll: ?usize = null,
    140             /// How many bytes to read in 1 loop iteration.
    141             /// Gives more fine grained control than max_unroll.
    142             max_block_len: ?usize = null,
    143         };
    144 
    145         fn simd_maybe(comptime options: SimdOptions) ?type {
    146             if (switch (@import("builtin").zig_backend) {
    147                 .stage2_llvm, .stage2_c => true,
    148                 else => false,
    149             } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717
    150             and !options.disable) {
    151                 if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| {
    152                     return struct {
    153                         const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING);
    154                         const BLOCK_LEN = @min(
    155                             options.max_block_len orelse PADDING,
    156                             (options.max_unroll orelse 1) * BLOCK_LEN__,
    157                             @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__,
    158                         );
    159                         const CharBlock = @Vector(BLOCK_LEN, u8);
    160                         const BitBlock = @Vector(BLOCK_LEN, u1);
    161                         const BoolBlock = @Vector(BLOCK_LEN, bool);
    162                         const IntBlock = @Type(.{ .int = .{
    163                             .signedness = .unsigned,
    164                             .bits = BLOCK_LEN,
    165                         } });
    166                     };
    167                 }
    168             }
    169             return null;
    170         }
    171 
    172         pub fn from(comptime str_lit: []const u8) @This() {
    173             return .{ ._ptr = (str_lit ++ ("\n" ** PADDING))[0..str_lit.len :'\n'] };
    174         }
    175 
    176         /// charset should be as small as possible -- one vector `cmpeq` and `or`
    177         /// is emitted for each char in the charset. if you want range queries,
    178         /// copy paste this and write your own custom impl.
    179         pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize {
    180             if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null)
    181                 @compileError("charset must not contain newline, otherwise this op is potentially unsafe.");
    182             return ptr.indexOfNoneUnsafe(
    183                 charset,
    184                 .{ .disable = true },
    185             );
    186         }
    187 
    188         pub fn indexOfNoneUnsafe(
    189             ptr: @This(),
    190             comptime charset: []const u8,
    191             comptime opts: SimdOptions,
    192         ) usize {
    193             var it = ptr._ptr;
    194             if (!@inComptime()) {
    195                 if (simd_maybe(opts)) |simd| {
    196                     const splats: [charset.len]simd.CharBlock = comptime blk: {
    197                         var splats_: []const simd.CharBlock = &.{};
    198                         for (charset) |c| {
    199                             splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
    200                         }
    201                         break :blk splats_[0..charset.len].*;
    202                     };
    203                     while (true) {
    204                         const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
    205                         var unmatches_bits = ~@as(simd.BitBlock, @splat(0));
    206                         for (splats) |splat| {
    207                             unmatches_bits &= @bitCast(block != splat);
    208                         }
    209                         const unmatches_int: simd.IntBlock = @bitCast(unmatches_bits);
    210                         if (unmatches_int != 0) {
    211                             return it - ptr._ptr + @ctz(unmatches_int);
    212                         }
    213                         it += simd.BLOCK_LEN;
    214                     }
    215                 }
    216             }
    217 
    218             while (true) {
    219                 inline for (charset) |c| {
    220                     if (it[0] == c) break;
    221                 } else return it - ptr._ptr;
    222                 it += 1;
    223             }
    224         }
    225 
    226         /// Non-SIMD implementation here because all current callers expect this to be a short string.
    227         /// Must not be used with newline as that can be unsafe. Use std.mem.indexOfNone for that.
    228         pub fn indexOfNotCharUnsafe(ptr: @This(), c: u8) usize {
    229             if (c == '\n') panic("Unsafe use of indexOfNotCharUnsafe, {c} must not be newline", .{c});
    230             var it = ptr._ptr;
    231             while (true) {
    232                 if (it[0] != c) return it - ptr._ptr;
    233                 it += 1;
    234             }
    235         }
    236 
    237         /// charset should be as small as possible -- one vector `cmpeq` and `or`
    238         /// is emitted for each char in the charset. if you want range queries,
    239         /// copy paste this and write your own custom impl.
    240         pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize {
    241             if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null)
    242                 @compileError("charset must contain newline, otherwise this op is potentially unsafe.");
    243             return ptr.indexOfAnyUnsafe(
    244                 charset,
    245                 .{ .max_unroll = 2 },
    246             );
    247         }
    248 
    249         pub fn indexOfAnyUnsafe(
    250             ptr: @This(),
    251             comptime charset: []const u8,
    252             comptime opts: SimdOptions,
    253         ) usize {
    254             var it = ptr._ptr;
    255             if (!@inComptime()) {
    256                 if (simd_maybe(opts)) |simd| {
    257                     const splats: [charset.len]simd.CharBlock = comptime blk: {
    258                         var splats_: []const simd.CharBlock = &.{};
    259                         for (charset) |c| {
    260                             splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)};
    261                         }
    262                         break :blk splats_[0..charset.len].*;
    263                     };
    264                     while (true) {
    265                         const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*;
    266                         var matches_bits = @as(simd.BitBlock, @splat(0));
    267                         for (splats) |splat| {
    268                             matches_bits |= @bitCast(block == splat);
    269                         }
    270                         const matches_int: simd.IntBlock = @bitCast(matches_bits);
    271                         if (matches_int != 0) {
    272                             return it - ptr._ptr + @ctz(matches_int);
    273                         }
    274                         it += simd.BLOCK_LEN;
    275                     }
    276                 }
    277             }
    278 
    279             while (true) {
    280                 inline for (charset) |c| {
    281                     if (it[0] == c) return it - ptr._ptr;
    282                 }
    283                 it += 1;
    284             }
    285         }
    286 
    287         pub fn indexOfNotSpaceOrTab(ptr: @This()) usize {
    288             return ptr.indexOfNone(" \t");
    289         }
    290 
    291         pub fn indexOfNotSpace(ptr: @This()) usize {
    292             return ptr.indexOfNotCharUnsafe(' ');
    293         }
    294 
    295         pub fn indexOfNewline(ptr: @This()) usize {
    296             return ptr.indexOfAny("\n");
    297         }
    298     };
    299 }