mymarkdown

My markdown
git clone https://git.grace.moe/mymarkdown
Log | Files | Refs

str.zig (14100B)


      1 //! Utils for "strings", []u8 or []const u8 slices
      2 //!
      3 //! The only purpose of this file is to reduce typing.
      4 //! `std.mem.indexOfScalar(u8, s, c)` is very long, and
      5 //! this file lets you just type `str.indexOfChar(s, c)`.
      6 //!
      7 //! If I need any functionality I will also just put it here,
      8 //! so this file may have functions not in `std.mem`.
      9 const std = @import("std");
     10 const mem = std.mem;
     11 
     12 const tracy = @import("tracy");
     13 
/// A single byte of a string.
pub const Char = u8;
/// A read-only byte slice used as a string.
pub const Str = []const u8;
/// A read-only byte slice whose elements are treated as a set of bytes
/// (membership is what matters, see `isAnyOf`).
pub const Charset = []const u8;
     17 
     18 pub fn isAnyOf(c: Char, cs: Charset) bool {
     19     return indexOfChar(cs, c) != null;
     20 }
     21 
     22 pub fn isNoneOf(c: Char, cs: Charset) bool {
     23     return !isAnyOf(c, cs);
     24 }
     25 
     26 pub fn indexOfChar(s: Str, c: Char) ?usize {
     27     return mem.indexOfScalar(u8, s, c);
     28 }
     29 
/// Returns the index of the first byte in `slice` that differs from `value`,
/// or `null` when every byte equals `value` (which includes the empty slice).
///
/// When built with the LLVM or C backend, not running under Valgrind and not
/// evaluated at comptime, whole vectors of bytes are compared at once; any
/// bytes the vector paths leave over are handled by the scalar loop at the end.
pub fn indexOfNotChar(slice: Str, value: Char) ?usize {
    // `i` is the number of leading bytes already known to equal `value`.
    var i: usize = 0;
    if (switch (@import("builtin").zig_backend) {
        .stage2_llvm, .stage2_c => true,
        else => false,
    } and
        !std.debug.inValgrind() and // https://github.com/ziglang/zig/issues/17717
        !@inComptime())
    {
        if (std.simd.suggestVectorLength(Char)) |block_len| {
            // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
            // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
            //
            // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
            // however this usually isn't necessary unless your arch has a performance penalty due to this.
            //
            // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
            // line so explicit alignment prologues may be worth exploration.

            // Unrolling here is ~10% improvement. We can then do one bounds check every 2 blocks
            // instead of one which adds up.
            const Block = @Vector(block_len, Char);
            if (i + 2 * block_len < slice.len) {
                const mask: Block = @splat(value);
                while (true) {
                    inline for (0..2) |_| {
                        const block: Block = slice[i..][0..block_len].*;
                        // A lane is true where the byte differs from `value`.
                        const matches = block != mask;
                        if (@reduce(.Or, matches)) {
                            return i + std.simd.firstTrue(matches).?;
                        }
                        i += block_len;
                    }
                    // Stop once fewer than two full blocks (plus one byte) remain.
                    if (i + 2 * block_len >= slice.len) break;
                }
            }

            // {block_len, block_len / 2} check
            // Try one full-width vector, then one half-width vector, on what is left.
            inline for (0..2) |j| {
                const block_x_len = block_len / (1 << j);
                // Vectors narrower than 4 lanes are not used; the scalar loop takes over.
                comptime if (block_x_len < 4) break;

                const BlockX = @Vector(block_x_len, Char);
                if (i + block_x_len < slice.len) {
                    const mask: BlockX = @splat(value);
                    const block: BlockX = slice[i..][0..block_x_len].*;
                    const matches = block != mask;
                    if (@reduce(.Or, matches)) {
                        return i + std.simd.firstTrue(matches).?;
                    }
                    i += block_x_len;
                }
            }
        }
    }

    // Scalar tail; also the whole search when the vector path is disabled.
    for (slice[i..], i..) |c, j| {
        if (c != value) return j;
    }
    return null;
}
     91 
     92 pub fn indexOfNone(s: Str, cs: Charset) ?usize {
     93     return mem.indexOfNone(u8, s, cs);
     94 }
     95 
     96 pub fn lastIndexOfNone(s: Str, cs: Charset) ?usize {
     97     return mem.lastIndexOfNone(u8, s, cs);
     98 }
     99 
/// Tells `escapeString` whether it may take its padded tail path.
/// With `.padded` that path loads a full vector through `slice.ptr`, which can
/// read bytes located after `slice.len`; `.not_padded` never does so.
pub const PaddingOption = enum { padded, not_padded };
    101 
    102 pub fn escapeStringForDoubleQuotedString(
    103     writer: anytype,
    104     slice: []const u8,
    105     comptime has_padding: PaddingOption,
    106 ) !void {
    107     return escapeString(writer, slice, has_padding, .double_quoted_string);
    108 }
    109 
    110 pub fn escapeStringForSingleQuotedString(
    111     writer: anytype,
    112     slice: []const u8,
    113     comptime has_padding: PaddingOption,
    114 ) !void {
    115     return escapeString(writer, slice, has_padding, .double_quoted_string);
    116 }
    117 
    118 pub fn fmtEscapes(
    119     bytes: []const u8,
    120     comptime has_padding: PaddingOption,
    121 ) std.fmt.Formatter(if (has_padding == .padded)
    122     stringEscapeFormatterWithPadding
    123 else
    124     stringEscapeFormatterWithoutPadding) {
    125     return .{ .data = bytes };
    126 }
    127 
    128 pub fn stringEscapeFormatterWithPadding(
    129     bytes: []const u8,
    130     comptime f: []const u8,
    131     options: std.fmt.FormatOptions,
    132     writer: anytype,
    133 ) !void {
    134     _ = options;
    135     if (f.len == 1 and f[0] == '\'') {
    136         try escapeString(writer, bytes, .padded, .single_quoted_string);
    137     } else {
    138         try escapeString(writer, bytes, .padded, .double_quoted_string);
    139     }
    140 }
    141 
    142 pub fn stringEscapeFormatterWithoutPadding(
    143     bytes: []const u8,
    144     comptime f: []const u8,
    145     options: std.fmt.FormatOptions,
    146     writer: anytype,
    147 ) !void {
    148     _ = options;
    149     if (f.len == 1 and f[0] == '\'') {
    150         try escapeString(writer, bytes, .not_padded, .single_quoted_string);
    151     } else {
    152         try escapeString(writer, bytes, .not_padded, .double_quoted_string);
    153     }
    154 }
    155 
    156 pub fn escapeString(
    157     writer: anytype,
    158     slice: []const u8,
    159     comptime has_padding: PaddingOption,
    160     comptime escape_for: enum { double_quoted_string, single_quoted_string },
    161 ) !void {
    162     const tracy_frame = tracy.trace(@src());
    163     defer tracy_frame.end();
    164     tracy_frame.addText(slice);
    165 
    166     const quote = switch (escape_for) {
    167         .double_quoted_string => '"',
    168         .single_quoted_string => '\'',
    169     };
    170     const escaped_quote = switch (escape_for) {
    171         .double_quoted_string => "\\\"",
    172         .single_quoted_string => "\\'",
    173     };
    174 
    175     var i: usize = 0;
    176     if (switch (@import("builtin").zig_backend) {
    177         .stage2_llvm, .stage2_c => true,
    178         else => false,
    179     } and
    180         !std.debug.inValgrind() and // https://github.com/ziglang/zig/issues/17717
    181         !@inComptime())
    182     {
    183         if (std.simd.suggestVectorLength(u8)) |block_len| {
    184             const Block = @Vector(block_len, u8);
    185             while (i + 2 * block_len < slice.len) {
    186                 inline for (0..2) |_| {
    187                     const load: Block = slice[i .. i + block_len][0..block_len].*;
    188 
    189                     // NOTE: LLVM can auto optimize this, but we should check again
    190                     //       when x86 backend supports @Vector.
    191                     // const has_ctrl = load - @as(Block, @splat(0x20)) >= @as(Block, @splat(0x7f - 0x20));
    192                     const has_low_ctrl = load < @as(Block, @splat(0x20));
    193                     const has_high_ctrl = load >= @as(Block, @splat(0x7f));
    194 
    195                     // already in ctrl range!
    196                     // const has_tab = load == @as(Block, @splat('\t'));
    197                     // const has_cr = load == @as(Block, @splat('\r'));
    198                     // const has_lf = load == @as(Block, @splat('\n'));
    199 
    200                     // not worth it to merge these into the ctrl range,
    201                     // because doing so will include the space character,
    202                     // which is super common. space can be avoided by xor'ing with 0x07,
    203                     // but unfortunately that moves DEL from 0x7f to 0x78,
    204                     // which means that now has to be separately checked,
    205                     // resulting in the same number of instructions as before...
    206                     const has_quote = load == @as(Block, @splat(quote));
    207                     const has_backslash = load == @as(Block, @splat('\\'));
    208 
    209                     // If any character is escaped, do slow path
    210                     if (@reduce(.Or, has_low_ctrl) or
    211                         @reduce(.Or, has_high_ctrl) or
    212                         // @reduce(.Or, has_tab) or
    213                         // @reduce(.Or, has_cr) or
    214                         // @reduce(.Or, has_lf) or
    215                         @reduce(.Or, has_quote) or
    216                         @reduce(.Or, has_backslash))
    217                     {
    218                         // uncomment if you want to inspect the assembly, not that it helps much
    219                         // @branchHint(.cold);
    220                         // adapted from std.zig.stringEscape
    221                         for (slice[i .. i + block_len]) |byte| {
    222                             switch (byte) {
    223                                 '\t' => try writer.writeAll("\\t"),
    224                                 '\r' => try writer.writeAll("\\r"),
    225                                 '\n' => try writer.writeAll("\\n"),
    226                                 quote => try writer.writeAll(escaped_quote),
    227                                 '\\' => try writer.writeAll("\\\\"),
    228                                 else => if (byte < 0x20 or byte >= 0x7f) {
    229                                     try writer.writeAll("\\x");
    230                                     try std.fmt.formatInt(
    231                                         byte,
    232                                         16,
    233                                         .lower,
    234                                         .{ .width = 2, .fill = '0' },
    235                                         writer,
    236                                     );
    237                                 } else try writer.writeByte(byte),
    238                             }
    239                         }
    240                     } else {
    241                         try writer.writeAll(slice[i .. i + block_len]);
    242                     }
    243 
    244                     i += block_len;
    245                 }
    246             }
    247 
    248             if (has_padding == .padded) {
    249                 if (i == slice.len) return;
    250                 if (slice.len - i >= block_len) {
    251                     const load: Block = slice[i .. i + block_len][0..block_len].*;
    252 
    253                     const has_low_ctrl = load < @as(Block, @splat(0x20));
    254                     const has_high_ctrl = load >= @as(Block, @splat(0x7f));
    255                     const has_quote = load == @as(Block, @splat(quote));
    256                     const has_backslash = load == @as(Block, @splat('\\'));
    257 
    258                     // If any character is escaped, do slow path
    259                     if (@reduce(.Or, has_low_ctrl) or
    260                         @reduce(.Or, has_high_ctrl) or
    261                         @reduce(.Or, has_quote) or
    262                         @reduce(.Or, has_backslash))
    263                     {
    264                         // uncomment if you want to inspect the assembly, not that it helps much
    265                         // @branchHint(.cold);
    266                         // adapted from std.zig.stringEscape
    267                         for (slice[i .. i + block_len]) |byte| {
    268                             switch (byte) {
    269                                 '\t' => try writer.writeAll("\\t"),
    270                                 '\r' => try writer.writeAll("\\r"),
    271                                 '\n' => try writer.writeAll("\\n"),
    272                                 quote => try writer.writeAll(escaped_quote),
    273                                 '\\' => try writer.writeAll("\\\\"),
    274                                 else => if (byte < 0x20 or byte >= 0x7f) {
    275                                     try writer.writeAll("\\x");
    276                                     try std.fmt.formatInt(
    277                                         byte,
    278                                         16,
    279                                         .lower,
    280                                         .{ .width = 2, .fill = '0' },
    281                                         writer,
    282                                     );
    283                                 } else try writer.writeByte(byte),
    284                             }
    285                         }
    286                     } else {
    287                         try writer.writeAll(slice[i .. i + block_len]);
    288                     }
    289 
    290                     i += block_len;
    291                 }
    292 
    293                 if (i == slice.len) return;
    294 
    295                 {
    296                     const load_masks: [block_len]Block = comptime blk: {
    297                         var masks: []const Block = &.{};
    298                         var mask: [block_len]u8 = @splat(0x00);
    299                         for (0..block_len) |mask_i| {
    300                             mask[mask_i] = 0xff;
    301                             masks = masks ++ .{@as(Block, mask)};
    302                         }
    303                         break :blk masks[0..block_len].*;
    304                     };
    305 
    306                     const load: Block =
    307                         (slice.ptr[i .. i + block_len][0..block_len].* -%
    308                             @as(Block, @splat(0x20))) &
    309                         load_masks[slice.len - 1 - i];
    310 
    311                     const has_ctrl = load >= @as(Block, @splat(0x7f - 0x20));
    312                     const has_quote = load == @as(Block, @splat(quote - 0x20));
    313                     const has_backslash = load == @as(Block, @splat('\\' - 0x20));
    314 
    315                     // If any character is escaped, do slow path
    316                     if (@reduce(.Or, has_ctrl) or
    317                         @reduce(.Or, has_quote) or
    318                         @reduce(.Or, has_backslash))
    319                     {
    320                         // Fall through to the non-simd for loop
    321                     } else {
    322                         // Fast path, just write the thing
    323                         try writer.writeAll(slice[i..]);
    324                         return;
    325                     }
    326                 }
    327             }
    328         }
    329     }
    330 
    331     for (slice[i..]) |byte| {
    332         // uncomment if you want to inspect the assembly, not that it helps much
    333         // @branchHint(.cold);
    334         // adapted from std.zig.stringEscape
    335         switch (byte) {
    336             '\t' => try writer.writeAll("\\t"),
    337             '\r' => try writer.writeAll("\\r"),
    338             '\n' => try writer.writeAll("\\n"),
    339             quote => try writer.writeAll(escaped_quote),
    340             '\\' => try writer.writeAll("\\\\"),
    341             else => if (byte < 0x20 or byte >= 0x7f) {
    342                 try writer.writeAll("\\x");
    343                 try std.fmt.formatInt(
    344                     byte,
    345                     16,
    346                     .lower,
    347                     .{ .width = 2, .fill = '0' },
    348                     writer,
    349                 );
    350             } else try writer.writeByte(byte),
    351         }
    352     }
    353 }