str.zig (14100B)
//! Utils for "strings", []u8 or []const u8 slices
//!
//! The only purpose of this file is to reduce typing.
//! `std.mem.indexOfScalar(u8, s, c)` is very long, and
//! this file lets you just type `str.indexOfChar(s, c)`.
//!
//! If I need any functionality I will also just put it here,
//! so this file may have functions not in `std.mem`.
const std = @import("std");
const mem = std.mem;

const tracy = @import("tracy");

/// A single byte of a string.
pub const Char = u8;
/// A read-only byte string.
pub const Str = []const u8;
/// A set of bytes, stored as a plain slice and searched linearly.
pub const Charset = []const u8;

/// Returns true when `c` occurs anywhere in `cs`.
pub fn isAnyOf(c: Char, cs: Charset) bool {
    return indexOfChar(cs, c) != null;
}

/// Returns true when `c` does not occur in `cs`; the negation of `isAnyOf`.
pub fn isNoneOf(c: Char, cs: Charset) bool {
    return !isAnyOf(c, cs);
}

/// Shorthand for `std.mem.indexOfScalar(u8, s, c)`.
pub fn indexOfChar(s: Str, c: Char) ?usize {
    return mem.indexOfScalar(u8, s, c);
}

/// Returns the index of the first byte in `slice` that differs from `value`,
/// or null when every byte equals `value` (which includes the empty slice).
///
/// The vectorized part is only taken on the LLVM and C backends, outside of
/// Valgrind and outside of comptime evaluation; everything it does not consume
/// is handled by the scalar loop at the end.
pub fn indexOfNotChar(slice: Str, value: Char) ?usize {
    var i: usize = 0;
    if (switch (@import("builtin").zig_backend) {
        .stage2_llvm, .stage2_c => true,
        else => false,
    } and
        !std.debug.inValgrind() and // https://github.com/ziglang/zig/issues/17717
        !@inComptime())
    {
        if (std.simd.suggestVectorLength(Char)) |block_len| {
            // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
            // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
            //
            // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
            // however this usually isn't necessary unless your arch has a performance penalty due to this.
            //
            // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
            // line so explicit alignment prologues may be worth exploration.

            // Unrolling here is ~10% improvement. We can then do one bounds check every 2 blocks
            // instead of one which adds up.
            const Block = @Vector(block_len, Char);
            if (i + 2 * block_len < slice.len) {
                const mask: Block = @splat(value);
                while (true) {
                    inline for (0..2) |_| {
                        const block: Block = slice[i..][0..block_len].*;
                        // A lane is true where the byte is NOT `value`.
                        const matches = block != mask;
                        if (@reduce(.Or, matches)) {
                            return i + std.simd.firstTrue(matches).?;
                        }
                        i += block_len;
                    }
                    if (i + 2 * block_len >= slice.len) break;
                }
            }

            // {block_len, block_len / 2} check
            inline for (0..2) |j| {
                const block_x_len = block_len / (1 << j);
                comptime if (block_x_len < 4) break;

                const BlockX = @Vector(block_x_len, Char);
                if (i + block_x_len < slice.len) {
                    const mask: BlockX = @splat(value);
                    const block: BlockX = slice[i..][0..block_x_len].*;
                    const matches = block != mask;
                    if (@reduce(.Or, matches)) {
                        return i + std.simd.firstTrue(matches).?;
                    }
                    i += block_x_len;
                }
            }
        }
    }

    // Scalar tail (and the whole input when the vector path is disabled).
    for (slice[i..], i..) |c, j| {
        if (c != value) return j;
    }
    return null;
}

/// Shorthand for `std.mem.indexOfNone(u8, s, cs)`.
pub fn indexOfNone(s: Str, cs: Charset) ?usize {
    return mem.indexOfNone(u8, s, cs);
}

/// Shorthand for `std.mem.lastIndexOfNone(u8, s, cs)`.
pub fn lastIndexOfNone(s: Str, cs: Charset) ?usize {
    return mem.lastIndexOfNone(u8, s, cs);
}

/// Tells `escapeString` whether it may read past the end of the slice.
///
/// With `.padded`, `escapeString` loads one full vector block that starts
/// inside the slice's tail and extends beyond `slice.len`, so the caller must
/// guarantee that memory is readable. With `.not_padded` the tail is processed
/// byte by byte and nothing past `slice.len` is touched.
pub const PaddingOption = enum { padded, not_padded };

/// Writes `slice` to `writer`, escaped for use inside a double-quoted
/// literal (`"` is written as `\"`).
pub fn escapeStringForDoubleQuotedString(
    writer: anytype,
    slice: []const u8,
    comptime has_padding: PaddingOption,
) !void {
    return escapeString(writer, slice, has_padding, .double_quoted_string);
}

/// Writes `slice` to `writer`, escaped for use inside a single-quoted
/// literal (`'` is written as `\'`).
pub fn escapeStringForSingleQuotedString(
    writer: anytype,
    slice: []const u8,
    comptime has_padding: PaddingOption,
) !void {
    // Must select the single-quote mode; this previously forwarded
    // `.double_quoted_string`, which left `'` unescaped and escaped `"`.
    return escapeString(writer, slice, has_padding, .single_quoted_string);
}

/// Wraps `bytes` in a `std.fmt.Formatter` that escapes them when printed.
///
/// `has_padding` picks between `stringEscapeFormatterWithPadding` and
/// `stringEscapeFormatterWithoutPadding`; see `PaddingOption` for what the
/// caller promises with `.padded`. The returned value only stores the slice,
/// so `bytes` must stay valid until formatting has happened.
pub fn fmtEscapes(
    bytes: []const u8,
    comptime has_padding: PaddingOption,
) std.fmt.Formatter(if (has_padding == .padded)
    stringEscapeFormatterWithPadding
else
    stringEscapeFormatterWithoutPadding) {
    return .{ .data = bytes };
}
/// `std.fmt` format function behind `fmtEscapes(bytes, .padded)`.
///
/// When the format specifier `f` is exactly `'` the bytes are escaped for a
/// single-quoted literal, otherwise for a double-quoted one. `options` is
/// ignored. Escaping is done by `escapeString` in `.padded` mode, so memory
/// past the end of `bytes` may be read.
pub fn stringEscapeFormatterWithPadding(
    bytes: []const u8,
    comptime f: []const u8,
    options: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    _ = options;
    if (f.len == 1 and f[0] == '\'') {
        try escapeString(writer, bytes, .padded, .single_quoted_string);
    } else {
        try escapeString(writer, bytes, .padded, .double_quoted_string);
    }
}

/// `std.fmt` format function behind `fmtEscapes(bytes, .not_padded)`.
///
/// When the format specifier `f` is exactly `'` the bytes are escaped for a
/// single-quoted literal, otherwise for a double-quoted one. `options` is
/// ignored. Escaping is done by `escapeString` in `.not_padded` mode.
pub fn stringEscapeFormatterWithoutPadding(
    bytes: []const u8,
    comptime f: []const u8,
    options: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    _ = options;
    if (f.len == 1 and f[0] == '\'') {
        try escapeString(writer, bytes, .not_padded, .single_quoted_string);
    } else {
        try escapeString(writer, bytes, .not_padded, .double_quoted_string);
    }
}

/// Slow path: writes `bytes` one at a time, escaping where needed.
///
/// Tab, CR and LF become `\t`, `\r`, `\n`; `quote` becomes `escaped_quote`;
/// a backslash is doubled; any other byte below 0x20 or at/above 0x7f is
/// written as `\x` followed by two lowercase hex digits; everything else is
/// copied unchanged.
fn writeEscapedBytes(
    writer: anytype,
    bytes: []const u8,
    comptime quote: u8,
    comptime escaped_quote: []const u8,
) !void {
    // uncomment if you want to inspect the assembly, not that it helps much
    // @branchHint(.cold);
    // adapted from std.zig.stringEscape
    for (bytes) |byte| {
        switch (byte) {
            '\t' => try writer.writeAll("\\t"),
            '\r' => try writer.writeAll("\\r"),
            '\n' => try writer.writeAll("\\n"),
            quote => try writer.writeAll(escaped_quote),
            '\\' => try writer.writeAll("\\\\"),
            else => if (byte < 0x20 or byte >= 0x7f) {
                try writer.writeAll("\\x");
                try std.fmt.formatInt(
                    byte,
                    16,
                    .lower,
                    .{ .width = 2, .fill = '0' },
                    writer,
                );
            } else try writer.writeByte(byte),
        }
    }
}

/// Escapes exactly one in-bounds vector block.
///
/// Classifies all `block_len` bytes at once; if none needs escaping the block
/// is written verbatim, otherwise it is handed to `writeEscapedBytes`.
inline fn writeEscapedBlock(
    writer: anytype,
    comptime block_len: comptime_int,
    bytes: *const [block_len]u8,
    comptime quote: u8,
    comptime escaped_quote: []const u8,
) !void {
    const Block = @Vector(block_len, u8);
    const load: Block = bytes.*;

    // NOTE: LLVM can auto optimize this, but we should check again
    // when x86 backend supports @Vector.
    // const has_ctrl = load - @as(Block, @splat(0x20)) >= @as(Block, @splat(0x7f - 0x20));
    const has_low_ctrl = load < @as(Block, @splat(0x20));
    const has_high_ctrl = load >= @as(Block, @splat(0x7f));

    // Tab, CR and LF are already inside the low control range, so they need
    // no separate comparison here.

    // not worth it to merge these into the ctrl range,
    // because doing so will include the space character,
    // which is super common. space can be avoided by xor'ing with 0x07,
    // but unfortunately that moves DEL from 0x7f to 0x78,
    // which means that now has to be separately checked,
    // resulting in the same number of instructions as before...
    const has_quote = load == @as(Block, @splat(quote));
    const has_backslash = load == @as(Block, @splat('\\'));

    // If any character is escaped, do slow path
    if (@reduce(.Or, has_low_ctrl) or
        @reduce(.Or, has_high_ctrl) or
        @reduce(.Or, has_quote) or
        @reduce(.Or, has_backslash))
    {
        try writeEscapedBytes(writer, bytes, quote, escaped_quote);
    } else {
        try writer.writeAll(bytes);
    }
}

/// Writes `slice` to `writer` with escapes suitable for a quoted literal.
///
/// `escape_for` selects which quote character is escaped (`"` as `\"` or `'`
/// as `\'`); the other quote is written unchanged. Tab, CR, LF and backslash
/// get their usual two-character escapes, and every other byte below 0x20 or
/// at/above 0x7f is written as `\xNN` in lowercase hex.
///
/// On the LLVM and C backends (outside Valgrind and comptime) the input is
/// scanned one vector block at a time and blocks with nothing to escape are
/// written verbatim. With `has_padding == .padded` the final partial block is
/// also loaded as a full vector, which reads up to `block_len - 1` bytes past
/// `slice.len`; the caller must guarantee that memory is readable. Those extra
/// bytes are masked out and never influence the output.
///
/// Errors are whatever `writer` returns.
pub fn escapeString(
    writer: anytype,
    slice: []const u8,
    comptime has_padding: PaddingOption,
    comptime escape_for: enum { double_quoted_string, single_quoted_string },
) !void {
    const tracy_frame = tracy.trace(@src());
    defer tracy_frame.end();
    tracy_frame.addText(slice);

    const quote = switch (escape_for) {
        .double_quoted_string => '"',
        .single_quoted_string => '\'',
    };
    const escaped_quote = switch (escape_for) {
        .double_quoted_string => "\\\"",
        .single_quoted_string => "\\'",
    };

    var i: usize = 0;
    if (switch (@import("builtin").zig_backend) {
        .stage2_llvm, .stage2_c => true,
        else => false,
    } and
        !std.debug.inValgrind() and // https://github.com/ziglang/zig/issues/17717
        !@inComptime())
    {
        if (std.simd.suggestVectorLength(u8)) |block_len| {
            const Block = @Vector(block_len, u8);

            // Main loop: two blocks per bounds check. It stops while at most
            // 2 * block_len bytes remain.
            while (i + 2 * block_len < slice.len) {
                inline for (0..2) |_| {
                    try writeEscapedBlock(
                        writer,
                        block_len,
                        slice[i..][0..block_len],
                        quote,
                        escaped_quote,
                    );
                    i += block_len;
                }
            }

            if (has_padding == .padded) {
                if (i == slice.len) return;

                // One more whole block if it fits; afterwards between 0 and
                // block_len bytes are left.
                if (slice.len - i >= block_len) {
                    try writeEscapedBlock(
                        writer,
                        block_len,
                        slice[i..][0..block_len],
                        quote,
                        escaped_quote,
                    );
                    i += block_len;
                }

                if (i == slice.len) return;

                {
                    // load_masks[k] keeps the first k + 1 lanes and zeroes the rest.
                    const load_masks: [block_len]Block = comptime blk: {
                        var masks: []const Block = &.{};
                        var mask: [block_len]u8 = @splat(0x00);
                        for (0..block_len) |mask_i| {
                            mask[mask_i] = 0xff;
                            masks = masks ++ .{@as(Block, mask)};
                        }
                        break :blk masks[0..block_len].*;
                    };

                    // Reads past `slice.len` (allowed by `.padded`). Every byte is
                    // rebased by -0x20 so one unsigned compare covers both control
                    // ranges; masked-out lanes become 0, which is the rebased
                    // space character and never triggers an escape.
                    const load: Block =
                        (slice.ptr[i .. i + block_len][0..block_len].* -%
                            @as(Block, @splat(0x20))) &
                        load_masks[slice.len - 1 - i];

                    const has_ctrl = load >= @as(Block, @splat(0x7f - 0x20));
                    const has_quote = load == @as(Block, @splat(quote - 0x20));
                    const has_backslash = load == @as(Block, @splat('\\' - 0x20));

                    const tail_needs_escape = @reduce(.Or, has_ctrl) or
                        @reduce(.Or, has_quote) or
                        @reduce(.Or, has_backslash);

                    if (!tail_needs_escape) {
                        // Fast path, just write the thing
                        try writer.writeAll(slice[i..]);
                        return;
                    }
                    // Otherwise fall through to the byte-wise loop below.
                }
            }
        }
    }

    // Whatever the vector code did not consume (all of `slice` when it is disabled).
    try writeEscapedBytes(writer, slice[i..], quote, escaped_quote);
}