padded_str_impl.zig (11636B)
1 const std = @import("std"); 2 const assert = std.debug.assert; 3 const panic = std.debug.panic; 4 5 /// A slice pointer with a "many sentinel": 6 /// not only must there be a newline at the end, 7 /// the next PADDING - 1 bytes must also be readable. 8 pub fn PaddedSlice(comptime PADDING_: usize) type { 9 return struct { 10 /// How many '\n' chars exists past the end of line 11 pub const PADDING = PADDING_; 12 ptr: PaddedMany(PADDING), 13 len: usize, 14 15 pub fn from(comptime str_lit: []const u8) @This() { 16 return .{ 17 .ptr = .from(str_lit), 18 .len = str_lit.len, 19 }; 20 } 21 22 pub fn toUnpaddedSlice(self: @This()) [:'\n']const u8 { 23 return self.ptr._ptr[0..self.len :'\n']; 24 } 25 26 pub fn index(self: @This(), idx: usize) u8 { 27 assert(idx < self.len); 28 return self.ptr.index(idx); 29 } 30 31 pub fn sliceOpen(self: @This(), start: usize) @This() { 32 assert(start <= self.len); 33 return .{ 34 .ptr = self.ptr.sliceOpen(start), 35 .len = self.len - start, 36 }; 37 } 38 39 pub fn sliceRange(self: @This(), start: usize, end: usize) @This() { 40 assert(start <= end); 41 assert(end <= self.len); 42 return self.ptr.sliceRange(start, end); 43 } 44 45 pub fn pad(gpa: std.mem.Allocator, input: []const u8) error{OutOfMemory}![]const u8 { 46 const padded_input = try gpa.alloc(u8, input.len + PADDING); 47 @memcpy(padded_input[0..input.len], input); 48 padded_input[input.len] = '\n'; 49 return padded_input; 50 } 51 52 pub fn init(padded_input: []const u8) error{InputNotPadded}!@This() { 53 if (padded_input.len < PADDING or padded_input[padded_input.len - PADDING] != '\n') 54 return error.InputNotPadded; 55 const actual_input = padded_input[0 .. padded_input.len - PADDING :'\n']; 56 return .{ 57 .ptr = .{ ._ptr = actual_input.ptr }, 58 .len = actual_input.len, 59 }; 60 } 61 62 pub fn indexOfNotSpaceOrTab(self: @This()) usize { 63 return self.ptr.indexOfNotSpaceOrTab(); 64 } 65 66 pub fn indexOfNotSpace(self: @This()) usize { 67 return self.ptr.indexOfNotSpace(); 68 } 69 70 /// Splits the first line from the string, returning a tuple of two padded slices. 71 /// The first padded slice is `null` if there are no lines left. 72 pub fn splitOneLine(self: @This()) struct { ?@This(), @This() } { 73 if (self.len == 0) return .{ null, self }; 74 const line_len = self.ptr.indexOfNewline(); 75 return .{ 76 .{ 77 .ptr = .{ 78 ._ptr = self.ptr._ptr, 79 }, 80 .len = line_len, 81 }, 82 .{ 83 .ptr = .{ 84 ._ptr = self.ptr._ptr + line_len + 1, 85 }, 86 .len = self.len - line_len - 1, 87 }, 88 }; 89 } 90 91 pub fn indexOfNone(self: @This(), comptime charset: []const u8) usize { 92 return self.ptr.indexOfNone(charset); 93 } 94 95 pub fn indexOfAny(self: @This(), comptime charset: []const u8) usize { 96 return self.ptr.indexOfAny(charset); 97 } 98 99 pub fn indexOfNotCharUnsafe(self: @This(), char: u8) usize { 100 return self.ptr.indexOfNotCharUnsafe(char); 101 } 102 }; 103 } 104 105 /// A many pointer with a "many sentinel": 106 /// not only must there be a newline at the end, 107 /// the next PADDING - 1 bytes must also be readable. 108 pub fn PaddedMany(comptime PADDING_: usize) type { 109 return struct { 110 /// How many '\n' chars exists past the end of line 111 pub const PADDING = PADDING_; 112 _ptr: [*:'\n']const u8, 113 114 pub fn index(self: @This(), idx: usize) u8 { 115 return self._ptr[idx]; 116 } 117 118 pub fn sliceOpen(ptr: @This(), start: usize) @This() { 119 return .{ ._ptr = ptr._ptr[start..] }; 120 } 121 122 pub fn sliceRange(self: @This(), start: usize, end: usize) PaddedSlice(PADDING) { 123 return .{ 124 .ptr = self.sliceOpen(start), 125 .len = end - start, 126 }; 127 } 128 129 pub fn calcOffsetTo(ptr: @This(), other: @This()) usize { 130 return other._ptr - ptr._ptr; 131 } 132 133 const SimdOptions = struct { 134 /// Whether to disable SIMD entirely 135 disable: bool = false, 136 /// How many loads to do in parallel in 1 loop iteration. 137 /// Consider increasing if your input distribution contains long strings. 138 /// Note that this may not have an effect if PADDING is not large enough. 139 max_unroll: ?usize = null, 140 /// How many bytes to read in 1 loop iteration. 141 /// Gives more fine grained control than max_unroll. 142 max_block_len: ?usize = null, 143 }; 144 145 fn simd_maybe(comptime options: SimdOptions) ?type { 146 if (switch (@import("builtin").zig_backend) { 147 .stage2_llvm, .stage2_c => true, 148 else => false, 149 } and !std.debug.inValgrind() // https://github.com/ziglang/zig/issues/17717 150 and !options.disable) { 151 if (std.simd.suggestVectorLength(u8)) |BLOCK_LEN_| { 152 return struct { 153 const BLOCK_LEN__: comptime_int = @min(BLOCK_LEN_, PADDING); 154 const BLOCK_LEN = @min( 155 options.max_block_len orelse PADDING, 156 (options.max_unroll orelse 1) * BLOCK_LEN__, 157 @divFloor(PADDING, BLOCK_LEN__) * BLOCK_LEN__, 158 ); 159 const CharBlock = @Vector(BLOCK_LEN, u8); 160 const BitBlock = @Vector(BLOCK_LEN, u1); 161 const BoolBlock = @Vector(BLOCK_LEN, bool); 162 const IntBlock = @Type(.{ .int = .{ 163 .signedness = .unsigned, 164 .bits = BLOCK_LEN, 165 } }); 166 }; 167 } 168 } 169 return null; 170 } 171 172 pub fn from(comptime str_lit: []const u8) @This() { 173 return .{ ._ptr = (str_lit ++ ("\n" ** PADDING))[0..str_lit.len :'\n'] }; 174 } 175 176 /// charset should be as small as possible -- one vector `cmpeq` and `or` 177 /// is emitted for each char in the charset. if you want range queries, 178 /// copy paste this and write your own custom impl. 179 pub fn indexOfNone(ptr: @This(), comptime charset: []const u8) usize { 180 if (comptime std.mem.indexOfScalar(u8, charset, '\n') != null) 181 @compileError("charset must not contain newline, otherwise this op is potentially unsafe."); 182 return ptr.indexOfNoneUnsafe( 183 charset, 184 .{ .disable = true }, 185 ); 186 } 187 188 pub fn indexOfNoneUnsafe( 189 ptr: @This(), 190 comptime charset: []const u8, 191 comptime opts: SimdOptions, 192 ) usize { 193 var it = ptr._ptr; 194 if (!@inComptime()) { 195 if (simd_maybe(opts)) |simd| { 196 const splats: [charset.len]simd.CharBlock = comptime blk: { 197 var splats_: []const simd.CharBlock = &.{}; 198 for (charset) |c| { 199 splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)}; 200 } 201 break :blk splats_[0..charset.len].*; 202 }; 203 while (true) { 204 const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*; 205 var unmatches_bits = ~@as(simd.BitBlock, @splat(0)); 206 for (splats) |splat| { 207 unmatches_bits &= @bitCast(block != splat); 208 } 209 const unmatches_int: simd.IntBlock = @bitCast(unmatches_bits); 210 if (unmatches_int != 0) { 211 return it - ptr._ptr + @ctz(unmatches_int); 212 } 213 it += simd.BLOCK_LEN; 214 } 215 } 216 } 217 218 while (true) { 219 inline for (charset) |c| { 220 if (it[0] == c) break; 221 } else return it - ptr._ptr; 222 it += 1; 223 } 224 } 225 226 /// Non-SIMD implementation here because all current callers expect this to be a short string. 227 /// Must not be used with newline as that can be unsafe. Use std.mem.indexOfNone for that. 228 pub fn indexOfNotCharUnsafe(ptr: @This(), c: u8) usize { 229 if (c == '\n') panic("Unsafe use of indexOfNotCharUnsafe, {c} must not be newline", .{c}); 230 var it = ptr._ptr; 231 while (true) { 232 if (it[0] != c) return it - ptr._ptr; 233 it += 1; 234 } 235 } 236 237 /// charset should be as small as possible -- one vector `cmpeq` and `or` 238 /// is emitted for each char in the charset. if you want range queries, 239 /// copy paste this and write your own custom impl. 240 pub fn indexOfAny(ptr: @This(), comptime charset: []const u8) usize { 241 if (comptime std.mem.indexOfScalar(u8, charset, '\n') == null) 242 @compileError("charset must contain newline, otherwise this op is potentially unsafe."); 243 return ptr.indexOfAnyUnsafe( 244 charset, 245 .{ .max_unroll = 2 }, 246 ); 247 } 248 249 pub fn indexOfAnyUnsafe( 250 ptr: @This(), 251 comptime charset: []const u8, 252 comptime opts: SimdOptions, 253 ) usize { 254 var it = ptr._ptr; 255 if (!@inComptime()) { 256 if (simd_maybe(opts)) |simd| { 257 const splats: [charset.len]simd.CharBlock = comptime blk: { 258 var splats_: []const simd.CharBlock = &.{}; 259 for (charset) |c| { 260 splats_ = splats_ ++ &[1]simd.CharBlock{@splat(c)}; 261 } 262 break :blk splats_[0..charset.len].*; 263 }; 264 while (true) { 265 const block: simd.CharBlock = it[0..simd.BLOCK_LEN].*; 266 var matches_bits = @as(simd.BitBlock, @splat(0)); 267 for (splats) |splat| { 268 matches_bits |= @bitCast(block == splat); 269 } 270 const matches_int: simd.IntBlock = @bitCast(matches_bits); 271 if (matches_int != 0) { 272 return it - ptr._ptr + @ctz(matches_int); 273 } 274 it += simd.BLOCK_LEN; 275 } 276 } 277 } 278 279 while (true) { 280 inline for (charset) |c| { 281 if (it[0] == c) return it - ptr._ptr; 282 } 283 it += 1; 284 } 285 } 286 287 pub fn indexOfNotSpaceOrTab(ptr: @This()) usize { 288 return ptr.indexOfNone(" \t"); 289 } 290 291 pub fn indexOfNotSpace(ptr: @This()) usize { 292 return ptr.indexOfNotCharUnsafe(' '); 293 } 294 295 pub fn indexOfNewline(ptr: @This()) usize { 296 return ptr.indexOfAny("\n"); 297 } 298 }; 299 }