// faster-utf8-validator
//
// Copyright (c) 2019 Zach Wegner
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
// How this validator works:
//
// [[[ UTF-8 refresher: UTF-8 encodes text in sequences of "code points",
// each one from 1-4 bytes. For each code point that is longer than one byte,
// the code point begins with a unique prefix that specifies how many bytes
// follow. All bytes in the code point after this first have a continuation
// marker. All code points in UTF-8 will thus look like one of the following
// binary sequences, with x meaning "don't care":
// 1 byte: 0xxxxxxx
// 2 bytes: 110xxxxx 10xxxxxx
// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// ]]]
//
// This validator works in two basic steps: checking continuation bytes, and
// handling special cases. Each step works on one vector's worth of input
// bytes at a time.
//
// The continuation bytes are handled in a fairly straightforward manner in
// the scalar domain. A mask is created from the input byte vector for each
// of the highest four bits of every byte. The first mask allows us to quickly
// skip pure ASCII input vectors, which have no bits set. The first and
// (inverted) second masks together give us every continuation byte (10xxxxxx).
// The other masks are used to find prefixes of multi-byte code points (110,
// 1110, 11110). For these, we keep a "required continuation" mask, by shifting
// these masks 1, 2, and 3 bits respectively forward in the byte stream. That
// is, we take a mask of all bytes that start with 11, and shift it left one
// bit forward to get the mask of all the first continuation bytes, then do the
// same for the second and third continuation bytes. Here's an example input
// sequence along with the corresponding masks:
//
// bytes:         61 C3 80 62 E0 A0 80 63 F0 90 80 80 00
// code points:   61|C3 80|62|E0 A0 80|63|F0 90 80 80|00
// # of bytes:    1 |2  - |1 |3  -  - |1 |4  -  -  - |1
// cont. mask 1:  -  -  1  -  -  1  -  -  -  1  -  -  -
// cont. mask 2:  -  -  -  -  -  -  1  -  -  -  1  -  -
// cont. mask 3:  -  -  -  -  -  -  -  -  -  -  -  1  -
// cont. mask *:  0  0  1  0  0  1  1  0  0  1  1  1  0
//
// The final required continuation mask is then compared with the mask of
// actual continuation bytes, and must match exactly in valid UTF-8. The only
// complication in this step is that the shifted masks can cross vector
// boundaries, so we need to keep a "carry" mask of the bits that were shifted
// past the boundary in the last loop iteration.
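// For example, if the last byte of a vector is the 4-byte leader F0, the three
// bits for its required continuation bytes get shifted past the end of the
// current vector's mask; they are kept in the wider "carry" mask and checked
// against the first three bytes of the next vector.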
//
// Besides the basic prefix coding of UTF-8, there are several invalid byte
// sequences that need special handling. These are due to three factors:
// code points that could be described in fewer bytes, code points that are
// part of a surrogate pair (which are only valid in UTF-16), and code points
// that are past the highest valid code point U+10FFFF.
//
// All of the invalid sequences can be detected by independently observing
// the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte
// lookup in parallel for all 32 bytes in a vector, we can create bit masks
// for all of these error conditions, look up the bit masks for the three
// nibbles for all input bytes, and AND them together to get a final error
// mask that must be all zero for valid UTF-8. This is somewhat complicated by
// needing to shift the error masks from the first and second nibbles forward in
// the byte stream to line up with the third nibble.
//
// We have these possible values for valid UTF-8 sequences, broken down
// by the first three nibbles:
//
//   1st    2nd    3rd   comment
//   0..7   0..F         ASCII
//   8..B   0..F         continuation bytes
//   C      2..F   8..B  C0 xx and C1 xx can be encoded in 1 byte
//   D      0..F   8..B  D0..DF are valid with a continuation byte
//   E      0      A..B  E0 8x and E0 9x can be encoded with 2 bytes
//          1..C   8..B  E1..EC are valid with continuation bytes
//          D      8..9  ED Ax and ED Bx correspond to surrogate pairs
//          E..F   8..B  EE..EF are valid with continuation bytes
//   F      0      9..B  F0 8x can be encoded with 3 bytes
//          1..3   8..B  F1..F3 are valid with continuation bytes
//          4      8     F4 8F BF BF is the maximum valid code point
//
// That leaves us with these invalid sequences, which would otherwise fit
// into UTF-8's prefix encoding. Each of these invalid sequences needs to
// be detected separately, with their own bits in the error mask.
//
//   1st    2nd    3rd    error bit
//   C      0..1   0..F   0x01
//   E      0      8..9   0x02
//          D      A..B   0x04
//   F      0      0..8   0x08
//          4      9..F   0x10
//          5..F   0..F   0x20
//
// For every possible value of the first, second, and third nibbles, we keep
// a lookup table that contains the bitwise OR of all errors that that nibble
// value can cause. For example, the first nibble has zeroes in every entry
// except for C, E, and F, and the third nibble lookup has the 0x21 bits in
// every entry, since those errors don't depend on the third nibble. After
// doing a parallel lookup of the first/second/third nibble values for all
// bytes, we AND them together. Only when all three have an error bit in common
// do we fail validation.
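//
// As a concrete example, take the surrogate encoding ED A0 80. At the byte
// position holding A0, the shifted vector holds the previous byte ED, so the
// three lookups see the nibbles E, D, and A:
//
//   error_1[0xE] = 0x06   (0x02 | 0x04: the two errors with first nibble E)
//   error_2[0xD] = 0x24   (0x04 | 0x20: ED surrogates, and F followed by 5..F)
//   error_3[0xA] = 0x35   (0x01 | 0x04 | 0x10 | 0x20)
//
// ANDing these together leaves 0x04, the surrogate error bit, so validation
// fails.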
#if defined(AVX2)
// AVX2 definitions
# define z_validate_utf8 z_validate_utf8_avx2
# define z_validate_vec z_validate_vec_avx2
# define V_LEN (32)
// Vector and vector mask types. We use #defines instead of typedefs so this
// header can be included multiple times with different configurations
# define vec_t __m256i
# define vmask_t uint32_t
# define vmask2_t uint64_t
# define v_load(x) _mm256_loadu_si256((vec_t *)(x))
# define v_set1 _mm256_set1_epi8
# define v_and _mm256_and_si256
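// Test bit "bit" (always 4..7 as used below) of every byte: shifting left by
// 7 - bit moves that bit into each byte's sign bit (the shift count is at most
// 3, so no bits cross into a neighboring byte within the 16-bit shift lanes),
// and movemask then gathers the sign bits into one mask bit per input byte.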
# define v_test_bit(input, bit) \
_mm256_movemask_epi8(_mm256_slli_epi16((input), 7 - (bit)))
// Parallel table lookup for all bytes in a vector. We need to AND with 0x0F
// for the lookup, because vpshufb has the neat "feature" that negative values
// in an index byte will result in a zero.
# define v_lookup(table, index, shift) \
_mm256_shuffle_epi8((table), \
v_and(_mm256_srli_epi16((index), (shift)), v_set1(0x0F)))
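// v_testz(a, b) is nonzero if and only if a & b is all zeroes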
# define v_testz _mm256_testz_si256
// Simple macro to make a vector lookup table for use with vpshufb. Since
// AVX2 is two 16-byte halves, we duplicate the input values.
# define V_TABLE_16(...) _mm256_setr_epi8(__VA_ARGS__, __VA_ARGS__)
# define v_shift_lanes_left v_shift_lanes_left_avx2
// Move all the bytes in "input" to the left by one and fill in the first byte
// with zero. Since AVX2 generally works on two separate 16-byte vectors glued
// together, this needs two steps. The permute2x128 takes the middle 32 bytes
// of the 64-byte concatenation v_zero:input. The align then gives the final
// result in each half:
// top half: input_L:input_H --> input_L[15]:input_H[0:14]
// bottom half: zero_H:input_L --> zero_H[15]:input_L[0:14]
static inline vec_t v_shift_lanes_left(vec_t input) {
    vec_t zero = v_set1(0);
    vec_t shl_16 = _mm256_permute2x128_si256(input, zero, 0x03);
    return _mm256_alignr_epi8(input, shl_16, 15);
}
#elif defined(SSE4)
// SSE definitions. We require at least SSE4.1 for _mm_test_all_zeros()
# define z_validate_utf8 z_validate_utf8_sse4
# define z_validate_vec z_validate_vec_sse4
# define V_LEN (16)
# define vec_t __m128i
# define vmask_t uint16_t
# define vmask2_t uint32_t
# define v_load(x) _mm_lddqu_si128((vec_t *)(x))
# define v_set1 _mm_set1_epi8
# define v_and _mm_and_si128
# define v_testz _mm_test_all_zeros
# define v_test_bit(input, bit) \
_mm_movemask_epi8(_mm_slli_epi16((input), (uint8_t)(7 - (bit))))
# define v_lookup(table, index, shift) \
_mm_shuffle_epi8((table), \
v_and(_mm_srli_epi16((index), (shift)), v_set1(0x0F)))
# define V_TABLE_16(...) _mm_setr_epi8(__VA_ARGS__)
# define v_shift_lanes_left v_shift_lanes_left_sse4
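// Move all bytes in "top" forward by one and fill in byte 0 with zero, like
// the AVX2 version above but for a single 16-byte vector: alignr takes the
// 32-byte concatenation of zero:top and shifts it down by 15 bytes, leaving
// one zero byte followed by top[0..14].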
static inline vec_t v_shift_lanes_left(vec_t top) {
    return _mm_alignr_epi8(top, v_set1(0), 15);
}
#else
# error "No valid configuration: must define one of AVX2 or SSE4
#endif
// Validate one vector's worth of input bytes
static inline int z_validate_vec(vec_t bytes, vec_t shifted_bytes, vmask_t *last_cont) {
    // Error lookup tables for the first, second, and third nibbles
    const vec_t error_1 = V_TABLE_16(
        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x06, 0x38
    );
    const vec_t error_2 = V_TABLE_16(
        0x0B, 0x01, 0x00, 0x00,
        0x10, 0x20, 0x20, 0x20,
        0x20, 0x20, 0x20, 0x20,
        0x20, 0x24, 0x20, 0x20
    );
    const vec_t error_3 = V_TABLE_16(
        0x29, 0x29, 0x29, 0x29,
        0x29, 0x29, 0x29, 0x29,
        0x2B, 0x33, 0x35, 0x35,
        0x31, 0x31, 0x31, 0x31
    );
    // Quick skip for ascii-only input. If there are no bytes with the high bit
    // set, we don't need to do any more work. We return either valid or
    // invalid based on whether we expected any continuation bytes here.
    vmask_t high = v_test_bit(bytes, 7);
    if (!high)
        return *last_cont == 0;

    // Which bytes are required to be continuation bytes
    vmask2_t req = *last_cont;
    // A bitmask of the actual continuation bytes in the input
    vmask_t cont;

    // Compute the continuation byte mask by finding bytes that start with
    // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask
    // and shift it forward by 1, 2, or 3. This loop should be unrolled by
    // the compiler, and the (n == 1) branch inside eliminated.
    vmask_t set = high;
    for (int n = 1; n <= 3; n++) {
        set &= v_test_bit(bytes, 7 - n);
        // Mark continuation bytes: those that have the high bit set but
        // not the next one
        if (n == 1)
            cont = high ^ set;

        // We add the shifted mask here instead of ORing it, which would
        // be the more natural operation, so that this line can be done
        // with one lea. While adding could give a different result due
        // to carries, this will only happen for invalid UTF-8 sequences,
        // and in a way that won't cause it to pass validation. Reasoning:
        // Any bits for required continuation bytes come after the bits
        // for their leader bytes, and are all contiguous. For a carry to
        // happen, two of these bit sequences would have to overlap. If
        // this is the case, there is a leader byte before the second set
        // of required continuation bytes (and thus before the bit that
        // will be cleared by a carry). This leader byte will not be
        // in the continuation mask, despite being required. QEDish.
        req += (vmask2_t)set << n;
    }

    // Check that continuation bytes match. We must cast req from vmask2_t
    // (which holds the carry mask in the upper half) to vmask_t, which
    // zeroes out the upper bits
    if (cont != (vmask_t)req)
        return 0;

    // Look up error masks for three consecutive nibbles.
    vec_t e_1 = v_lookup(error_1, shifted_bytes, 4);
    vec_t e_2 = v_lookup(error_2, shifted_bytes, 0);
    vec_t e_3 = v_lookup(error_3, bytes, 4);

    // Check if any bits are set in all three error masks
    if (!v_testz(v_and(e_1, e_2), e_3))
        return 0;

    // Save continuation bits and input bytes for the next round
    *last_cont = req >> V_LEN;
    return 1;
}
int z_validate_utf8(const char *data, size_t len) {
    vec_t bytes, shifted_bytes;
    // Keep continuation bits from the previous iteration that carry over to
    // each input chunk vector
    vmask_t last_cont = 0;

    size_t offset = 0;
    // Deal with the input up until the last section of bytes
    if (len >= V_LEN) {
        // We need a vector of the input byte stream shifted forward one byte.
        // Since we don't want to read the memory before the data pointer
        // (which might not even be mapped), for the first chunk of input just
        // use vector instructions.
        shifted_bytes = v_shift_lanes_left(v_load(data));

        // Loop over input in V_LEN-byte chunks, as long as we can safely read
        // that far into memory
        for (; offset + V_LEN < len; offset += V_LEN) {
            bytes = v_load(data + offset);
            if (!z_validate_vec(bytes, shifted_bytes, &last_cont))
                return 0;
            shifted_bytes = v_load(data + offset + V_LEN - 1);
        }
    }

    // Deal with any bytes remaining. Rather than making a separate scalar path,
    // just fill in a buffer, reading bytes only up to len, and load from that.
    if (offset < len) {
        char buffer[V_LEN + 1] = { 0 };
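        // Carry in the byte just before the tail, if there is one, so that
        // shifted_bytes lines up with bytes the same way it does in the main
        // loop above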
        if (offset > 0)
            buffer[0] = data[offset - 1];
        for (int i = 0; i < (int)(len - offset); i++)
            buffer[i + 1] = data[offset + i];

        bytes = v_load(buffer + 1);
        shifted_bytes = v_load(buffer);
        if (!z_validate_vec(bytes, shifted_bytes, &last_cont))
            return 0;
    }

    // The input is valid if we don't have any more expected continuation bytes
    return last_cont == 0;
}
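
// A minimal usage sketch, not part of the original library: it assumes this
// file is compiled exactly once with -DAVX2 or -DSSE4 on a CPU that supports
// those instructions, and the Z_VALIDATE_EXAMPLE_MAIN guard is a hypothetical
// name chosen for this illustration only.
#ifdef Z_VALIDATE_EXAMPLE_MAIN
#include <stdio.h>
#include <string.h>

int main(void) {
    // Valid UTF-8: ASCII plus the two-byte sequence C3 A9 (U+00E9)
    const char *ok = "caf\xC3\xA9";
    // Invalid UTF-8: ED A0 80 encodes a UTF-16 surrogate
    const char *bad = "\xED\xA0\x80";
    // z_validate_utf8 still expands to the configuration-specific name here,
    // since the #undefs below haven't happened yet
    printf("ok:  %d\n", z_validate_utf8(ok, strlen(ok)));
    printf("bad: %d\n", z_validate_utf8(bad, strlen(bad)));
    return 0;
}
#endif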
// Undefine all macros
#undef z_validate_utf8
#undef z_validate_vec
#undef V_LEN
#undef vec_t
#undef vmask_t
#undef vmask2_t
#undef v_load
#undef v_set1
#undef v_and
#undef v_test_bit
#undef v_testz
#undef v_lookup
#undef V_TABLE_16
#undef v_shift_lanes_left