1 /* crc32.c -- compute the CRC-32 of a data stream
2 * Copyright (C) 1995-2022 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 *
5 * This interleaved implementation of a CRC makes use of pipelined multiple
6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8 */
9
10 /*
11 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
12 protection on the static variables used to control the first-use generation
13 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
14 first call get_crc_table() to initialize the tables before allowing more than
15 one thread to use crc32().
16
17 MAKECRCH can be #defined to write out crc32.h. A main() routine is also
18 produced, so that this one source file can be compiled to an executable.
19 */
20
21 #ifdef MAKECRCH
22 # include <stdio.h>
23 # ifndef DYNAMIC_CRC_TABLE
24 # define DYNAMIC_CRC_TABLE
25 # endif /* !DYNAMIC_CRC_TABLE */
26 #endif /* MAKECRCH */
27
28 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
29
30 /*
31 A CRC of a message is computed on N braids of words in the message, where
32 each word consists of W bytes (4 or 8). If N is 3, for example, then three
33 running sparse CRCs are calculated respectively on each braid, at these
34 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
35 This is done starting at a word boundary, and continues until as many blocks
36 of N * W bytes as are available have been processed. The results are combined
37 into a single CRC at the end. For this code, N must be in the range 1..6 and
38 W must be 4 or 8. The upper limit on N can be increased if desired by adding
39 more #if blocks, extending the patterns apparent in the code. In addition,
40 crc32.h would need to be regenerated, if the maximum N value is increased.
41
42 N and W are chosen empirically by benchmarking the execution time on a given
43 processor. The choices for N and W below were based on testing on Intel Kaby
44 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
45 Octeon II processors. The Intel, AMD, and ARM processors were all fastest
46 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
47 They were all tested with either gcc or clang, all using the -O3 optimization
48 level. Your mileage may vary.
49 */
50
51 /* Define N */
52 #ifdef Z_TESTN
53 # define N Z_TESTN
54 #else
55 # define N 5
56 #endif
57 #if N < 1 || N > 6
58 # error N must be in 1..6
59 #endif
60
61 /*
62 z_crc_t must be at least 32 bits. z_word_t must be at least as long as
63 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
64 that bytes are eight bits.
65 */
66
67 /*
68 Define W and the associated z_word_t type. If W is not defined, then a
69 braided calculation is not used, and the associated tables and code are not
70 compiled.
71 */
72 #ifdef Z_TESTW
73 # if Z_TESTW-1 != -1
74 # define W Z_TESTW
75 # endif
76 #else
77 # ifdef MAKECRCH
78 # define W 8 /* required for MAKECRCH */
79 # else
80 # if defined(__x86_64__) || defined(__aarch64__)
81 # define W 8
82 # else
83 # define W 4
84 # endif
85 # endif
86 #endif
87 #ifdef W
88 # if W == 8 && defined(Z_U8)
89 typedef Z_U8 z_word_t;
90 # elif defined(Z_U4)
91 # undef W
92 # define W 4
93 typedef Z_U4 z_word_t;
94 # else
95 # undef W
96 # endif
97 #endif
98
99 /* If available, use the ARM processor CRC32 instruction. */
100 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
101 # define ARMCRC32
102 #endif
103
104 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
105 /*
106 Swap the bytes in a z_word_t to convert between little and big endian. Any
107 self-respecting compiler will optimize this to a single machine byte-swap
108 instruction, if one is available. This assumes that word_t is either 32 bits
109 or 64 bits.
110 */
byte_swap(z_word_t word)111 local z_word_t byte_swap(z_word_t word) {
112 # if W == 8
113 return
114 (word & 0xff00000000000000) >> 56 |
115 (word & 0xff000000000000) >> 40 |
116 (word & 0xff0000000000) >> 24 |
117 (word & 0xff00000000) >> 8 |
118 (word & 0xff000000) << 8 |
119 (word & 0xff0000) << 24 |
120 (word & 0xff00) << 40 |
121 (word & 0xff) << 56;
122 # else /* W == 4 */
123 return
124 (word & 0xff000000) >> 24 |
125 (word & 0xff0000) >> 8 |
126 (word & 0xff00) << 8 |
127 (word & 0xff) << 24;
128 # endif
129 }
130 #endif
131
132 #ifdef DYNAMIC_CRC_TABLE
133 /* =========================================================================
134 * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
135 * below.
136 */
137 local z_crc_t FAR x2n_table[32];
138 #else
139 /* =========================================================================
140 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
141 * of x for combining CRC-32s, all made by make_crc_table().
142 */
143 # include "crc32.h"
144 #endif
145
146 /* CRC polynomial. */
147 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
148
149 /*
150 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
151 reflected. For speed, this requires that a not be zero.
152 */
multmodp(z_crc_t a,z_crc_t b)153 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
154 z_crc_t m, p;
155
156 m = (z_crc_t)1 << 31;
157 p = 0;
158 for (;;) {
159 if (a & m) {
160 p ^= b;
161 if ((a & (m - 1)) == 0)
162 break;
163 }
164 m >>= 1;
165 b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
166 }
167 return p;
168 }
169
170 /*
171 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
172 initialized.
173 */
x2nmodp(z_off64_t n,unsigned k)174 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
175 z_crc_t p;
176
177 p = (z_crc_t)1 << 31; /* x^0 == 1 */
178 while (n) {
179 if (n & 1)
180 p = multmodp(x2n_table[k & 31], p);
181 n >>= 1;
182 k++;
183 }
184 return p;
185 }
186
187 #ifdef DYNAMIC_CRC_TABLE
188 /* =========================================================================
189 * Build the tables for byte-wise and braided CRC-32 calculations, and a table
190 * of powers of x for combining CRC-32s.
191 */
192 local z_crc_t FAR crc_table[256];
193 #ifdef W
194 local z_word_t FAR crc_big_table[256];
195 local z_crc_t FAR crc_braid_table[W][256];
196 local z_word_t FAR crc_braid_big_table[W][256];
197 local void braid(z_crc_t [][256], z_word_t [][256], int, int);
198 #endif
199 #ifdef MAKECRCH
200 local void write_table(FILE *, const z_crc_t FAR *, int);
201 local void write_table32hi(FILE *, const z_word_t FAR *, int);
202 local void write_table64(FILE *, const z_word_t FAR *, int);
203 #endif /* MAKECRCH */
204
205 /*
206 Define a once() function depending on the availability of atomics. If this is
207 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
208 multiple threads, and if atomics are not available, then get_crc_table() must
209 be called to initialize the tables and must return before any threads are
210 allowed to compute or combine CRCs.
211 */
212
213 /* Definition of once functionality. */
214 typedef struct once_s once_t;
215
216 /* Check for the availability of atomics. */
217 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
218 !defined(__STDC_NO_ATOMICS__) && !defined(SMALL)
219
220 #include <stdatomic.h>
221
222 /* Structure for once(), which must be initialized with ONCE_INIT. */
223 struct once_s {
224 atomic_flag begun;
225 atomic_int done;
226 };
227 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
228
229 /*
230 Run the provided init() function exactly once, even if multiple threads
231 invoke once() at the same time. The state must be a once_t initialized with
232 ONCE_INIT.
233 */
once(once_t * state,void (* init)(void))234 local void once(once_t *state, void (*init)(void)) {
235 if (!atomic_load(&state->done)) {
236 if (atomic_flag_test_and_set(&state->begun))
237 while (!atomic_load(&state->done))
238 ;
239 else {
240 init();
241 atomic_store(&state->done, 1);
242 }
243 }
244 }
245
246 #else /* no atomics */
247
248 /* Structure for once(), which must be initialized with ONCE_INIT. */
249 struct once_s {
250 volatile int begun;
251 volatile int done;
252 };
253 #define ONCE_INIT {0, 0}
254
255 /* Test and set. Alas, not atomic, but tries to minimize the period of
256 vulnerability. */
test_and_set(int volatile * flag)257 local int test_and_set(int volatile *flag) {
258 int was;
259
260 was = *flag;
261 *flag = 1;
262 return was;
263 }
264
265 /* Run the provided init() function once. This is not thread-safe. */
once(once_t * state,void (* init)(void))266 local void once(once_t *state, void (*init)(void)) {
267 if (!state->done) {
268 if (test_and_set(&state->begun))
269 while (!state->done)
270 ;
271 else {
272 init();
273 state->done = 1;
274 }
275 }
276 }
277
278 #endif
279
280 /* State for once(). */
281 local once_t made = ONCE_INIT;
282
283 /*
284 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
285 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
286
287 Polynomials over GF(2) are represented in binary, one bit per coefficient,
288 with the lowest powers in the most significant bit. Then adding polynomials
289 is just exclusive-or, and multiplying a polynomial by x is a right shift by
290 one. If we call the above polynomial p, and represent a byte as the
291 polynomial q, also with the lowest power in the most significant bit (so the
292 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
293 where a mod b means the remainder after dividing a by b.
294
295 This calculation is done using the shift-register method of multiplying and
296 taking the remainder. The register is initialized to zero, and for each
297 incoming bit, x^32 is added mod p to the register if the bit is a one (where
298 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
299 (which is shifting right by one and adding x^32 mod p if the bit shifted out
300 is a one). We start with the highest power (least significant bit) of q and
301 repeat for all eight bits of q.
302
303 The table is simply the CRC of all possible eight bit values. This is all the
304 information needed to generate CRCs on data a byte at a time for all
305 combinations of CRC register values and incoming bytes.
306 */
307
make_crc_table(void)308 local void make_crc_table(void) {
309 unsigned i, j, n;
310 z_crc_t p;
311
312 /* initialize the CRC of bytes tables */
313 for (i = 0; i < 256; i++) {
314 p = i;
315 for (j = 0; j < 8; j++)
316 p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
317 crc_table[i] = p;
318 #ifdef W
319 crc_big_table[i] = byte_swap(p);
320 #endif
321 }
322
323 /* initialize the x^2^n mod p(x) table */
324 p = (z_crc_t)1 << 30; /* x^1 */
325 x2n_table[0] = p;
326 for (n = 1; n < 32; n++)
327 x2n_table[n] = p = multmodp(p, p);
328
329 #ifdef W
330 /* initialize the braiding tables -- needs x2n_table[] */
331 braid(crc_braid_table, crc_braid_big_table, N, W);
332 #endif
333
334 #ifdef MAKECRCH
335 {
336 /*
337 The crc32.h header file contains tables for both 32-bit and 64-bit
338 z_word_t's, and so requires a 64-bit type be available. In that case,
339 z_word_t must be defined to be 64-bits. This code then also generates
340 and writes out the tables for the case that z_word_t is 32 bits.
341 */
342 #if !defined(W) || W != 8
343 # error Need a 64-bit integer type in order to generate crc32.h.
344 #endif
345 FILE *out;
346 int k, n;
347 z_crc_t ltl[8][256];
348 z_word_t big[8][256];
349
350 out = fopen("crc32.h", "w");
351 if (out == NULL) return;
352
353 /* write out little-endian CRC table to crc32.h */
354 fprintf(out,
355 "/* crc32.h -- tables for rapid CRC calculation\n"
356 " * Generated automatically by crc32.c\n */\n"
357 "\n"
358 "local const z_crc_t FAR crc_table[] = {\n"
359 " ");
360 write_table(out, crc_table, 256);
361 fprintf(out,
362 "};\n");
363
364 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
365 fprintf(out,
366 "\n"
367 "#ifdef W\n"
368 "\n"
369 "#if W == 8\n"
370 "\n"
371 "local const z_word_t FAR crc_big_table[] = {\n"
372 " ");
373 write_table64(out, crc_big_table, 256);
374 fprintf(out,
375 "};\n");
376
377 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
378 fprintf(out,
379 "\n"
380 "#else /* W == 4 */\n"
381 "\n"
382 "local const z_word_t FAR crc_big_table[] = {\n"
383 " ");
384 write_table32hi(out, crc_big_table, 256);
385 fprintf(out,
386 "};\n"
387 "\n"
388 "#endif\n");
389
390 /* write out braid tables for each value of N */
391 for (n = 1; n <= 6; n++) {
392 fprintf(out,
393 "\n"
394 "#if N == %d\n", n);
395
396 /* compute braid tables for this N and 64-bit word_t */
397 braid(ltl, big, n, 8);
398
399 /* write out braid tables for 64-bit z_word_t to crc32.h */
400 fprintf(out,
401 "\n"
402 "#if W == 8\n"
403 "\n"
404 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
405 for (k = 0; k < 8; k++) {
406 fprintf(out, " {");
407 write_table(out, ltl[k], 256);
408 fprintf(out, "}%s", k < 7 ? ",\n" : "");
409 }
410 fprintf(out,
411 "};\n"
412 "\n"
413 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
414 for (k = 0; k < 8; k++) {
415 fprintf(out, " {");
416 write_table64(out, big[k], 256);
417 fprintf(out, "}%s", k < 7 ? ",\n" : "");
418 }
419 fprintf(out,
420 "};\n");
421
422 /* compute braid tables for this N and 32-bit word_t */
423 braid(ltl, big, n, 4);
424
425 /* write out braid tables for 32-bit z_word_t to crc32.h */
426 fprintf(out,
427 "\n"
428 "#else /* W == 4 */\n"
429 "\n"
430 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
431 for (k = 0; k < 4; k++) {
432 fprintf(out, " {");
433 write_table(out, ltl[k], 256);
434 fprintf(out, "}%s", k < 3 ? ",\n" : "");
435 }
436 fprintf(out,
437 "};\n"
438 "\n"
439 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
440 for (k = 0; k < 4; k++) {
441 fprintf(out, " {");
442 write_table32hi(out, big[k], 256);
443 fprintf(out, "}%s", k < 3 ? ",\n" : "");
444 }
445 fprintf(out,
446 "};\n"
447 "\n"
448 "#endif\n"
449 "\n"
450 "#endif\n");
451 }
452 fprintf(out,
453 "\n"
454 "#endif\n");
455
456 /* write out zeros operator table to crc32.h */
457 fprintf(out,
458 "\n"
459 "local const z_crc_t FAR x2n_table[] = {\n"
460 " ");
461 write_table(out, x2n_table, 32);
462 fprintf(out,
463 "};\n");
464 fclose(out);
465 }
466 #endif /* MAKECRCH */
467 }
468
469 #ifdef MAKECRCH
470
471 /*
472 Write the 32-bit values in table[0..k-1] to out, five per line in
473 hexadecimal separated by commas.
474 */
write_table(FILE * out,const z_crc_t FAR * table,int k)475 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
476 int n;
477
478 for (n = 0; n < k; n++)
479 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
480 (unsigned long)(table[n]),
481 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
482 }
483
484 /*
485 Write the high 32-bits of each value in table[0..k-1] to out, five per line
486 in hexadecimal separated by commas.
487 */
write_table32hi(FILE * out,const z_word_t FAR * table,int k)488 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
489 int n;
490
491 for (n = 0; n < k; n++)
492 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
493 (unsigned long)(table[n] >> 32),
494 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
495 }
496
497 /*
498 Write the 64-bit values in table[0..k-1] to out, three per line in
499 hexadecimal separated by commas. This assumes that if there is a 64-bit
500 type, then there is also a long long integer type, and it is at least 64
501 bits. If not, then the type cast and format string can be adjusted
502 accordingly.
503 */
write_table64(FILE * out,const z_word_t FAR * table,int k)504 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
505 int n;
506
507 for (n = 0; n < k; n++)
508 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ",
509 (unsigned long long)(table[n]),
510 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
511 }
512
513 /* Actually do the deed. */
main(void)514 int main(void) {
515 make_crc_table();
516 return 0;
517 }
518
519 #endif /* MAKECRCH */
520
521 #ifdef W
522 /*
523 Generate the little and big-endian braid tables for the given n and z_word_t
524 size w. Each array must have room for w blocks of 256 elements.
525 */
braid(z_crc_t ltl[][256],z_word_t big[][256],int n,int w)526 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
527 int k;
528 z_crc_t i, p, q;
529 for (k = 0; k < w; k++) {
530 p = x2nmodp((n * w + 3 - k) << 3, 0);
531 ltl[k][0] = 0;
532 big[w - 1 - k][0] = 0;
533 for (i = 1; i < 256; i++) {
534 ltl[k][i] = q = multmodp(i << 24, p);
535 big[w - 1 - k][i] = byte_swap(q);
536 }
537 }
538 }
539 #endif
540
541 #endif /* DYNAMIC_CRC_TABLE */
542
543 /* =========================================================================
544 * This function can be used by asm versions of crc32(), and to force the
545 * generation of the CRC tables in a threaded application.
546 */
get_crc_table(void)547 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
548 #ifdef DYNAMIC_CRC_TABLE
549 once(&made, make_crc_table);
550 #endif /* DYNAMIC_CRC_TABLE */
551 return (const z_crc_t FAR *)crc_table;
552 }
553
554 /* =========================================================================
555 * Use ARM machine instructions if available. This will compute the CRC about
556 * ten times faster than the braided calculation. This code does not check for
557 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
558 * only be defined if the compilation specifies an ARM processor architecture
559 * that has the instructions. For example, compiling with -march=armv8.1-a or
560 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
561 * instructions.
562 */
563 #ifdef ARMCRC32
564
565 /*
566 Constants empirically determined to maximize speed. These values are from
567 measurements on a Cortex-A57. Your mileage may vary.
568 */
569 #define Z_BATCH 3990 /* number of words in a batch */
570 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
571 #define Z_BATCH_MIN 800 /* fewest words in a final batch */
572
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)573 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
574 z_size_t len) {
575 z_crc_t val;
576 z_word_t crc1, crc2;
577 const z_word_t *word;
578 z_word_t val0, val1, val2;
579 z_size_t last, last2, i;
580 z_size_t num;
581
582 /* Return initial CRC, if requested. */
583 if (buf == Z_NULL) return 0;
584
585 #ifdef DYNAMIC_CRC_TABLE
586 once(&made, make_crc_table);
587 #endif /* DYNAMIC_CRC_TABLE */
588
589 /* Pre-condition the CRC */
590 crc = (~crc) & 0xffffffff;
591
592 /* Compute the CRC up to a word boundary. */
593 while (len && ((z_size_t)buf & 7) != 0) {
594 len--;
595 val = *buf++;
596 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
597 }
598
599 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
600 word = (z_word_t const *)buf;
601 num = len >> 3;
602 len &= 7;
603
604 /* Do three interleaved CRCs to realize the throughput of one crc32x
605 instruction per cycle. Each CRC is calculated on Z_BATCH words. The
606 three CRCs are combined into a single CRC after each set of batches. */
607 while (num >= 3 * Z_BATCH) {
608 crc1 = 0;
609 crc2 = 0;
610 for (i = 0; i < Z_BATCH; i++) {
611 val0 = word[i];
612 val1 = word[i + Z_BATCH];
613 val2 = word[i + 2 * Z_BATCH];
614 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
615 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
616 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
617 }
618 word += 3 * Z_BATCH;
619 num -= 3 * Z_BATCH;
620 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
621 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
622 }
623
624 /* Do one last smaller batch with the remaining words, if there are enough
625 to pay for the combination of CRCs. */
626 last = num / 3;
627 if (last >= Z_BATCH_MIN) {
628 last2 = last << 1;
629 crc1 = 0;
630 crc2 = 0;
631 for (i = 0; i < last; i++) {
632 val0 = word[i];
633 val1 = word[i + last];
634 val2 = word[i + last2];
635 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
636 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
637 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
638 }
639 word += 3 * last;
640 num -= 3 * last;
641 val = x2nmodp(last, 6);
642 crc = multmodp(val, crc) ^ crc1;
643 crc = multmodp(val, crc) ^ crc2;
644 }
645
646 /* Compute the CRC on any remaining words. */
647 for (i = 0; i < num; i++) {
648 val0 = word[i];
649 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
650 }
651 word += num;
652
653 /* Complete the CRC on any remaining bytes. */
654 buf = (const unsigned char FAR *)word;
655 while (len) {
656 len--;
657 val = *buf++;
658 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
659 }
660
661 /* Return the CRC, post-conditioned. */
662 return crc ^ 0xffffffff;
663 }
664
665 #else
666
667 #ifdef W
668
669 /*
670 Return the CRC of the W bytes in the word_t data, taking the
671 least-significant byte of the word as the first byte of data, without any pre
672 or post conditioning. This is used to combine the CRCs of each braid.
673 */
crc_word(z_word_t data)674 local z_crc_t crc_word(z_word_t data) {
675 int k;
676 for (k = 0; k < W; k++)
677 data = (data >> 8) ^ crc_table[data & 0xff];
678 return (z_crc_t)data;
679 }
680
crc_word_big(z_word_t data)681 local z_word_t crc_word_big(z_word_t data) {
682 int k;
683 for (k = 0; k < W; k++)
684 data = (data << 8) ^
685 crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
686 return data;
687 }
688
689 #endif
690
691 /* ========================================================================= */
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)692 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
693 z_size_t len) {
694 /* Return initial CRC, if requested. */
695 if (buf == Z_NULL) return 0;
696
697 #ifdef DYNAMIC_CRC_TABLE
698 once(&made, make_crc_table);
699 #endif /* DYNAMIC_CRC_TABLE */
700
701 /* Pre-condition the CRC */
702 crc = (~crc) & 0xffffffff;
703
704 #ifdef W
705
706 /* If provided enough bytes, do a braided CRC calculation. */
707 if (len >= N * W + W - 1) {
708 z_size_t blks;
709 z_word_t const *words;
710 unsigned endian;
711 int k;
712
713 /* Compute the CRC up to a z_word_t boundary. */
714 while (len && ((z_size_t)buf & (W - 1)) != 0) {
715 len--;
716 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
717 }
718
719 /* Compute the CRC on as many N z_word_t blocks as are available. */
720 blks = len / (N * W);
721 len -= blks * N * W;
722 words = (z_word_t const *)buf;
723
724 /* Do endian check at execution time instead of compile time, since ARM
725 processors can change the endianness at execution time. If the
726 compiler knows what the endianness will be, it can optimize out the
727 check and the unused branch. */
728 endian = 1;
729 if (*(unsigned char *)&endian) {
730 /* Little endian. */
731
732 z_crc_t crc0;
733 z_word_t word0;
734 #if N > 1
735 z_crc_t crc1;
736 z_word_t word1;
737 #if N > 2
738 z_crc_t crc2;
739 z_word_t word2;
740 #if N > 3
741 z_crc_t crc3;
742 z_word_t word3;
743 #if N > 4
744 z_crc_t crc4;
745 z_word_t word4;
746 #if N > 5
747 z_crc_t crc5;
748 z_word_t word5;
749 #endif
750 #endif
751 #endif
752 #endif
753 #endif
754
755 /* Initialize the CRC for each braid. */
756 crc0 = crc;
757 #if N > 1
758 crc1 = 0;
759 #if N > 2
760 crc2 = 0;
761 #if N > 3
762 crc3 = 0;
763 #if N > 4
764 crc4 = 0;
765 #if N > 5
766 crc5 = 0;
767 #endif
768 #endif
769 #endif
770 #endif
771 #endif
772
773 /*
774 Process the first blks-1 blocks, computing the CRCs on each braid
775 independently.
776 */
777 while (--blks) {
778 /* Load the word for each braid into registers. */
779 word0 = crc0 ^ words[0];
780 #if N > 1
781 word1 = crc1 ^ words[1];
782 #if N > 2
783 word2 = crc2 ^ words[2];
784 #if N > 3
785 word3 = crc3 ^ words[3];
786 #if N > 4
787 word4 = crc4 ^ words[4];
788 #if N > 5
789 word5 = crc5 ^ words[5];
790 #endif
791 #endif
792 #endif
793 #endif
794 #endif
795 words += N;
796
797 /* Compute and update the CRC for each word. The loop should
798 get unrolled. */
799 crc0 = crc_braid_table[0][word0 & 0xff];
800 #if N > 1
801 crc1 = crc_braid_table[0][word1 & 0xff];
802 #if N > 2
803 crc2 = crc_braid_table[0][word2 & 0xff];
804 #if N > 3
805 crc3 = crc_braid_table[0][word3 & 0xff];
806 #if N > 4
807 crc4 = crc_braid_table[0][word4 & 0xff];
808 #if N > 5
809 crc5 = crc_braid_table[0][word5 & 0xff];
810 #endif
811 #endif
812 #endif
813 #endif
814 #endif
815 for (k = 1; k < W; k++) {
816 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
817 #if N > 1
818 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
819 #if N > 2
820 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
821 #if N > 3
822 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
823 #if N > 4
824 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
825 #if N > 5
826 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
827 #endif
828 #endif
829 #endif
830 #endif
831 #endif
832 }
833 }
834
835 /*
836 Process the last block, combining the CRCs of the N braids at the
837 same time.
838 */
839 crc = crc_word(crc0 ^ words[0]);
840 #if N > 1
841 crc = crc_word(crc1 ^ words[1] ^ crc);
842 #if N > 2
843 crc = crc_word(crc2 ^ words[2] ^ crc);
844 #if N > 3
845 crc = crc_word(crc3 ^ words[3] ^ crc);
846 #if N > 4
847 crc = crc_word(crc4 ^ words[4] ^ crc);
848 #if N > 5
849 crc = crc_word(crc5 ^ words[5] ^ crc);
850 #endif
851 #endif
852 #endif
853 #endif
854 #endif
855 words += N;
856 }
857 else {
858 /* Big endian. */
859
860 z_word_t crc0, word0, comb;
861 #if N > 1
862 z_word_t crc1, word1;
863 #if N > 2
864 z_word_t crc2, word2;
865 #if N > 3
866 z_word_t crc3, word3;
867 #if N > 4
868 z_word_t crc4, word4;
869 #if N > 5
870 z_word_t crc5, word5;
871 #endif
872 #endif
873 #endif
874 #endif
875 #endif
876
877 /* Initialize the CRC for each braid. */
878 crc0 = byte_swap(crc);
879 #if N > 1
880 crc1 = 0;
881 #if N > 2
882 crc2 = 0;
883 #if N > 3
884 crc3 = 0;
885 #if N > 4
886 crc4 = 0;
887 #if N > 5
888 crc5 = 0;
889 #endif
890 #endif
891 #endif
892 #endif
893 #endif
894
895 /*
896 Process the first blks-1 blocks, computing the CRCs on each braid
897 independently.
898 */
899 while (--blks) {
900 /* Load the word for each braid into registers. */
901 word0 = crc0 ^ words[0];
902 #if N > 1
903 word1 = crc1 ^ words[1];
904 #if N > 2
905 word2 = crc2 ^ words[2];
906 #if N > 3
907 word3 = crc3 ^ words[3];
908 #if N > 4
909 word4 = crc4 ^ words[4];
910 #if N > 5
911 word5 = crc5 ^ words[5];
912 #endif
913 #endif
914 #endif
915 #endif
916 #endif
917 words += N;
918
919 /* Compute and update the CRC for each word. The loop should
920 get unrolled. */
921 crc0 = crc_braid_big_table[0][word0 & 0xff];
922 #if N > 1
923 crc1 = crc_braid_big_table[0][word1 & 0xff];
924 #if N > 2
925 crc2 = crc_braid_big_table[0][word2 & 0xff];
926 #if N > 3
927 crc3 = crc_braid_big_table[0][word3 & 0xff];
928 #if N > 4
929 crc4 = crc_braid_big_table[0][word4 & 0xff];
930 #if N > 5
931 crc5 = crc_braid_big_table[0][word5 & 0xff];
932 #endif
933 #endif
934 #endif
935 #endif
936 #endif
937 for (k = 1; k < W; k++) {
938 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
939 #if N > 1
940 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
941 #if N > 2
942 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
943 #if N > 3
944 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
945 #if N > 4
946 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
947 #if N > 5
948 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
949 #endif
950 #endif
951 #endif
952 #endif
953 #endif
954 }
955 }
956
957 /*
958 Process the last block, combining the CRCs of the N braids at the
959 same time.
960 */
961 comb = crc_word_big(crc0 ^ words[0]);
962 #if N > 1
963 comb = crc_word_big(crc1 ^ words[1] ^ comb);
964 #if N > 2
965 comb = crc_word_big(crc2 ^ words[2] ^ comb);
966 #if N > 3
967 comb = crc_word_big(crc3 ^ words[3] ^ comb);
968 #if N > 4
969 comb = crc_word_big(crc4 ^ words[4] ^ comb);
970 #if N > 5
971 comb = crc_word_big(crc5 ^ words[5] ^ comb);
972 #endif
973 #endif
974 #endif
975 #endif
976 #endif
977 words += N;
978 crc = byte_swap(comb);
979 }
980
981 /*
982 Update the pointer to the remaining bytes to process.
983 */
984 buf = (unsigned char const *)words;
985 }
986
987 #endif /* W */
988
989 /* Complete the computation of the CRC on any remaining bytes. */
990 while (len >= 8) {
991 len -= 8;
992 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
993 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
994 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
995 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
996 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
997 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
998 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
999 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1000 }
1001 while (len) {
1002 len--;
1003 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1004 }
1005
1006 /* Return the CRC, post-conditioned. */
1007 return crc ^ 0xffffffff;
1008 }
1009
1010 #endif
1011
1012 /* ========================================================================= */
crc32(unsigned long crc,const unsigned char FAR * buf,uInt len)1013 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1014 uInt len) {
1015 return crc32_z(crc, buf, len);
1016 }
1017
1018 /* ========================================================================= */
crc32_combine64(uLong crc1,uLong crc2,z_off64_t len2)1019 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1020 #ifdef DYNAMIC_CRC_TABLE
1021 once(&made, make_crc_table);
1022 #endif /* DYNAMIC_CRC_TABLE */
1023 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1024 }
1025
1026 /* ========================================================================= */
crc32_combine(uLong crc1,uLong crc2,z_off_t len2)1027 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1028 return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1029 }
1030
1031 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1032 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1033 #ifdef DYNAMIC_CRC_TABLE
1034 once(&made, make_crc_table);
1035 #endif /* DYNAMIC_CRC_TABLE */
1036 return x2nmodp(len2, 3);
1037 }
1038
1039 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1040 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1041 return crc32_combine_gen64((z_off64_t)len2);
1042 }
1043
1044 /* ========================================================================= */
crc32_combine_op(uLong crc1,uLong crc2,uLong op)1045 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1046 return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1047 }
1048