xref: /netbsd-src/common/dist/zlib/crc32.c (revision 3117ece4fc4a4ca4489ba793710b60b0d26bab6c)
1 /*	$NetBSD: crc32.c,v 1.7 2024/09/22 19:12:27 christos Exp $	*/
2 
3 /* crc32.c -- compute the CRC-32 of a data stream
4  * Copyright (C) 1995-2022 Mark Adler
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  *
7  * This interleaved implementation of a CRC makes use of pipelined multiple
8  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
9  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
10  */
11 
12 /* @(#) Id */
13 
14 /*
15   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
16   protection on the static variables used to control the first-use generation
17   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
18   first call get_crc_table() to initialize the tables before allowing more than
19   one thread to use crc32().
20 
21   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
22   produced, so that this one source file can be compiled to an executable.
23  */
24 
25 #ifdef MAKECRCH
26 #  include <stdio.h>
27 #  ifndef DYNAMIC_CRC_TABLE
28 #    define DYNAMIC_CRC_TABLE
29 #  endif /* !DYNAMIC_CRC_TABLE */
30 #endif /* MAKECRCH */
31 
32 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33 
34  /*
35   A CRC of a message is computed on N braids of words in the message, where
36   each word consists of W bytes (4 or 8). If N is 3, for example, then three
37   running sparse CRCs are calculated respectively on each braid, at these
38   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
39   This is done starting at a word boundary, and continues until as many blocks
40   of N * W bytes as are available have been processed. The results are combined
41   into a single CRC at the end. For this code, N must be in the range 1..6 and
42   W must be 4 or 8. The upper limit on N can be increased if desired by adding
43   more #if blocks, extending the patterns apparent in the code. In addition,
44   crc32.h would need to be regenerated, if the maximum N value is increased.
45 
46   N and W are chosen empirically by benchmarking the execution time on a given
47   processor. The choices for N and W below were based on testing on Intel Kaby
48   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
49   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
50   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
51   They were all tested with either gcc or clang, all using the -O3 optimization
52   level. Your mileage may vary.
53  */
54 
55 /* Define N */
56 #ifdef Z_TESTN
57 #  define N Z_TESTN
58 #else
59 #  define N 5
60 #endif
61 #if N < 1 || N > 6
62 #  error N must be in 1..6
63 #endif
64 
65 /*
66   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
67   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
68   that bytes are eight bits.
69  */
70 
71 /*
72   Define W and the associated z_word_t type. If W is not defined, then a
73   braided calculation is not used, and the associated tables and code are not
74   compiled.
75  */
76 #ifdef Z_TESTW
77 #  if Z_TESTW-1 != -1
78 #    define W Z_TESTW
79 #  endif
80 #else
81 #  ifdef MAKECRCH
82 #    define W 8         /* required for MAKECRCH */
83 #  else
84 #    if defined(__x86_64__) || defined(__aarch64__)
85 #      define W 8
86 #    else
87 #      define W 4
88 #    endif
89 #  endif
90 #endif
91 #ifdef W
92 #  if W == 8 && defined(Z_U8)
93      typedef Z_U8 z_word_t;
94 #  elif defined(Z_U4)
95 #    undef W
96 #    define W 4
97      typedef Z_U4 z_word_t;
98 #  else
99 #    undef W
100 #  endif
101 #endif
102 
103 /* If available, use the ARM processor CRC32 instruction. */
104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
105 #  define ARMCRC32
106 #endif
107 
108 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
109 /*
110   Swap the bytes in a z_word_t to convert between little and big endian. Any
111   self-respecting compiler will optimize this to a single machine byte-swap
112   instruction, if one is available. This assumes that word_t is either 32 bits
113   or 64 bits.
114  */
115 local z_word_t byte_swap(z_word_t word) {
116 #  if W == 8
117     return
118         (word & 0xff00000000000000) >> 56 |
119         (word & 0xff000000000000) >> 40 |
120         (word & 0xff0000000000) >> 24 |
121         (word & 0xff00000000) >> 8 |
122         (word & 0xff000000) << 8 |
123         (word & 0xff0000) << 24 |
124         (word & 0xff00) << 40 |
125         (word & 0xff) << 56;
126 #  else   /* W == 4 */
127     return
128         (word & 0xff000000) >> 24 |
129         (word & 0xff0000) >> 8 |
130         (word & 0xff00) << 8 |
131         (word & 0xff) << 24;
132 #  endif
133 }
134 #endif
135 
136 #ifdef DYNAMIC_CRC_TABLE
137 /* =========================================================================
138  * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
139  * below.
140  */
141    local z_crc_t FAR x2n_table[32];
142 #else
143 /* =========================================================================
144  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
145  * of x for combining CRC-32s, all made by make_crc_table().
146  */
147 #  include "crc32.h"
148 #endif
149 
150 /* CRC polynomial. */
151 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
152 
153 /*
154   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
155   reflected. For speed, this requires that a not be zero.
156  */
157 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
158     z_crc_t m, p;
159 
160     m = (z_crc_t)1 << 31;
161     p = 0;
162     for (;;) {
163         if (a & m) {
164             p ^= b;
165             if ((a & (m - 1)) == 0)
166                 break;
167         }
168         m >>= 1;
169         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
170     }
171     return p;
172 }
173 
174 /*
175   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
176   initialized.
177  */
178 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
179     z_crc_t p;
180 
181     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
182     while (n) {
183         if (n & 1)
184             p = multmodp(x2n_table[k & 31], p);
185         n >>= 1;
186         k++;
187     }
188     return p;
189 }
190 
191 #ifdef DYNAMIC_CRC_TABLE
192 /* =========================================================================
193  * Build the tables for byte-wise and braided CRC-32 calculations, and a table
194  * of powers of x for combining CRC-32s.
195  */
196 local z_crc_t FAR crc_table[256];
197 #ifdef W
198    local z_word_t FAR crc_big_table[256];
199    local z_crc_t FAR crc_braid_table[W][256];
200    local z_word_t FAR crc_braid_big_table[W][256];
201    local void braid(z_crc_t [][256], z_word_t [][256], int, int);
202 #endif
203 #ifdef MAKECRCH
204    local void write_table(FILE *, const z_crc_t FAR *, int);
205    local void write_table32hi(FILE *, const z_word_t FAR *, int);
206    local void write_table64(FILE *, const z_word_t FAR *, int);
207 #endif /* MAKECRCH */
208 
209 /*
210   Define a once() function depending on the availability of atomics. If this is
211   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
212   multiple threads, and if atomics are not available, then get_crc_table() must
213   be called to initialize the tables and must return before any threads are
214   allowed to compute or combine CRCs.
215  */
216 
217 /* Definition of once functionality. */
218 typedef struct once_s once_t;
219 
220 /* Check for the availability of atomics. */
221 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
222     !defined(__STDC_NO_ATOMICS__)
223 
224 #include <stdatomic.h>
225 
226 /* Structure for once(), which must be initialized with ONCE_INIT. */
227 struct once_s {
228     atomic_flag begun;
229     atomic_int done;
230 };
231 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
232 
233 /*
234   Run the provided init() function exactly once, even if multiple threads
235   invoke once() at the same time. The state must be a once_t initialized with
236   ONCE_INIT.
237  */
238 local void once(once_t *state, void (*init)(void)) {
239     if (!atomic_load(&state->done)) {
240         if (atomic_flag_test_and_set(&state->begun))
241             while (!atomic_load(&state->done))
242                 ;
243         else {
244             init();
245             atomic_store(&state->done, 1);
246         }
247     }
248 }
249 
250 #else   /* no atomics */
251 
252 /* Structure for once(), which must be initialized with ONCE_INIT. */
253 struct once_s {
254     volatile int begun;
255     volatile int done;
256 };
257 #define ONCE_INIT {0, 0}
258 
259 /* Test and set. Alas, not atomic, but tries to minimize the period of
260    vulnerability. */
261 local int test_and_set(int volatile *flag) {
262     int was;
263 
264     was = *flag;
265     *flag = 1;
266     return was;
267 }
268 
269 /* Run the provided init() function once. This is not thread-safe. */
270 local void once(once_t *state, void (*init)(void)) {
271     if (!state->done) {
272         if (test_and_set(&state->begun))
273             while (!state->done)
274                 ;
275         else {
276             init();
277             state->done = 1;
278         }
279     }
280 }
281 
282 #endif
283 
284 /* State for once(). */
285 local once_t made = ONCE_INIT;
286 
287 /*
288   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
289   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
290 
291   Polynomials over GF(2) are represented in binary, one bit per coefficient,
292   with the lowest powers in the most significant bit. Then adding polynomials
293   is just exclusive-or, and multiplying a polynomial by x is a right shift by
294   one. If we call the above polynomial p, and represent a byte as the
295   polynomial q, also with the lowest power in the most significant bit (so the
296   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
297   where a mod b means the remainder after dividing a by b.
298 
299   This calculation is done using the shift-register method of multiplying and
300   taking the remainder. The register is initialized to zero, and for each
301   incoming bit, x^32 is added mod p to the register if the bit is a one (where
302   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
303   (which is shifting right by one and adding x^32 mod p if the bit shifted out
304   is a one). We start with the highest power (least significant bit) of q and
305   repeat for all eight bits of q.
306 
307   The table is simply the CRC of all possible eight bit values. This is all the
308   information needed to generate CRCs on data a byte at a time for all
309   combinations of CRC register values and incoming bytes.
310  */
311 
312 local void make_crc_table(void) {
313     unsigned i, j, n;
314     z_crc_t p;
315 
316     /* initialize the CRC of bytes tables */
317     for (i = 0; i < 256; i++) {
318         p = i;
319         for (j = 0; j < 8; j++)
320             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
321         crc_table[i] = p;
322 #ifdef W
323         crc_big_table[i] = byte_swap(p);
324 #endif
325     }
326 
327     /* initialize the x^2^n mod p(x) table */
328     p = (z_crc_t)1 << 30;         /* x^1 */
329     x2n_table[0] = p;
330     for (n = 1; n < 32; n++)
331         x2n_table[n] = p = multmodp(p, p);
332 
333 #ifdef W
334     /* initialize the braiding tables -- needs x2n_table[] */
335     braid(crc_braid_table, crc_braid_big_table, N, W);
336 #endif
337 
338 #ifdef MAKECRCH
339     {
340         /*
341           The crc32.h header file contains tables for both 32-bit and 64-bit
342           z_word_t's, and so requires a 64-bit type be available. In that case,
343           z_word_t must be defined to be 64-bits. This code then also generates
344           and writes out the tables for the case that z_word_t is 32 bits.
345          */
346 #if !defined(W) || W != 8
347 #  error Need a 64-bit integer type in order to generate crc32.h.
348 #endif
349         FILE *out;
350         int k, n;
351         z_crc_t ltl[8][256];
352         z_word_t big[8][256];
353 
354         out = fopen("crc32.h", "w");
355         if (out == NULL) return;
356 
357         /* write out little-endian CRC table to crc32.h */
358         fprintf(out,
359             "/* crc32.h -- tables for rapid CRC calculation\n"
360             " * Generated automatically by crc32.c\n */\n"
361             "\n"
362             "local const z_crc_t FAR crc_table[] = {\n"
363             "    ");
364         write_table(out, crc_table, 256);
365         fprintf(out,
366             "};\n");
367 
368         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
369         fprintf(out,
370             "\n"
371             "#ifdef W\n"
372             "\n"
373             "#if W == 8\n"
374             "\n"
375             "local const z_word_t FAR crc_big_table[] = {\n"
376             "    ");
377         write_table64(out, crc_big_table, 256);
378         fprintf(out,
379             "};\n");
380 
381         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
382         fprintf(out,
383             "\n"
384             "#else /* W == 4 */\n"
385             "\n"
386             "local const z_word_t FAR crc_big_table[] = {\n"
387             "    ");
388         write_table32hi(out, crc_big_table, 256);
389         fprintf(out,
390             "};\n"
391             "\n"
392             "#endif\n");
393 
394         /* write out braid tables for each value of N */
395         for (n = 1; n <= 6; n++) {
396             fprintf(out,
397             "\n"
398             "#if N == %d\n", n);
399 
400             /* compute braid tables for this N and 64-bit word_t */
401             braid(ltl, big, n, 8);
402 
403             /* write out braid tables for 64-bit z_word_t to crc32.h */
404             fprintf(out,
405             "\n"
406             "#if W == 8\n"
407             "\n"
408             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
409             for (k = 0; k < 8; k++) {
410                 fprintf(out, "   {");
411                 write_table(out, ltl[k], 256);
412                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
413             }
414             fprintf(out,
415             "};\n"
416             "\n"
417             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
418             for (k = 0; k < 8; k++) {
419                 fprintf(out, "   {");
420                 write_table64(out, big[k], 256);
421                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
422             }
423             fprintf(out,
424             "};\n");
425 
426             /* compute braid tables for this N and 32-bit word_t */
427             braid(ltl, big, n, 4);
428 
429             /* write out braid tables for 32-bit z_word_t to crc32.h */
430             fprintf(out,
431             "\n"
432             "#else /* W == 4 */\n"
433             "\n"
434             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
435             for (k = 0; k < 4; k++) {
436                 fprintf(out, "   {");
437                 write_table(out, ltl[k], 256);
438                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
439             }
440             fprintf(out,
441             "};\n"
442             "\n"
443             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
444             for (k = 0; k < 4; k++) {
445                 fprintf(out, "   {");
446                 write_table32hi(out, big[k], 256);
447                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
448             }
449             fprintf(out,
450             "};\n"
451             "\n"
452             "#endif\n"
453             "\n"
454             "#endif\n");
455         }
456         fprintf(out,
457             "\n"
458             "#endif\n");
459 
460         /* write out zeros operator table to crc32.h */
461         fprintf(out,
462             "\n"
463             "local const z_crc_t FAR x2n_table[] = {\n"
464             "    ");
465         write_table(out, x2n_table, 32);
466         fprintf(out,
467             "};\n");
468         fclose(out);
469     }
470 #endif /* MAKECRCH */
471 }
472 
473 #ifdef MAKECRCH
474 
475 /*
476    Write the 32-bit values in table[0..k-1] to out, five per line in
477    hexadecimal separated by commas.
478  */
479 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
480     int n;
481 
482     for (n = 0; n < k; n++)
483         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
484                 (unsigned long)(table[n]),
485                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
486 }
487 
488 /*
489    Write the high 32-bits of each value in table[0..k-1] to out, five per line
490    in hexadecimal separated by commas.
491  */
492 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
493     int n;
494 
495     for (n = 0; n < k; n++)
496         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
497                 (unsigned long)(table[n] >> 32),
498                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
499 }
500 
501 /*
502   Write the 64-bit values in table[0..k-1] to out, three per line in
503   hexadecimal separated by commas. This assumes that if there is a 64-bit
504   type, then there is also a long long integer type, and it is at least 64
505   bits. If not, then the type cast and format string can be adjusted
506   accordingly.
507  */
508 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
509     int n;
510 
511     for (n = 0; n < k; n++)
512         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
513                 (unsigned long long)(table[n]),
514                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
515 }
516 
517 /* Actually do the deed. */
518 int main(void) {
519     make_crc_table();
520     return 0;
521 }
522 
523 #endif /* MAKECRCH */
524 
525 #ifdef W
526 /*
527   Generate the little and big-endian braid tables for the given n and z_word_t
528   size w. Each array must have room for w blocks of 256 elements.
529  */
530 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
531     int k;
532     z_crc_t i, p, q;
533     for (k = 0; k < w; k++) {
534         p = x2nmodp((n * w + 3 - k) << 3, 0);
535         ltl[k][0] = 0;
536         big[w - 1 - k][0] = 0;
537         for (i = 1; i < 256; i++) {
538             ltl[k][i] = q = multmodp(i << 24, p);
539             big[w - 1 - k][i] = byte_swap(q);
540         }
541     }
542 }
543 #endif
544 
545 #endif /* DYNAMIC_CRC_TABLE */
546 
547 /* =========================================================================
548  * This function can be used by asm versions of crc32(), and to force the
549  * generation of the CRC tables in a threaded application.
550  */
551 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
552 #ifdef DYNAMIC_CRC_TABLE
553     once(&made, make_crc_table);
554 #endif /* DYNAMIC_CRC_TABLE */
555     return (const z_crc_t FAR *)crc_table;
556 }
557 
558 /* =========================================================================
559  * Use ARM machine instructions if available. This will compute the CRC about
560  * ten times faster than the braided calculation. This code does not check for
561  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
562  * only be defined if the compilation specifies an ARM processor architecture
563  * that has the instructions. For example, compiling with -march=armv8.1-a or
564  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
565  * instructions.
566  */
567 #ifdef ARMCRC32
568 
569 /*
570    Constants empirically determined to maximize speed. These values are from
571    measurements on a Cortex-A57. Your mileage may vary.
572  */
573 #define Z_BATCH 3990                /* number of words in a batch */
574 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
575 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
576 
577 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
578                               z_size_t len) {
579     z_crc_t val;
580     z_word_t crc1, crc2;
581     const z_word_t *word;
582     z_word_t val0, val1, val2;
583     z_size_t last, last2, i;
584     z_size_t num;
585 
586     /* Return initial CRC, if requested. */
587     if (buf == Z_NULL) return 0;
588 
589 #ifdef DYNAMIC_CRC_TABLE
590     once(&made, make_crc_table);
591 #endif /* DYNAMIC_CRC_TABLE */
592 
593     /* Pre-condition the CRC */
594     crc = (~crc) & 0xffffffff;
595 
596     /* Compute the CRC up to a word boundary. */
597     while (len && ((z_size_t)buf & 7) != 0) {
598         len--;
599         val = *buf++;
600         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
601     }
602 
603     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
604     word = (z_word_t const *)buf;
605     num = len >> 3;
606     len &= 7;
607 
608     /* Do three interleaved CRCs to realize the throughput of one crc32x
609        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
610        three CRCs are combined into a single CRC after each set of batches. */
611     while (num >= 3 * Z_BATCH) {
612         crc1 = 0;
613         crc2 = 0;
614         for (i = 0; i < Z_BATCH; i++) {
615             val0 = word[i];
616             val1 = word[i + Z_BATCH];
617             val2 = word[i + 2 * Z_BATCH];
618             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
619             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
620             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
621         }
622         word += 3 * Z_BATCH;
623         num -= 3 * Z_BATCH;
624         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
625         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
626     }
627 
628     /* Do one last smaller batch with the remaining words, if there are enough
629        to pay for the combination of CRCs. */
630     last = num / 3;
631     if (last >= Z_BATCH_MIN) {
632         last2 = last << 1;
633         crc1 = 0;
634         crc2 = 0;
635         for (i = 0; i < last; i++) {
636             val0 = word[i];
637             val1 = word[i + last];
638             val2 = word[i + last2];
639             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
640             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
641             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
642         }
643         word += 3 * last;
644         num -= 3 * last;
645         val = x2nmodp(last, 6);
646         crc = multmodp(val, crc) ^ crc1;
647         crc = multmodp(val, crc) ^ crc2;
648     }
649 
650     /* Compute the CRC on any remaining words. */
651     for (i = 0; i < num; i++) {
652         val0 = word[i];
653         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
654     }
655     word += num;
656 
657     /* Complete the CRC on any remaining bytes. */
658     buf = (const unsigned char FAR *)word;
659     while (len) {
660         len--;
661         val = *buf++;
662         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
663     }
664 
665     /* Return the CRC, post-conditioned. */
666     return crc ^ 0xffffffff;
667 }
668 
669 #else
670 
671 #ifdef W
672 
673 /*
674   Return the CRC of the W bytes in the word_t data, taking the
675   least-significant byte of the word as the first byte of data, without any pre
676   or post conditioning. This is used to combine the CRCs of each braid.
677  */
678 local z_crc_t crc_word(z_word_t data) {
679     int k;
680     for (k = 0; k < W; k++)
681         data = (data >> 8) ^ crc_table[data & 0xff];
682     return (z_crc_t)data;
683 }
684 
685 local z_word_t crc_word_big(z_word_t data) {
686     int k;
687     for (k = 0; k < W; k++)
688         data = (data << 8) ^
689             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
690     return data;
691 }
692 
693 #endif
694 
695 /* ========================================================================= */
696 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
697                               z_size_t len) {
698     /* Return initial CRC, if requested. */
699     if (buf == Z_NULL) return 0;
700 
701 #ifdef DYNAMIC_CRC_TABLE
702     once(&made, make_crc_table);
703 #endif /* DYNAMIC_CRC_TABLE */
704 
705     /* Pre-condition the CRC */
706     crc = (~crc) & 0xffffffff;
707 
708 #ifdef W
709 
710     /* If provided enough bytes, do a braided CRC calculation. */
711     if (len >= N * W + W - 1) {
712         z_size_t blks;
713         z_word_t const *words;
714         unsigned endian;
715         int k;
716 
717         /* Compute the CRC up to a z_word_t boundary. */
718         while (len && ((z_size_t)buf & (W - 1)) != 0) {
719             len--;
720             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
721         }
722 
723         /* Compute the CRC on as many N z_word_t blocks as are available. */
724         blks = len / (N * W);
725         len -= blks * N * W;
726         words = (z_word_t const *)buf;
727 
728         /* Do endian check at execution time instead of compile time, since ARM
729            processors can change the endianness at execution time. If the
730            compiler knows what the endianness will be, it can optimize out the
731            check and the unused branch. */
732         endian = 1;
733         if (*(unsigned char *)&endian) {
734             /* Little endian. */
735 
736             z_crc_t crc0;
737             z_word_t word0;
738 #if N > 1
739             z_crc_t crc1;
740             z_word_t word1;
741 #if N > 2
742             z_crc_t crc2;
743             z_word_t word2;
744 #if N > 3
745             z_crc_t crc3;
746             z_word_t word3;
747 #if N > 4
748             z_crc_t crc4;
749             z_word_t word4;
750 #if N > 5
751             z_crc_t crc5;
752             z_word_t word5;
753 #endif
754 #endif
755 #endif
756 #endif
757 #endif
758 
759             /* Initialize the CRC for each braid. */
760             crc0 = crc;
761 #if N > 1
762             crc1 = 0;
763 #if N > 2
764             crc2 = 0;
765 #if N > 3
766             crc3 = 0;
767 #if N > 4
768             crc4 = 0;
769 #if N > 5
770             crc5 = 0;
771 #endif
772 #endif
773 #endif
774 #endif
775 #endif
776 
777             /*
778               Process the first blks-1 blocks, computing the CRCs on each braid
779               independently.
780              */
781             while (--blks) {
782                 /* Load the word for each braid into registers. */
783                 word0 = crc0 ^ words[0];
784 #if N > 1
785                 word1 = crc1 ^ words[1];
786 #if N > 2
787                 word2 = crc2 ^ words[2];
788 #if N > 3
789                 word3 = crc3 ^ words[3];
790 #if N > 4
791                 word4 = crc4 ^ words[4];
792 #if N > 5
793                 word5 = crc5 ^ words[5];
794 #endif
795 #endif
796 #endif
797 #endif
798 #endif
799                 words += N;
800 
801                 /* Compute and update the CRC for each word. The loop should
802                    get unrolled. */
803                 crc0 = crc_braid_table[0][word0 & 0xff];
804 #if N > 1
805                 crc1 = crc_braid_table[0][word1 & 0xff];
806 #if N > 2
807                 crc2 = crc_braid_table[0][word2 & 0xff];
808 #if N > 3
809                 crc3 = crc_braid_table[0][word3 & 0xff];
810 #if N > 4
811                 crc4 = crc_braid_table[0][word4 & 0xff];
812 #if N > 5
813                 crc5 = crc_braid_table[0][word5 & 0xff];
814 #endif
815 #endif
816 #endif
817 #endif
818 #endif
819                 for (k = 1; k < W; k++) {
820                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
821 #if N > 1
822                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
823 #if N > 2
824                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
825 #if N > 3
826                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
827 #if N > 4
828                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
829 #if N > 5
830                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
831 #endif
832 #endif
833 #endif
834 #endif
835 #endif
836                 }
837             }
838 
839             /*
840               Process the last block, combining the CRCs of the N braids at the
841               same time.
842              */
843             crc = crc_word(crc0 ^ words[0]);
844 #if N > 1
845             crc = crc_word(crc1 ^ words[1] ^ crc);
846 #if N > 2
847             crc = crc_word(crc2 ^ words[2] ^ crc);
848 #if N > 3
849             crc = crc_word(crc3 ^ words[3] ^ crc);
850 #if N > 4
851             crc = crc_word(crc4 ^ words[4] ^ crc);
852 #if N > 5
853             crc = crc_word(crc5 ^ words[5] ^ crc);
854 #endif
855 #endif
856 #endif
857 #endif
858 #endif
859             words += N;
860         }
861         else {
862             /* Big endian. */
863 
864             z_word_t crc0, word0, comb;
865 #if N > 1
866             z_word_t crc1, word1;
867 #if N > 2
868             z_word_t crc2, word2;
869 #if N > 3
870             z_word_t crc3, word3;
871 #if N > 4
872             z_word_t crc4, word4;
873 #if N > 5
874             z_word_t crc5, word5;
875 #endif
876 #endif
877 #endif
878 #endif
879 #endif
880 
881             /* Initialize the CRC for each braid. */
882             crc0 = byte_swap(crc);
883 #if N > 1
884             crc1 = 0;
885 #if N > 2
886             crc2 = 0;
887 #if N > 3
888             crc3 = 0;
889 #if N > 4
890             crc4 = 0;
891 #if N > 5
892             crc5 = 0;
893 #endif
894 #endif
895 #endif
896 #endif
897 #endif
898 
899             /*
900               Process the first blks-1 blocks, computing the CRCs on each braid
901               independently.
902              */
903             while (--blks) {
904                 /* Load the word for each braid into registers. */
905                 word0 = crc0 ^ words[0];
906 #if N > 1
907                 word1 = crc1 ^ words[1];
908 #if N > 2
909                 word2 = crc2 ^ words[2];
910 #if N > 3
911                 word3 = crc3 ^ words[3];
912 #if N > 4
913                 word4 = crc4 ^ words[4];
914 #if N > 5
915                 word5 = crc5 ^ words[5];
916 #endif
917 #endif
918 #endif
919 #endif
920 #endif
921                 words += N;
922 
923                 /* Compute and update the CRC for each word. The loop should
924                    get unrolled. */
925                 crc0 = crc_braid_big_table[0][word0 & 0xff];
926 #if N > 1
927                 crc1 = crc_braid_big_table[0][word1 & 0xff];
928 #if N > 2
929                 crc2 = crc_braid_big_table[0][word2 & 0xff];
930 #if N > 3
931                 crc3 = crc_braid_big_table[0][word3 & 0xff];
932 #if N > 4
933                 crc4 = crc_braid_big_table[0][word4 & 0xff];
934 #if N > 5
935                 crc5 = crc_braid_big_table[0][word5 & 0xff];
936 #endif
937 #endif
938 #endif
939 #endif
940 #endif
941                 for (k = 1; k < W; k++) {
942                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
943 #if N > 1
944                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
945 #if N > 2
946                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
947 #if N > 3
948                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
949 #if N > 4
950                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
951 #if N > 5
952                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
953 #endif
954 #endif
955 #endif
956 #endif
957 #endif
958                 }
959             }
960 
961             /*
962               Process the last block, combining the CRCs of the N braids at the
963               same time.
964              */
965             comb = crc_word_big(crc0 ^ words[0]);
966 #if N > 1
967             comb = crc_word_big(crc1 ^ words[1] ^ comb);
968 #if N > 2
969             comb = crc_word_big(crc2 ^ words[2] ^ comb);
970 #if N > 3
971             comb = crc_word_big(crc3 ^ words[3] ^ comb);
972 #if N > 4
973             comb = crc_word_big(crc4 ^ words[4] ^ comb);
974 #if N > 5
975             comb = crc_word_big(crc5 ^ words[5] ^ comb);
976 #endif
977 #endif
978 #endif
979 #endif
980 #endif
981             words += N;
982             crc = byte_swap(comb);
983         }
984 
985         /*
986           Update the pointer to the remaining bytes to process.
987          */
988         buf = (unsigned char const *)words;
989     }
990 
991 #endif /* W */
992 
993     /* Complete the computation of the CRC on any remaining bytes. */
994     while (len >= 8) {
995         len -= 8;
996         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
997         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
998         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
999         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1000         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1001         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1002         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1003         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1004     }
1005     while (len) {
1006         len--;
1007         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1008     }
1009 
1010     /* Return the CRC, post-conditioned. */
1011     return crc ^ 0xffffffff;
1012 }
1013 
1014 #endif
1015 
1016 /* ========================================================================= */
1017 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1018                             uInt len) {
1019     return crc32_z(crc, buf, len);
1020 }
1021 
1022 /* ========================================================================= */
1023 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1024 #ifdef DYNAMIC_CRC_TABLE
1025     once(&made, make_crc_table);
1026 #endif /* DYNAMIC_CRC_TABLE */
1027     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1028 }
1029 
1030 /* ========================================================================= */
1031 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1032     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1033 }
1034 
1035 /* ========================================================================= */
1036 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1037 #ifdef DYNAMIC_CRC_TABLE
1038     once(&made, make_crc_table);
1039 #endif /* DYNAMIC_CRC_TABLE */
1040     return x2nmodp(len2, 3);
1041 }
1042 
1043 /* ========================================================================= */
1044 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1045     return crc32_combine_gen64((z_off64_t)len2);
1046 }
1047 
1048 /* ========================================================================= */
1049 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1050     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1051 }
1052