xref: /openbsd-src/sys/lib/libz/crc32.c (revision 3374c67d44f9b75b98444cbf63020f777792342e)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /*
11   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
12   protection on the static variables used to control the first-use generation
13   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
14   first call get_crc_table() to initialize the tables before allowing more than
15   one thread to use crc32().
16 
17   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
18   produced, so that this one source file can be compiled to an executable.
19  */
20 
21 #ifdef MAKECRCH
22 #  include <stdio.h>
23 #  ifndef DYNAMIC_CRC_TABLE
24 #    define DYNAMIC_CRC_TABLE
25 #  endif /* !DYNAMIC_CRC_TABLE */
26 #endif /* MAKECRCH */
27 
28 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
29 
30  /*
31   A CRC of a message is computed on N braids of words in the message, where
32   each word consists of W bytes (4 or 8). If N is 3, for example, then three
33   running sparse CRCs are calculated respectively on each braid, at these
34   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
35   This is done starting at a word boundary, and continues until as many blocks
36   of N * W bytes as are available have been processed. The results are combined
37   into a single CRC at the end. For this code, N must be in the range 1..6 and
38   W must be 4 or 8. The upper limit on N can be increased if desired by adding
39   more #if blocks, extending the patterns apparent in the code. In addition,
40   crc32.h would need to be regenerated, if the maximum N value is increased.
41 
42   N and W are chosen empirically by benchmarking the execution time on a given
43   processor. The choices for N and W below were based on testing on Intel Kaby
44   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
45   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
46   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
47   They were all tested with either gcc or clang, all using the -O3 optimization
48   level. Your mileage may vary.
49  */
50 
51 /* Define N */
52 #ifdef Z_TESTN
53 #  define N Z_TESTN
54 #else
55 #  define N 5
56 #endif
57 #if N < 1 || N > 6
58 #  error N must be in 1..6
59 #endif
60 
61 /*
62   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
63   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
64   that bytes are eight bits.
65  */
66 
67 /*
68   Define W and the associated z_word_t type. If W is not defined, then a
69   braided calculation is not used, and the associated tables and code are not
70   compiled.
71  */
72 #ifdef Z_TESTW
73 #  if Z_TESTW-1 != -1
74 #    define W Z_TESTW
75 #  endif
76 #else
77 #  ifdef MAKECRCH
78 #    define W 8         /* required for MAKECRCH */
79 #  else
80 #    if defined(__x86_64__) || defined(__aarch64__)
81 #      define W 8
82 #    else
83 #      define W 4
84 #    endif
85 #  endif
86 #endif
87 #ifdef W
88 #  if W == 8 && defined(Z_U8)
89      typedef Z_U8 z_word_t;
90 #  elif defined(Z_U4)
91 #    undef W
92 #    define W 4
93      typedef Z_U4 z_word_t;
94 #  else
95 #    undef W
96 #  endif
97 #endif
98 
99 /* If available, use the ARM processor CRC32 instruction. */
100 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
101 #  define ARMCRC32
102 #endif
103 
104 /* Local functions. */
105 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
106 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
107 
108 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
109     local z_word_t byte_swap OF((z_word_t word));
110 #endif
111 
112 #if defined(W) && !defined(ARMCRC32)
113     local z_crc_t crc_word OF((z_word_t data));
114     local z_word_t crc_word_big OF((z_word_t data));
115 #endif
116 
117 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
118 /*
119   Swap the bytes in a z_word_t to convert between little and big endian. Any
120   self-respecting compiler will optimize this to a single machine byte-swap
121   instruction, if one is available. This assumes that word_t is either 32 bits
122   or 64 bits.
123  */
124 local z_word_t byte_swap(word)
125     z_word_t word;
126 {
127 #  if W == 8
128     return
129         (word & 0xff00000000000000) >> 56 |
130         (word & 0xff000000000000) >> 40 |
131         (word & 0xff0000000000) >> 24 |
132         (word & 0xff00000000) >> 8 |
133         (word & 0xff000000) << 8 |
134         (word & 0xff0000) << 24 |
135         (word & 0xff00) << 40 |
136         (word & 0xff) << 56;
137 #  else   /* W == 4 */
138     return
139         (word & 0xff000000) >> 24 |
140         (word & 0xff0000) >> 8 |
141         (word & 0xff00) << 8 |
142         (word & 0xff) << 24;
143 #  endif
144 }
145 #endif
146 
147 /* CRC polynomial. */
148 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
149 
150 #ifdef DYNAMIC_CRC_TABLE
151 
152 local z_crc_t FAR crc_table[256];
153 local z_crc_t FAR x2n_table[32];
154 local void make_crc_table OF((void));
155 #ifdef W
156    local z_word_t FAR crc_big_table[256];
157    local z_crc_t FAR crc_braid_table[W][256];
158    local z_word_t FAR crc_braid_big_table[W][256];
159    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
160 #endif
161 #ifdef MAKECRCH
162    local void write_table OF((FILE *, const z_crc_t FAR *, int));
163    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
164    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
165 #endif /* MAKECRCH */
166 
167 /*
168   Define a once() function depending on the availability of atomics. If this is
169   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
170   multiple threads, and if atomics are not available, then get_crc_table() must
171   be called to initialize the tables and must return before any threads are
172   allowed to compute or combine CRCs.
173  */
174 
175 /* Definition of once functionality. */
176 typedef struct once_s once_t;
177 local void once OF((once_t *, void (*)(void)));
178 
179 /* Check for the availability of atomics. */
180 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
181     !defined(__STDC_NO_ATOMICS__) && !defined(SMALL)
182 
183 #include <stdatomic.h>
184 
185 /* Structure for once(), which must be initialized with ONCE_INIT. */
186 struct once_s {
187     atomic_flag begun;
188     atomic_int done;
189 };
190 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
191 
192 /*
193   Run the provided init() function exactly once, even if multiple threads
194   invoke once() at the same time. The state must be a once_t initialized with
195   ONCE_INIT.
196  */
197 local void once(state, init)
198     once_t *state;
199     void (*init)(void);
200 {
201     if (!atomic_load(&state->done)) {
202         if (atomic_flag_test_and_set(&state->begun))
203             while (!atomic_load(&state->done))
204                 ;
205         else {
206             init();
207             atomic_store(&state->done, 1);
208         }
209     }
210 }
211 
212 #else   /* no atomics */
213 
214 /* Structure for once(), which must be initialized with ONCE_INIT. */
215 struct once_s {
216     volatile int begun;
217     volatile int done;
218 };
219 #define ONCE_INIT {0, 0}
220 
221 /* Test and set. Alas, not atomic, but tries to minimize the period of
222    vulnerability. */
223 local int test_and_set OF((int volatile *));
224 local int test_and_set(flag)
225     int volatile *flag;
226 {
227     int was;
228 
229     was = *flag;
230     *flag = 1;
231     return was;
232 }
233 
234 /* Run the provided init() function once. This is not thread-safe. */
235 local void once(state, init)
236     once_t *state;
237     void (*init)(void);
238 {
239     if (!state->done) {
240         if (test_and_set(&state->begun))
241             while (!state->done)
242                 ;
243         else {
244             init();
245             state->done = 1;
246         }
247     }
248 }
249 
250 #endif
251 
252 /* State for once(). */
253 local once_t made = ONCE_INIT;
254 
255 /*
256   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
257   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
258 
259   Polynomials over GF(2) are represented in binary, one bit per coefficient,
260   with the lowest powers in the most significant bit. Then adding polynomials
261   is just exclusive-or, and multiplying a polynomial by x is a right shift by
262   one. If we call the above polynomial p, and represent a byte as the
263   polynomial q, also with the lowest power in the most significant bit (so the
264   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
265   where a mod b means the remainder after dividing a by b.
266 
267   This calculation is done using the shift-register method of multiplying and
268   taking the remainder. The register is initialized to zero, and for each
269   incoming bit, x^32 is added mod p to the register if the bit is a one (where
270   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
271   (which is shifting right by one and adding x^32 mod p if the bit shifted out
272   is a one). We start with the highest power (least significant bit) of q and
273   repeat for all eight bits of q.
274 
275   The table is simply the CRC of all possible eight bit values. This is all the
276   information needed to generate CRCs on data a byte at a time for all
277   combinations of CRC register values and incoming bytes.
278  */
279 
280 local void make_crc_table()
281 {
282     unsigned i, j, n;
283     z_crc_t p;
284 
285     /* initialize the CRC of bytes tables */
286     for (i = 0; i < 256; i++) {
287         p = i;
288         for (j = 0; j < 8; j++)
289             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
290         crc_table[i] = p;
291 #ifdef W
292         crc_big_table[i] = byte_swap(p);
293 #endif
294     }
295 
296     /* initialize the x^2^n mod p(x) table */
297     p = (z_crc_t)1 << 30;         /* x^1 */
298     x2n_table[0] = p;
299     for (n = 1; n < 32; n++)
300         x2n_table[n] = p = multmodp(p, p);
301 
302 #ifdef W
303     /* initialize the braiding tables -- needs x2n_table[] */
304     braid(crc_braid_table, crc_braid_big_table, N, W);
305 #endif
306 
307 #ifdef MAKECRCH
308     {
309         /*
310           The crc32.h header file contains tables for both 32-bit and 64-bit
311           z_word_t's, and so requires a 64-bit type be available. In that case,
312           z_word_t must be defined to be 64-bits. This code then also generates
313           and writes out the tables for the case that z_word_t is 32 bits.
314          */
315 #if !defined(W) || W != 8
316 #  error Need a 64-bit integer type in order to generate crc32.h.
317 #endif
318         FILE *out;
319         int k, n;
320         z_crc_t ltl[8][256];
321         z_word_t big[8][256];
322 
323         out = fopen("crc32.h", "w");
324         if (out == NULL) return;
325 
326         /* write out little-endian CRC table to crc32.h */
327         fprintf(out,
328             "/* crc32.h -- tables for rapid CRC calculation\n"
329             " * Generated automatically by crc32.c\n */\n"
330             "\n"
331             "local const z_crc_t FAR crc_table[] = {\n"
332             "    ");
333         write_table(out, crc_table, 256);
334         fprintf(out,
335             "};\n");
336 
337         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
338         fprintf(out,
339             "\n"
340             "#ifdef W\n"
341             "\n"
342             "#if W == 8\n"
343             "\n"
344             "local const z_word_t FAR crc_big_table[] = {\n"
345             "    ");
346         write_table64(out, crc_big_table, 256);
347         fprintf(out,
348             "};\n");
349 
350         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
351         fprintf(out,
352             "\n"
353             "#else /* W == 4 */\n"
354             "\n"
355             "local const z_word_t FAR crc_big_table[] = {\n"
356             "    ");
357         write_table32hi(out, crc_big_table, 256);
358         fprintf(out,
359             "};\n"
360             "\n"
361             "#endif\n");
362 
363         /* write out braid tables for each value of N */
364         for (n = 1; n <= 6; n++) {
365             fprintf(out,
366             "\n"
367             "#if N == %d\n", n);
368 
369             /* compute braid tables for this N and 64-bit word_t */
370             braid(ltl, big, n, 8);
371 
372             /* write out braid tables for 64-bit z_word_t to crc32.h */
373             fprintf(out,
374             "\n"
375             "#if W == 8\n"
376             "\n"
377             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
378             for (k = 0; k < 8; k++) {
379                 fprintf(out, "   {");
380                 write_table(out, ltl[k], 256);
381                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
382             }
383             fprintf(out,
384             "};\n"
385             "\n"
386             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
387             for (k = 0; k < 8; k++) {
388                 fprintf(out, "   {");
389                 write_table64(out, big[k], 256);
390                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
391             }
392             fprintf(out,
393             "};\n");
394 
395             /* compute braid tables for this N and 32-bit word_t */
396             braid(ltl, big, n, 4);
397 
398             /* write out braid tables for 32-bit z_word_t to crc32.h */
399             fprintf(out,
400             "\n"
401             "#else /* W == 4 */\n"
402             "\n"
403             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
404             for (k = 0; k < 4; k++) {
405                 fprintf(out, "   {");
406                 write_table(out, ltl[k], 256);
407                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
408             }
409             fprintf(out,
410             "};\n"
411             "\n"
412             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
413             for (k = 0; k < 4; k++) {
414                 fprintf(out, "   {");
415                 write_table32hi(out, big[k], 256);
416                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
417             }
418             fprintf(out,
419             "};\n"
420             "\n"
421             "#endif\n"
422             "\n"
423             "#endif\n");
424         }
425         fprintf(out,
426             "\n"
427             "#endif\n");
428 
429         /* write out zeros operator table to crc32.h */
430         fprintf(out,
431             "\n"
432             "local const z_crc_t FAR x2n_table[] = {\n"
433             "    ");
434         write_table(out, x2n_table, 32);
435         fprintf(out,
436             "};\n");
437         fclose(out);
438     }
439 #endif /* MAKECRCH */
440 }
441 
442 #ifdef MAKECRCH
443 
444 /*
445    Write the 32-bit values in table[0..k-1] to out, five per line in
446    hexadecimal separated by commas.
447  */
448 local void write_table(out, table, k)
449     FILE *out;
450     const z_crc_t FAR *table;
451     int k;
452 {
453     int n;
454 
455     for (n = 0; n < k; n++)
456         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
457                 (unsigned long)(table[n]),
458                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
459 }
460 
461 /*
462    Write the high 32-bits of each value in table[0..k-1] to out, five per line
463    in hexadecimal separated by commas.
464  */
465 local void write_table32hi(out, table, k)
466 FILE *out;
467 const z_word_t FAR *table;
468 int k;
469 {
470     int n;
471 
472     for (n = 0; n < k; n++)
473         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
474                 (unsigned long)(table[n] >> 32),
475                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
476 }
477 
478 /*
479   Write the 64-bit values in table[0..k-1] to out, three per line in
480   hexadecimal separated by commas. This assumes that if there is a 64-bit
481   type, then there is also a long long integer type, and it is at least 64
482   bits. If not, then the type cast and format string can be adjusted
483   accordingly.
484  */
485 local void write_table64(out, table, k)
486     FILE *out;
487     const z_word_t FAR *table;
488     int k;
489 {
490     int n;
491 
492     for (n = 0; n < k; n++)
493         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
494                 (unsigned long long)(table[n]),
495                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
496 }
497 
498 /* Actually do the deed. */
499 int main()
500 {
501     make_crc_table();
502     return 0;
503 }
504 
505 #endif /* MAKECRCH */
506 
507 #ifdef W
508 /*
509   Generate the little and big-endian braid tables for the given n and z_word_t
510   size w. Each array must have room for w blocks of 256 elements.
511  */
512 local void braid(ltl, big, n, w)
513     z_crc_t ltl[][256];
514     z_word_t big[][256];
515     int n;
516     int w;
517 {
518     int k;
519     z_crc_t i, p, q;
520     for (k = 0; k < w; k++) {
521         p = x2nmodp((n * w + 3 - k) << 3, 0);
522         ltl[k][0] = 0;
523         big[w - 1 - k][0] = 0;
524         for (i = 1; i < 256; i++) {
525             ltl[k][i] = q = multmodp(i << 24, p);
526             big[w - 1 - k][i] = byte_swap(q);
527         }
528     }
529 }
530 #endif
531 
532 #else /* !DYNAMIC_CRC_TABLE */
533 /* ========================================================================
534  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
535  * of x for combining CRC-32s, all made by make_crc_table().
536  */
537 #include "crc32.h"
538 #endif /* DYNAMIC_CRC_TABLE */
539 
540 /* ========================================================================
541  * Routines used for CRC calculation. Some are also required for the table
542  * generation above.
543  */
544 
545 /*
546   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
547   reflected. For speed, this requires that a not be zero.
548  */
549 local z_crc_t multmodp(a, b)
550     z_crc_t a;
551     z_crc_t b;
552 {
553     z_crc_t m, p;
554 
555     m = (z_crc_t)1 << 31;
556     p = 0;
557     for (;;) {
558         if (a & m) {
559             p ^= b;
560             if ((a & (m - 1)) == 0)
561                 break;
562         }
563         m >>= 1;
564         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
565     }
566     return p;
567 }
568 
569 /*
570   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
571   initialized.
572  */
573 local z_crc_t x2nmodp(n, k)
574     z_off64_t n;
575     unsigned k;
576 {
577     z_crc_t p;
578 
579     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
580     while (n) {
581         if (n & 1)
582             p = multmodp(x2n_table[k & 31], p);
583         n >>= 1;
584         k++;
585     }
586     return p;
587 }
588 
589 /* =========================================================================
590  * This function can be used by asm versions of crc32(), and to force the
591  * generation of the CRC tables in a threaded application.
592  */
593 const z_crc_t FAR * ZEXPORT get_crc_table()
594 {
595 #ifdef DYNAMIC_CRC_TABLE
596     once(&made, make_crc_table);
597 #endif /* DYNAMIC_CRC_TABLE */
598     return (const z_crc_t FAR *)crc_table;
599 }
600 
601 /* =========================================================================
602  * Use ARM machine instructions if available. This will compute the CRC about
603  * ten times faster than the braided calculation. This code does not check for
604  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
605  * only be defined if the compilation specifies an ARM processor architecture
606  * that has the instructions. For example, compiling with -march=armv8.1-a or
607  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
608  * instructions.
609  */
610 #ifdef ARMCRC32
611 
612 /*
613    Constants empirically determined to maximize speed. These values are from
614    measurements on a Cortex-A57. Your mileage may vary.
615  */
616 #define Z_BATCH 3990                /* number of words in a batch */
617 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
618 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
619 
620 unsigned long ZEXPORT crc32_z(crc, buf, len)
621     unsigned long crc;
622     const unsigned char FAR *buf;
623     z_size_t len;
624 {
625     z_crc_t val;
626     z_word_t crc1, crc2;
627     const z_word_t *word;
628     z_word_t val0, val1, val2;
629     z_size_t last, last2, i;
630     z_size_t num;
631 
632     /* Return initial CRC, if requested. */
633     if (buf == Z_NULL) return 0;
634 
635 #ifdef DYNAMIC_CRC_TABLE
636     once(&made, make_crc_table);
637 #endif /* DYNAMIC_CRC_TABLE */
638 
639     /* Pre-condition the CRC */
640     crc = (~crc) & 0xffffffff;
641 
642     /* Compute the CRC up to a word boundary. */
643     while (len && ((z_size_t)buf & 7) != 0) {
644         len--;
645         val = *buf++;
646         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
647     }
648 
649     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
650     word = (z_word_t const *)buf;
651     num = len >> 3;
652     len &= 7;
653 
654     /* Do three interleaved CRCs to realize the throughput of one crc32x
655        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
656        three CRCs are combined into a single CRC after each set of batches. */
657     while (num >= 3 * Z_BATCH) {
658         crc1 = 0;
659         crc2 = 0;
660         for (i = 0; i < Z_BATCH; i++) {
661             val0 = word[i];
662             val1 = word[i + Z_BATCH];
663             val2 = word[i + 2 * Z_BATCH];
664             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
665             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
666             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
667         }
668         word += 3 * Z_BATCH;
669         num -= 3 * Z_BATCH;
670         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
671         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
672     }
673 
674     /* Do one last smaller batch with the remaining words, if there are enough
675        to pay for the combination of CRCs. */
676     last = num / 3;
677     if (last >= Z_BATCH_MIN) {
678         last2 = last << 1;
679         crc1 = 0;
680         crc2 = 0;
681         for (i = 0; i < last; i++) {
682             val0 = word[i];
683             val1 = word[i + last];
684             val2 = word[i + last2];
685             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
686             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
687             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
688         }
689         word += 3 * last;
690         num -= 3 * last;
691         val = x2nmodp(last, 6);
692         crc = multmodp(val, crc) ^ crc1;
693         crc = multmodp(val, crc) ^ crc2;
694     }
695 
696     /* Compute the CRC on any remaining words. */
697     for (i = 0; i < num; i++) {
698         val0 = word[i];
699         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
700     }
701     word += num;
702 
703     /* Complete the CRC on any remaining bytes. */
704     buf = (const unsigned char FAR *)word;
705     while (len) {
706         len--;
707         val = *buf++;
708         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
709     }
710 
711     /* Return the CRC, post-conditioned. */
712     return crc ^ 0xffffffff;
713 }
714 
715 #else
716 
717 #ifdef W
718 
719 /*
720   Return the CRC of the W bytes in the word_t data, taking the
721   least-significant byte of the word as the first byte of data, without any pre
722   or post conditioning. This is used to combine the CRCs of each braid.
723  */
724 local z_crc_t crc_word(data)
725     z_word_t data;
726 {
727     int k;
728     for (k = 0; k < W; k++)
729         data = (data >> 8) ^ crc_table[data & 0xff];
730     return (z_crc_t)data;
731 }
732 
733 local z_word_t crc_word_big(data)
734     z_word_t data;
735 {
736     int k;
737     for (k = 0; k < W; k++)
738         data = (data << 8) ^
739             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
740     return data;
741 }
742 
743 #endif
744 
745 /* ========================================================================= */
746 unsigned long ZEXPORT crc32_z(crc, buf, len)
747     unsigned long crc;
748     const unsigned char FAR *buf;
749     z_size_t len;
750 {
751     /* Return initial CRC, if requested. */
752     if (buf == Z_NULL) return 0;
753 
754 #ifdef DYNAMIC_CRC_TABLE
755     once(&made, make_crc_table);
756 #endif /* DYNAMIC_CRC_TABLE */
757 
758     /* Pre-condition the CRC */
759     crc = (~crc) & 0xffffffff;
760 
761 #ifdef W
762 
763     /* If provided enough bytes, do a braided CRC calculation. */
764     if (len >= N * W + W - 1) {
765         z_size_t blks;
766         z_word_t const *words;
767         unsigned endian;
768         int k;
769 
770         /* Compute the CRC up to a z_word_t boundary. */
771         while (len && ((z_size_t)buf & (W - 1)) != 0) {
772             len--;
773             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
774         }
775 
776         /* Compute the CRC on as many N z_word_t blocks as are available. */
777         blks = len / (N * W);
778         len -= blks * N * W;
779         words = (z_word_t const *)buf;
780 
781         /* Do endian check at execution time instead of compile time, since ARM
782            processors can change the endianess at execution time. If the
783            compiler knows what the endianess will be, it can optimize out the
784            check and the unused branch. */
785         endian = 1;
786         if (*(unsigned char *)&endian) {
787             /* Little endian. */
788 
789             z_crc_t crc0;
790             z_word_t word0;
791 #if N > 1
792             z_crc_t crc1;
793             z_word_t word1;
794 #if N > 2
795             z_crc_t crc2;
796             z_word_t word2;
797 #if N > 3
798             z_crc_t crc3;
799             z_word_t word3;
800 #if N > 4
801             z_crc_t crc4;
802             z_word_t word4;
803 #if N > 5
804             z_crc_t crc5;
805             z_word_t word5;
806 #endif
807 #endif
808 #endif
809 #endif
810 #endif
811 
812             /* Initialize the CRC for each braid. */
813             crc0 = crc;
814 #if N > 1
815             crc1 = 0;
816 #if N > 2
817             crc2 = 0;
818 #if N > 3
819             crc3 = 0;
820 #if N > 4
821             crc4 = 0;
822 #if N > 5
823             crc5 = 0;
824 #endif
825 #endif
826 #endif
827 #endif
828 #endif
829 
830             /*
831               Process the first blks-1 blocks, computing the CRCs on each braid
832               independently.
833              */
834             while (--blks) {
835                 /* Load the word for each braid into registers. */
836                 word0 = crc0 ^ words[0];
837 #if N > 1
838                 word1 = crc1 ^ words[1];
839 #if N > 2
840                 word2 = crc2 ^ words[2];
841 #if N > 3
842                 word3 = crc3 ^ words[3];
843 #if N > 4
844                 word4 = crc4 ^ words[4];
845 #if N > 5
846                 word5 = crc5 ^ words[5];
847 #endif
848 #endif
849 #endif
850 #endif
851 #endif
852                 words += N;
853 
854                 /* Compute and update the CRC for each word. The loop should
855                    get unrolled. */
856                 crc0 = crc_braid_table[0][word0 & 0xff];
857 #if N > 1
858                 crc1 = crc_braid_table[0][word1 & 0xff];
859 #if N > 2
860                 crc2 = crc_braid_table[0][word2 & 0xff];
861 #if N > 3
862                 crc3 = crc_braid_table[0][word3 & 0xff];
863 #if N > 4
864                 crc4 = crc_braid_table[0][word4 & 0xff];
865 #if N > 5
866                 crc5 = crc_braid_table[0][word5 & 0xff];
867 #endif
868 #endif
869 #endif
870 #endif
871 #endif
872                 for (k = 1; k < W; k++) {
873                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
874 #if N > 1
875                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
876 #if N > 2
877                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
878 #if N > 3
879                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
880 #if N > 4
881                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
882 #if N > 5
883                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
884 #endif
885 #endif
886 #endif
887 #endif
888 #endif
889                 }
890             }
891 
892             /*
893               Process the last block, combining the CRCs of the N braids at the
894               same time.
895              */
896             crc = crc_word(crc0 ^ words[0]);
897 #if N > 1
898             crc = crc_word(crc1 ^ words[1] ^ crc);
899 #if N > 2
900             crc = crc_word(crc2 ^ words[2] ^ crc);
901 #if N > 3
902             crc = crc_word(crc3 ^ words[3] ^ crc);
903 #if N > 4
904             crc = crc_word(crc4 ^ words[4] ^ crc);
905 #if N > 5
906             crc = crc_word(crc5 ^ words[5] ^ crc);
907 #endif
908 #endif
909 #endif
910 #endif
911 #endif
912             words += N;
913         }
914         else {
915             /* Big endian. */
916 
917             z_word_t crc0, word0, comb;
918 #if N > 1
919             z_word_t crc1, word1;
920 #if N > 2
921             z_word_t crc2, word2;
922 #if N > 3
923             z_word_t crc3, word3;
924 #if N > 4
925             z_word_t crc4, word4;
926 #if N > 5
927             z_word_t crc5, word5;
928 #endif
929 #endif
930 #endif
931 #endif
932 #endif
933 
934             /* Initialize the CRC for each braid. */
935             crc0 = byte_swap(crc);
936 #if N > 1
937             crc1 = 0;
938 #if N > 2
939             crc2 = 0;
940 #if N > 3
941             crc3 = 0;
942 #if N > 4
943             crc4 = 0;
944 #if N > 5
945             crc5 = 0;
946 #endif
947 #endif
948 #endif
949 #endif
950 #endif
951 
952             /*
953               Process the first blks-1 blocks, computing the CRCs on each braid
954               independently.
955              */
956             while (--blks) {
957                 /* Load the word for each braid into registers. */
958                 word0 = crc0 ^ words[0];
959 #if N > 1
960                 word1 = crc1 ^ words[1];
961 #if N > 2
962                 word2 = crc2 ^ words[2];
963 #if N > 3
964                 word3 = crc3 ^ words[3];
965 #if N > 4
966                 word4 = crc4 ^ words[4];
967 #if N > 5
968                 word5 = crc5 ^ words[5];
969 #endif
970 #endif
971 #endif
972 #endif
973 #endif
974                 words += N;
975 
976                 /* Compute and update the CRC for each word. The loop should
977                    get unrolled. */
978                 crc0 = crc_braid_big_table[0][word0 & 0xff];
979 #if N > 1
980                 crc1 = crc_braid_big_table[0][word1 & 0xff];
981 #if N > 2
982                 crc2 = crc_braid_big_table[0][word2 & 0xff];
983 #if N > 3
984                 crc3 = crc_braid_big_table[0][word3 & 0xff];
985 #if N > 4
986                 crc4 = crc_braid_big_table[0][word4 & 0xff];
987 #if N > 5
988                 crc5 = crc_braid_big_table[0][word5 & 0xff];
989 #endif
990 #endif
991 #endif
992 #endif
993 #endif
994                 for (k = 1; k < W; k++) {
995                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
996 #if N > 1
997                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
998 #if N > 2
999                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1000 #if N > 3
1001                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1002 #if N > 4
1003                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1004 #if N > 5
1005                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1006 #endif
1007 #endif
1008 #endif
1009 #endif
1010 #endif
1011                 }
1012             }
1013 
1014             /*
1015               Process the last block, combining the CRCs of the N braids at the
1016               same time.
1017              */
1018             comb = crc_word_big(crc0 ^ words[0]);
1019 #if N > 1
1020             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1021 #if N > 2
1022             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1023 #if N > 3
1024             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1025 #if N > 4
1026             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1027 #if N > 5
1028             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1029 #endif
1030 #endif
1031 #endif
1032 #endif
1033 #endif
1034             words += N;
1035             crc = byte_swap(comb);
1036         }
1037 
1038         /*
1039           Update the pointer to the remaining bytes to process.
1040          */
1041         buf = (unsigned char const *)words;
1042     }
1043 
1044 #endif /* W */
1045 
1046     /* Complete the computation of the CRC on any remaining bytes. */
1047     while (len >= 8) {
1048         len -= 8;
1049         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1051         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1057     }
1058     while (len) {
1059         len--;
1060         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1061     }
1062 
1063     /* Return the CRC, post-conditioned. */
1064     return crc ^ 0xffffffff;
1065 }
1066 
1067 #endif
1068 
1069 /* ========================================================================= */
1070 unsigned long ZEXPORT crc32(crc, buf, len)
1071     unsigned long crc;
1072     const unsigned char FAR *buf;
1073     uInt len;
1074 {
1075     return crc32_z(crc, buf, len);
1076 }
1077 
1078 /* ========================================================================= */
1079 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1080     uLong crc1;
1081     uLong crc2;
1082     z_off64_t len2;
1083 {
1084 #ifdef DYNAMIC_CRC_TABLE
1085     once(&made, make_crc_table);
1086 #endif /* DYNAMIC_CRC_TABLE */
1087     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1088 }
1089 
1090 /* ========================================================================= */
1091 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1092     uLong crc1;
1093     uLong crc2;
1094     z_off_t len2;
1095 {
1096     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1097 }
1098 
1099 /* ========================================================================= */
1100 uLong ZEXPORT crc32_combine_gen64(len2)
1101     z_off64_t len2;
1102 {
1103 #ifdef DYNAMIC_CRC_TABLE
1104     once(&made, make_crc_table);
1105 #endif /* DYNAMIC_CRC_TABLE */
1106     return x2nmodp(len2, 3);
1107 }
1108 
1109 /* ========================================================================= */
1110 uLong ZEXPORT crc32_combine_gen(len2)
1111     z_off_t len2;
1112 {
1113     return crc32_combine_gen64((z_off64_t)len2);
1114 }
1115 
1116 /* ========================================================================= */
1117 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1118     uLong crc1;
1119     uLong crc2;
1120     uLong op;
1121 {
1122     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1123 }
1124