xref: /netbsd-src/common/dist/zlib/crc32.c (revision c42dbd0ed2e61fe6eda8590caa852ccf34719964)
1 /*	$NetBSD: crc32.c,v 1.6 2022/10/15 19:49:32 christos Exp $	*/
2 
3 /* crc32.c -- compute the CRC-32 of a data stream
4  * Copyright (C) 1995-2022 Mark Adler
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  *
7  * This interleaved implementation of a CRC makes use of pipelined multiple
8  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
9  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
10  */
11 
12 /* @(#) Id */
13 
14 /*
15   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
16   protection on the static variables used to control the first-use generation
17   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
18   first call get_crc_table() to initialize the tables before allowing more than
19   one thread to use crc32().
20 
21   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
22   produced, so that this one source file can be compiled to an executable.
23  */
24 
25 #ifdef MAKECRCH
26 #  include <stdio.h>
27 #  ifndef DYNAMIC_CRC_TABLE
28 #    define DYNAMIC_CRC_TABLE
29 #  endif /* !DYNAMIC_CRC_TABLE */
30 #endif /* MAKECRCH */
31 
32 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33 
34  /*
35   A CRC of a message is computed on N braids of words in the message, where
36   each word consists of W bytes (4 or 8). If N is 3, for example, then three
37   running sparse CRCs are calculated respectively on each braid, at these
38   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
39   This is done starting at a word boundary, and continues until as many blocks
40   of N * W bytes as are available have been processed. The results are combined
41   into a single CRC at the end. For this code, N must be in the range 1..6 and
42   W must be 4 or 8. The upper limit on N can be increased if desired by adding
43   more #if blocks, extending the patterns apparent in the code. In addition,
44   crc32.h would need to be regenerated, if the maximum N value is increased.
45 
46   N and W are chosen empirically by benchmarking the execution time on a given
47   processor. The choices for N and W below were based on testing on Intel Kaby
48   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
49   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
50   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
51   They were all tested with either gcc or clang, all using the -O3 optimization
52   level. Your mileage may vary.
53  */
54 
55 /* Define N */
56 #ifdef Z_TESTN
57 #  define N Z_TESTN
58 #else
59 #  define N 5
60 #endif
61 #if N < 1 || N > 6
62 #  error N must be in 1..6
63 #endif
64 
65 /*
66   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
67   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
68   that bytes are eight bits.
69  */
70 
71 /*
72   Define W and the associated z_word_t type. If W is not defined, then a
73   braided calculation is not used, and the associated tables and code are not
74   compiled.
75  */
76 #ifdef Z_TESTW
77 #  if Z_TESTW-1 != -1
78 #    define W Z_TESTW
79 #  endif
80 #else
81 #  ifdef MAKECRCH
82 #    define W 8         /* required for MAKECRCH */
83 #  else
84 #    if defined(__x86_64__) || defined(__aarch64__)
85 #      define W 8
86 #    else
87 #      define W 4
88 #    endif
89 #  endif
90 #endif
91 #ifdef W
92 #  if W == 8 && defined(Z_U8)
93      typedef Z_U8 z_word_t;
94 #  elif defined(Z_U4)
95 #    undef W
96 #    define W 4
97      typedef Z_U4 z_word_t;
98 #  else
99 #    undef W
100 #  endif
101 #endif
102 
103 /* If available, use the ARM processor CRC32 instruction. */
104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
105 #  define ARMCRC32
106 #endif
107 
108 /* Local functions. */
109 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
110 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
111 
112 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
113     local z_word_t byte_swap OF((z_word_t word));
114 #endif
115 
116 #if defined(W) && !defined(ARMCRC32)
117     local z_crc_t crc_word OF((z_word_t data));
118     local z_word_t crc_word_big OF((z_word_t data));
119 #endif
120 
121 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
122 /*
123   Swap the bytes in a z_word_t to convert between little and big endian. Any
124   self-respecting compiler will optimize this to a single machine byte-swap
125   instruction, if one is available. This assumes that word_t is either 32 bits
126   or 64 bits.
127  */
128 local z_word_t byte_swap(word)
129     z_word_t word;
130 {
131 #  if W == 8
132     return
133         (word & 0xff00000000000000) >> 56 |
134         (word & 0xff000000000000) >> 40 |
135         (word & 0xff0000000000) >> 24 |
136         (word & 0xff00000000) >> 8 |
137         (word & 0xff000000) << 8 |
138         (word & 0xff0000) << 24 |
139         (word & 0xff00) << 40 |
140         (word & 0xff) << 56;
141 #  else   /* W == 4 */
142     return
143         (word & 0xff000000) >> 24 |
144         (word & 0xff0000) >> 8 |
145         (word & 0xff00) << 8 |
146         (word & 0xff) << 24;
147 #  endif
148 }
149 #endif
150 
151 /* CRC polynomial. */
152 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
153 
154 #ifdef DYNAMIC_CRC_TABLE
155 
156 local z_crc_t FAR crc_table[256];
157 local z_crc_t FAR x2n_table[32];
158 local void make_crc_table OF((void));
159 #ifdef W
160    local z_word_t FAR crc_big_table[256];
161    local z_crc_t FAR crc_braid_table[W][256];
162    local z_word_t FAR crc_braid_big_table[W][256];
163    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
164 #endif
165 #ifdef MAKECRCH
166    local void write_table OF((FILE *, const z_crc_t FAR *, int));
167    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
168    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
169 #endif /* MAKECRCH */
170 
171 /*
172   Define a once() function depending on the availability of atomics. If this is
173   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
174   multiple threads, and if atomics are not available, then get_crc_table() must
175   be called to initialize the tables and must return before any threads are
176   allowed to compute or combine CRCs.
177  */
178 
179 /* Definition of once functionality. */
180 typedef struct once_s once_t;
181 local void once OF((once_t *, void (*)(void)));
182 
183 /* Check for the availability of atomics. */
184 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
185     !defined(__STDC_NO_ATOMICS__)
186 
187 #include <stdatomic.h>
188 
189 /* Structure for once(), which must be initialized with ONCE_INIT. */
190 struct once_s {
191     atomic_flag begun;
192     atomic_int done;
193 };
194 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
195 
196 /*
197   Run the provided init() function exactly once, even if multiple threads
198   invoke once() at the same time. The state must be a once_t initialized with
199   ONCE_INIT.
200  */
201 local void once(state, init)
202     once_t *state;
203     void (*init)(void);
204 {
205     if (!atomic_load(&state->done)) {
206         if (atomic_flag_test_and_set(&state->begun))
207             while (!atomic_load(&state->done))
208                 ;
209         else {
210             init();
211             atomic_store(&state->done, 1);
212         }
213     }
214 }
215 
216 #else   /* no atomics */
217 
218 /* Structure for once(), which must be initialized with ONCE_INIT. */
219 struct once_s {
220     volatile int begun;
221     volatile int done;
222 };
223 #define ONCE_INIT {0, 0}
224 
225 /* Test and set. Alas, not atomic, but tries to minimize the period of
226    vulnerability. */
227 local int test_and_set OF((int volatile *));
228 local int test_and_set(flag)
229     int volatile *flag;
230 {
231     int was;
232 
233     was = *flag;
234     *flag = 1;
235     return was;
236 }
237 
238 /* Run the provided init() function once. This is not thread-safe. */
239 local void once(state, init)
240     once_t *state;
241     void (*init)(void);
242 {
243     if (!state->done) {
244         if (test_and_set(&state->begun))
245             while (!state->done)
246                 ;
247         else {
248             init();
249             state->done = 1;
250         }
251     }
252 }
253 
254 #endif
255 
256 /* State for once(). */
257 local once_t made = ONCE_INIT;
258 
259 /*
260   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
261   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
262 
263   Polynomials over GF(2) are represented in binary, one bit per coefficient,
264   with the lowest powers in the most significant bit. Then adding polynomials
265   is just exclusive-or, and multiplying a polynomial by x is a right shift by
266   one. If we call the above polynomial p, and represent a byte as the
267   polynomial q, also with the lowest power in the most significant bit (so the
268   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
269   where a mod b means the remainder after dividing a by b.
270 
271   This calculation is done using the shift-register method of multiplying and
272   taking the remainder. The register is initialized to zero, and for each
273   incoming bit, x^32 is added mod p to the register if the bit is a one (where
274   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
275   (which is shifting right by one and adding x^32 mod p if the bit shifted out
276   is a one). We start with the highest power (least significant bit) of q and
277   repeat for all eight bits of q.
278 
279   The table is simply the CRC of all possible eight bit values. This is all the
280   information needed to generate CRCs on data a byte at a time for all
281   combinations of CRC register values and incoming bytes.
282  */
283 
284 local void make_crc_table()
285 {
286     unsigned i, j, n;
287     z_crc_t p;
288 
289     /* initialize the CRC of bytes tables */
290     for (i = 0; i < 256; i++) {
291         p = i;
292         for (j = 0; j < 8; j++)
293             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
294         crc_table[i] = p;
295 #ifdef W
296         crc_big_table[i] = byte_swap(p);
297 #endif
298     }
299 
300     /* initialize the x^2^n mod p(x) table */
301     p = (z_crc_t)1 << 30;         /* x^1 */
302     x2n_table[0] = p;
303     for (n = 1; n < 32; n++)
304         x2n_table[n] = p = multmodp(p, p);
305 
306 #ifdef W
307     /* initialize the braiding tables -- needs x2n_table[] */
308     braid(crc_braid_table, crc_braid_big_table, N, W);
309 #endif
310 
311 #ifdef MAKECRCH
312     {
313         /*
314           The crc32.h header file contains tables for both 32-bit and 64-bit
315           z_word_t's, and so requires a 64-bit type be available. In that case,
316           z_word_t must be defined to be 64-bits. This code then also generates
317           and writes out the tables for the case that z_word_t is 32 bits.
318          */
319 #if !defined(W) || W != 8
320 #  error Need a 64-bit integer type in order to generate crc32.h.
321 #endif
322         FILE *out;
323         int k, n;
324         z_crc_t ltl[8][256];
325         z_word_t big[8][256];
326 
327         out = fopen("crc32.h", "w");
328         if (out == NULL) return;
329 
330         /* write out little-endian CRC table to crc32.h */
331         fprintf(out,
332             "/* crc32.h -- tables for rapid CRC calculation\n"
333             " * Generated automatically by crc32.c\n */\n"
334             "\n"
335             "local const z_crc_t FAR crc_table[] = {\n"
336             "    ");
337         write_table(out, crc_table, 256);
338         fprintf(out,
339             "};\n");
340 
341         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
342         fprintf(out,
343             "\n"
344             "#ifdef W\n"
345             "\n"
346             "#if W == 8\n"
347             "\n"
348             "local const z_word_t FAR crc_big_table[] = {\n"
349             "    ");
350         write_table64(out, crc_big_table, 256);
351         fprintf(out,
352             "};\n");
353 
354         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
355         fprintf(out,
356             "\n"
357             "#else /* W == 4 */\n"
358             "\n"
359             "local const z_word_t FAR crc_big_table[] = {\n"
360             "    ");
361         write_table32hi(out, crc_big_table, 256);
362         fprintf(out,
363             "};\n"
364             "\n"
365             "#endif\n");
366 
367         /* write out braid tables for each value of N */
368         for (n = 1; n <= 6; n++) {
369             fprintf(out,
370             "\n"
371             "#if N == %d\n", n);
372 
373             /* compute braid tables for this N and 64-bit word_t */
374             braid(ltl, big, n, 8);
375 
376             /* write out braid tables for 64-bit z_word_t to crc32.h */
377             fprintf(out,
378             "\n"
379             "#if W == 8\n"
380             "\n"
381             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
382             for (k = 0; k < 8; k++) {
383                 fprintf(out, "   {");
384                 write_table(out, ltl[k], 256);
385                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
386             }
387             fprintf(out,
388             "};\n"
389             "\n"
390             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
391             for (k = 0; k < 8; k++) {
392                 fprintf(out, "   {");
393                 write_table64(out, big[k], 256);
394                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
395             }
396             fprintf(out,
397             "};\n");
398 
399             /* compute braid tables for this N and 32-bit word_t */
400             braid(ltl, big, n, 4);
401 
402             /* write out braid tables for 32-bit z_word_t to crc32.h */
403             fprintf(out,
404             "\n"
405             "#else /* W == 4 */\n"
406             "\n"
407             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
408             for (k = 0; k < 4; k++) {
409                 fprintf(out, "   {");
410                 write_table(out, ltl[k], 256);
411                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
412             }
413             fprintf(out,
414             "};\n"
415             "\n"
416             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
417             for (k = 0; k < 4; k++) {
418                 fprintf(out, "   {");
419                 write_table32hi(out, big[k], 256);
420                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
421             }
422             fprintf(out,
423             "};\n"
424             "\n"
425             "#endif\n"
426             "\n"
427             "#endif\n");
428         }
429         fprintf(out,
430             "\n"
431             "#endif\n");
432 
433         /* write out zeros operator table to crc32.h */
434         fprintf(out,
435             "\n"
436             "local const z_crc_t FAR x2n_table[] = {\n"
437             "    ");
438         write_table(out, x2n_table, 32);
439         fprintf(out,
440             "};\n");
441         fclose(out);
442     }
443 #endif /* MAKECRCH */
444 }
445 
446 #ifdef MAKECRCH
447 
448 /*
449    Write the 32-bit values in table[0..k-1] to out, five per line in
450    hexadecimal separated by commas.
451  */
452 local void write_table(out, table, k)
453     FILE *out;
454     const z_crc_t FAR *table;
455     int k;
456 {
457     int n;
458 
459     for (n = 0; n < k; n++)
460         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
461                 (unsigned long)(table[n]),
462                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
463 }
464 
465 /*
466    Write the high 32-bits of each value in table[0..k-1] to out, five per line
467    in hexadecimal separated by commas.
468  */
469 local void write_table32hi(out, table, k)
470 FILE *out;
471 const z_word_t FAR *table;
472 int k;
473 {
474     int n;
475 
476     for (n = 0; n < k; n++)
477         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
478                 (unsigned long)(table[n] >> 32),
479                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
480 }
481 
482 /*
483   Write the 64-bit values in table[0..k-1] to out, three per line in
484   hexadecimal separated by commas. This assumes that if there is a 64-bit
485   type, then there is also a long long integer type, and it is at least 64
486   bits. If not, then the type cast and format string can be adjusted
487   accordingly.
488  */
489 local void write_table64(out, table, k)
490     FILE *out;
491     const z_word_t FAR *table;
492     int k;
493 {
494     int n;
495 
496     for (n = 0; n < k; n++)
497         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
498                 (unsigned long long)(table[n]),
499                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
500 }
501 
502 /* Actually do the deed. */
503 int main()
504 {
505     make_crc_table();
506     return 0;
507 }
508 
509 #endif /* MAKECRCH */
510 
511 #ifdef W
512 /*
513   Generate the little and big-endian braid tables for the given n and z_word_t
514   size w. Each array must have room for w blocks of 256 elements.
515  */
516 local void braid(ltl, big, n, w)
517     z_crc_t ltl[][256];
518     z_word_t big[][256];
519     int n;
520     int w;
521 {
522     int k;
523     z_crc_t i, p, q;
524     for (k = 0; k < w; k++) {
525         p = x2nmodp((n * w + 3 - k) << 3, 0);
526         ltl[k][0] = 0;
527         big[w - 1 - k][0] = 0;
528         for (i = 1; i < 256; i++) {
529             ltl[k][i] = q = multmodp(i << 24, p);
530             big[w - 1 - k][i] = byte_swap(q);
531         }
532     }
533 }
534 #endif
535 
536 #else /* !DYNAMIC_CRC_TABLE */
537 /* ========================================================================
538  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
539  * of x for combining CRC-32s, all made by make_crc_table().
540  */
541 #include "crc32.h"
542 #endif /* DYNAMIC_CRC_TABLE */
543 
544 /* ========================================================================
545  * Routines used for CRC calculation. Some are also required for the table
546  * generation above.
547  */
548 
549 /*
550   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
551   reflected. For speed, this requires that a not be zero.
552  */
553 local z_crc_t multmodp(a, b)
554     z_crc_t a;
555     z_crc_t b;
556 {
557     z_crc_t m, p;
558 
559     m = (z_crc_t)1 << 31;
560     p = 0;
561     for (;;) {
562         if (a & m) {
563             p ^= b;
564             if ((a & (m - 1)) == 0)
565                 break;
566         }
567         m >>= 1;
568         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
569     }
570     return p;
571 }
572 
573 /*
574   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
575   initialized.
576  */
577 local z_crc_t x2nmodp(n, k)
578     z_off64_t n;
579     unsigned k;
580 {
581     z_crc_t p;
582 
583     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
584     while (n) {
585         if (n & 1)
586             p = multmodp(x2n_table[k & 31], p);
587         n >>= 1;
588         k++;
589     }
590     return p;
591 }
592 
593 /* =========================================================================
594  * This function can be used by asm versions of crc32(), and to force the
595  * generation of the CRC tables in a threaded application.
596  */
597 const z_crc_t FAR * ZEXPORT get_crc_table()
598 {
599 #ifdef DYNAMIC_CRC_TABLE
600     once(&made, make_crc_table);
601 #endif /* DYNAMIC_CRC_TABLE */
602     return (const z_crc_t FAR *)crc_table;
603 }
604 
605 /* =========================================================================
606  * Use ARM machine instructions if available. This will compute the CRC about
607  * ten times faster than the braided calculation. This code does not check for
608  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
609  * only be defined if the compilation specifies an ARM processor architecture
610  * that has the instructions. For example, compiling with -march=armv8.1-a or
611  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
612  * instructions.
613  */
614 #ifdef ARMCRC32
615 
616 /*
617    Constants empirically determined to maximize speed. These values are from
618    measurements on a Cortex-A57. Your mileage may vary.
619  */
620 #define Z_BATCH 3990                /* number of words in a batch */
621 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
622 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
623 
624 unsigned long ZEXPORT crc32_z(crc, buf, len)
625     unsigned long crc;
626     const unsigned char FAR *buf;
627     z_size_t len;
628 {
629     z_crc_t val;
630     z_word_t crc1, crc2;
631     const z_word_t *word;
632     z_word_t val0, val1, val2;
633     z_size_t last, last2, i;
634     z_size_t num;
635 
636     /* Return initial CRC, if requested. */
637     if (buf == Z_NULL) return 0;
638 
639 #ifdef DYNAMIC_CRC_TABLE
640     once(&made, make_crc_table);
641 #endif /* DYNAMIC_CRC_TABLE */
642 
643     /* Pre-condition the CRC */
644     crc = (~crc) & 0xffffffff;
645 
646     /* Compute the CRC up to a word boundary. */
647     while (len && ((z_size_t)buf & 7) != 0) {
648         len--;
649         val = *buf++;
650         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
651     }
652 
653     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
654     word = (z_word_t const *)buf;
655     num = len >> 3;
656     len &= 7;
657 
658     /* Do three interleaved CRCs to realize the throughput of one crc32x
659        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
660        three CRCs are combined into a single CRC after each set of batches. */
661     while (num >= 3 * Z_BATCH) {
662         crc1 = 0;
663         crc2 = 0;
664         for (i = 0; i < Z_BATCH; i++) {
665             val0 = word[i];
666             val1 = word[i + Z_BATCH];
667             val2 = word[i + 2 * Z_BATCH];
668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
669             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
670             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
671         }
672         word += 3 * Z_BATCH;
673         num -= 3 * Z_BATCH;
674         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
675         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
676     }
677 
678     /* Do one last smaller batch with the remaining words, if there are enough
679        to pay for the combination of CRCs. */
680     last = num / 3;
681     if (last >= Z_BATCH_MIN) {
682         last2 = last << 1;
683         crc1 = 0;
684         crc2 = 0;
685         for (i = 0; i < last; i++) {
686             val0 = word[i];
687             val1 = word[i + last];
688             val2 = word[i + last2];
689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
690             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
691             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
692         }
693         word += 3 * last;
694         num -= 3 * last;
695         val = x2nmodp(last, 6);
696         crc = multmodp(val, crc) ^ crc1;
697         crc = multmodp(val, crc) ^ crc2;
698     }
699 
700     /* Compute the CRC on any remaining words. */
701     for (i = 0; i < num; i++) {
702         val0 = word[i];
703         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
704     }
705     word += num;
706 
707     /* Complete the CRC on any remaining bytes. */
708     buf = (const unsigned char FAR *)word;
709     while (len) {
710         len--;
711         val = *buf++;
712         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
713     }
714 
715     /* Return the CRC, post-conditioned. */
716     return crc ^ 0xffffffff;
717 }
718 
719 #else
720 
721 #ifdef W
722 
723 /*
724   Return the CRC of the W bytes in the word_t data, taking the
725   least-significant byte of the word as the first byte of data, without any pre
726   or post conditioning. This is used to combine the CRCs of each braid.
727  */
728 local z_crc_t crc_word(data)
729     z_word_t data;
730 {
731     int k;
732     for (k = 0; k < W; k++)
733         data = (data >> 8) ^ crc_table[data & 0xff];
734     return (z_crc_t)data;
735 }
736 
737 local z_word_t crc_word_big(data)
738     z_word_t data;
739 {
740     int k;
741     for (k = 0; k < W; k++)
742         data = (data << 8) ^
743             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
744     return data;
745 }
746 
747 #endif
748 
749 /* ========================================================================= */
750 unsigned long ZEXPORT crc32_z(crc, buf, len)
751     unsigned long crc;
752     const unsigned char FAR *buf;
753     z_size_t len;
754 {
755     /* Return initial CRC, if requested. */
756     if (buf == Z_NULL) return 0;
757 
758 #ifdef DYNAMIC_CRC_TABLE
759     once(&made, make_crc_table);
760 #endif /* DYNAMIC_CRC_TABLE */
761 
762     /* Pre-condition the CRC */
763     crc = (~crc) & 0xffffffff;
764 
765 #ifdef W
766 
767     /* If provided enough bytes, do a braided CRC calculation. */
768     if (len >= N * W + W - 1) {
769         z_size_t blks;
770         z_word_t const *words;
771         unsigned endian;
772         int k;
773 
774         /* Compute the CRC up to a z_word_t boundary. */
775         while (len && ((z_size_t)buf & (W - 1)) != 0) {
776             len--;
777             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
778         }
779 
780         /* Compute the CRC on as many N z_word_t blocks as are available. */
781         blks = len / (N * W);
782         len -= blks * N * W;
783         words = (z_word_t const *)buf;
784 
785         /* Do endian check at execution time instead of compile time, since ARM
786            processors can change the endianess at execution time. If the
787            compiler knows what the endianess will be, it can optimize out the
788            check and the unused branch. */
789         endian = 1;
790         if (*(unsigned char *)&endian) {
791             /* Little endian. */
792 
793             z_crc_t crc0;
794             z_word_t word0;
795 #if N > 1
796             z_crc_t crc1;
797             z_word_t word1;
798 #if N > 2
799             z_crc_t crc2;
800             z_word_t word2;
801 #if N > 3
802             z_crc_t crc3;
803             z_word_t word3;
804 #if N > 4
805             z_crc_t crc4;
806             z_word_t word4;
807 #if N > 5
808             z_crc_t crc5;
809             z_word_t word5;
810 #endif
811 #endif
812 #endif
813 #endif
814 #endif
815 
816             /* Initialize the CRC for each braid. */
817             crc0 = crc;
818 #if N > 1
819             crc1 = 0;
820 #if N > 2
821             crc2 = 0;
822 #if N > 3
823             crc3 = 0;
824 #if N > 4
825             crc4 = 0;
826 #if N > 5
827             crc5 = 0;
828 #endif
829 #endif
830 #endif
831 #endif
832 #endif
833 
834             /*
835               Process the first blks-1 blocks, computing the CRCs on each braid
836               independently.
837              */
838             while (--blks) {
839                 /* Load the word for each braid into registers. */
840                 word0 = crc0 ^ words[0];
841 #if N > 1
842                 word1 = crc1 ^ words[1];
843 #if N > 2
844                 word2 = crc2 ^ words[2];
845 #if N > 3
846                 word3 = crc3 ^ words[3];
847 #if N > 4
848                 word4 = crc4 ^ words[4];
849 #if N > 5
850                 word5 = crc5 ^ words[5];
851 #endif
852 #endif
853 #endif
854 #endif
855 #endif
856                 words += N;
857 
858                 /* Compute and update the CRC for each word. The loop should
859                    get unrolled. */
860                 crc0 = crc_braid_table[0][word0 & 0xff];
861 #if N > 1
862                 crc1 = crc_braid_table[0][word1 & 0xff];
863 #if N > 2
864                 crc2 = crc_braid_table[0][word2 & 0xff];
865 #if N > 3
866                 crc3 = crc_braid_table[0][word3 & 0xff];
867 #if N > 4
868                 crc4 = crc_braid_table[0][word4 & 0xff];
869 #if N > 5
870                 crc5 = crc_braid_table[0][word5 & 0xff];
871 #endif
872 #endif
873 #endif
874 #endif
875 #endif
876                 for (k = 1; k < W; k++) {
877                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
878 #if N > 1
879                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
880 #if N > 2
881                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
882 #if N > 3
883                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
884 #if N > 4
885                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
886 #if N > 5
887                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
888 #endif
889 #endif
890 #endif
891 #endif
892 #endif
893                 }
894             }
895 
896             /*
897               Process the last block, combining the CRCs of the N braids at the
898               same time.
899              */
900             crc = crc_word(crc0 ^ words[0]);
901 #if N > 1
902             crc = crc_word(crc1 ^ words[1] ^ crc);
903 #if N > 2
904             crc = crc_word(crc2 ^ words[2] ^ crc);
905 #if N > 3
906             crc = crc_word(crc3 ^ words[3] ^ crc);
907 #if N > 4
908             crc = crc_word(crc4 ^ words[4] ^ crc);
909 #if N > 5
910             crc = crc_word(crc5 ^ words[5] ^ crc);
911 #endif
912 #endif
913 #endif
914 #endif
915 #endif
916             words += N;
917         }
918         else {
919             /* Big endian. */
920 
921             z_word_t crc0, word0, comb;
922 #if N > 1
923             z_word_t crc1, word1;
924 #if N > 2
925             z_word_t crc2, word2;
926 #if N > 3
927             z_word_t crc3, word3;
928 #if N > 4
929             z_word_t crc4, word4;
930 #if N > 5
931             z_word_t crc5, word5;
932 #endif
933 #endif
934 #endif
935 #endif
936 #endif
937 
938             /* Initialize the CRC for each braid. */
939             crc0 = byte_swap(crc);
940 #if N > 1
941             crc1 = 0;
942 #if N > 2
943             crc2 = 0;
944 #if N > 3
945             crc3 = 0;
946 #if N > 4
947             crc4 = 0;
948 #if N > 5
949             crc5 = 0;
950 #endif
951 #endif
952 #endif
953 #endif
954 #endif
955 
956             /*
957               Process the first blks-1 blocks, computing the CRCs on each braid
958               independently.
959              */
960             while (--blks) {
961                 /* Load the word for each braid into registers. */
962                 word0 = crc0 ^ words[0];
963 #if N > 1
964                 word1 = crc1 ^ words[1];
965 #if N > 2
966                 word2 = crc2 ^ words[2];
967 #if N > 3
968                 word3 = crc3 ^ words[3];
969 #if N > 4
970                 word4 = crc4 ^ words[4];
971 #if N > 5
972                 word5 = crc5 ^ words[5];
973 #endif
974 #endif
975 #endif
976 #endif
977 #endif
978                 words += N;
979 
980                 /* Compute and update the CRC for each word. The loop should
981                    get unrolled. */
982                 crc0 = crc_braid_big_table[0][word0 & 0xff];
983 #if N > 1
984                 crc1 = crc_braid_big_table[0][word1 & 0xff];
985 #if N > 2
986                 crc2 = crc_braid_big_table[0][word2 & 0xff];
987 #if N > 3
988                 crc3 = crc_braid_big_table[0][word3 & 0xff];
989 #if N > 4
990                 crc4 = crc_braid_big_table[0][word4 & 0xff];
991 #if N > 5
992                 crc5 = crc_braid_big_table[0][word5 & 0xff];
993 #endif
994 #endif
995 #endif
996 #endif
997 #endif
998                 for (k = 1; k < W; k++) {
999                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1000 #if N > 1
1001                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1002 #if N > 2
1003                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1004 #if N > 3
1005                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1006 #if N > 4
1007                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1008 #if N > 5
1009                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1010 #endif
1011 #endif
1012 #endif
1013 #endif
1014 #endif
1015                 }
1016             }
1017 
1018             /*
1019               Process the last block, combining the CRCs of the N braids at the
1020               same time.
1021              */
1022             comb = crc_word_big(crc0 ^ words[0]);
1023 #if N > 1
1024             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1025 #if N > 2
1026             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1027 #if N > 3
1028             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1029 #if N > 4
1030             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1031 #if N > 5
1032             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1033 #endif
1034 #endif
1035 #endif
1036 #endif
1037 #endif
1038             words += N;
1039             crc = byte_swap(comb);
1040         }
1041 
1042         /*
1043           Update the pointer to the remaining bytes to process.
1044          */
1045         buf = (unsigned char const *)words;
1046     }
1047 
1048 #endif /* W */
1049 
1050     /* Complete the computation of the CRC on any remaining bytes. */
1051     while (len >= 8) {
1052         len -= 8;
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1060         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1061     }
1062     while (len) {
1063         len--;
1064         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1065     }
1066 
1067     /* Return the CRC, post-conditioned. */
1068     return crc ^ 0xffffffff;
1069 }
1070 
1071 #endif
1072 
1073 /* ========================================================================= */
1074 unsigned long ZEXPORT crc32(crc, buf, len)
1075     unsigned long crc;
1076     const unsigned char FAR *buf;
1077     uInt len;
1078 {
1079     return crc32_z(crc, buf, len);
1080 }
1081 
1082 /* ========================================================================= */
1083 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1084     uLong crc1;
1085     uLong crc2;
1086     z_off64_t len2;
1087 {
1088 #ifdef DYNAMIC_CRC_TABLE
1089     once(&made, make_crc_table);
1090 #endif /* DYNAMIC_CRC_TABLE */
1091     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1092 }
1093 
1094 /* ========================================================================= */
1095 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1096     uLong crc1;
1097     uLong crc2;
1098     z_off_t len2;
1099 {
1100     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1101 }
1102 
1103 /* ========================================================================= */
1104 uLong ZEXPORT crc32_combine_gen64(len2)
1105     z_off64_t len2;
1106 {
1107 #ifdef DYNAMIC_CRC_TABLE
1108     once(&made, make_crc_table);
1109 #endif /* DYNAMIC_CRC_TABLE */
1110     return x2nmodp(len2, 3);
1111 }
1112 
1113 /* ========================================================================= */
1114 uLong ZEXPORT crc32_combine_gen(len2)
1115     z_off_t len2;
1116 {
1117     return crc32_combine_gen64((z_off64_t)len2);
1118 }
1119 
1120 /* ========================================================================= */
1121 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1122     uLong crc1;
1123     uLong crc2;
1124     uLong op;
1125 {
1126     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1127 }
1128