xref: /openbsd-src/lib/libz/crc32.c (revision 70ce8dca00628be1d4e8ae025e352eafab511f42)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /*
11   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
12   protection on the static variables used to control the first-use generation
13   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
14   first call get_crc_table() to initialize the tables before allowing more than
15   one thread to use crc32().
16 
17   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
18   produced, so that this one source file can be compiled to an executable.
19  */
20 
21 #ifdef MAKECRCH
22 #  include <stdio.h>
23 #  ifndef DYNAMIC_CRC_TABLE
24 #    define DYNAMIC_CRC_TABLE
25 #  endif /* !DYNAMIC_CRC_TABLE */
26 #endif /* MAKECRCH */
27 
28 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
29 
30  /*
31   A CRC of a message is computed on N braids of words in the message, where
32   each word consists of W bytes (4 or 8). If N is 3, for example, then three
33   running sparse CRCs are calculated respectively on each braid, at these
34   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
35   This is done starting at a word boundary, and continues until as many blocks
36   of N * W bytes as are available have been processed. The results are combined
37   into a single CRC at the end. For this code, N must be in the range 1..6 and
38   W must be 4 or 8. The upper limit on N can be increased if desired by adding
39   more #if blocks, extending the patterns apparent in the code. In addition,
40   crc32.h would need to be regenerated, if the maximum N value is increased.
41 
42   N and W are chosen empirically by benchmarking the execution time on a given
43   processor. The choices for N and W below were based on testing on Intel Kaby
44   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
45   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
46   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
47   They were all tested with either gcc or clang, all using the -O3 optimization
48   level. Your mileage may vary.
49  */
50 
51 /* Define N */
52 #ifdef Z_TESTN
53 #  define N Z_TESTN
54 #else
55 #  define N 5
56 #endif
57 #if N < 1 || N > 6
58 #  error N must be in 1..6
59 #endif
60 
61 /*
62   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
63   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
64   that bytes are eight bits.
65  */
66 
67 /*
68   Define W and the associated z_word_t type. If W is not defined, then a
69   braided calculation is not used, and the associated tables and code are not
70   compiled.
71  */
72 #ifdef Z_TESTW
73 #  if Z_TESTW-1 != -1
74 #    define W Z_TESTW
75 #  endif
76 #else
77 #  ifdef MAKECRCH
78 #    define W 8         /* required for MAKECRCH */
79 #  else
80 #    if defined(__x86_64__) || defined(__aarch64__)
81 #      define W 8
82 #    else
83 #      define W 4
84 #    endif
85 #  endif
86 #endif
87 #ifdef W
88 #  if W == 8 && defined(Z_U8)
89      typedef Z_U8 z_word_t;
90 #  elif defined(Z_U4)
91 #    undef W
92 #    define W 4
93      typedef Z_U4 z_word_t;
94 #  else
95 #    undef W
96 #  endif
97 #endif
98 
99 /* If available, use the ARM processor CRC32 instruction. */
100 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
101 #  define ARMCRC32
102 #endif
103 
104 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
105 /*
106   Swap the bytes in a z_word_t to convert between little and big endian. Any
107   self-respecting compiler will optimize this to a single machine byte-swap
108   instruction, if one is available. This assumes that word_t is either 32 bits
109   or 64 bits.
110  */
byte_swap(z_word_t word)111 local z_word_t byte_swap(z_word_t word) {
112 #  if W == 8
113     return
114         (word & 0xff00000000000000) >> 56 |
115         (word & 0xff000000000000) >> 40 |
116         (word & 0xff0000000000) >> 24 |
117         (word & 0xff00000000) >> 8 |
118         (word & 0xff000000) << 8 |
119         (word & 0xff0000) << 24 |
120         (word & 0xff00) << 40 |
121         (word & 0xff) << 56;
122 #  else   /* W == 4 */
123     return
124         (word & 0xff000000) >> 24 |
125         (word & 0xff0000) >> 8 |
126         (word & 0xff00) << 8 |
127         (word & 0xff) << 24;
128 #  endif
129 }
130 #endif
131 
132 #ifdef DYNAMIC_CRC_TABLE
133 /* =========================================================================
134  * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
135  * below.
136  */
137    local z_crc_t FAR x2n_table[32];
138 #else
139 /* =========================================================================
140  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
141  * of x for combining CRC-32s, all made by make_crc_table().
142  */
143 #  include "crc32.h"
144 #endif
145 
146 /* CRC polynomial. */
147 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
148 
149 /*
150   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
151   reflected. For speed, this requires that a not be zero.
152  */
multmodp(z_crc_t a,z_crc_t b)153 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
154     z_crc_t m, p;
155 
156     m = (z_crc_t)1 << 31;
157     p = 0;
158     for (;;) {
159         if (a & m) {
160             p ^= b;
161             if ((a & (m - 1)) == 0)
162                 break;
163         }
164         m >>= 1;
165         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
166     }
167     return p;
168 }
169 
170 /*
171   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
172   initialized.
173  */
x2nmodp(z_off64_t n,unsigned k)174 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
175     z_crc_t p;
176 
177     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
178     while (n) {
179         if (n & 1)
180             p = multmodp(x2n_table[k & 31], p);
181         n >>= 1;
182         k++;
183     }
184     return p;
185 }
186 
187 #ifdef DYNAMIC_CRC_TABLE
188 /* =========================================================================
189  * Build the tables for byte-wise and braided CRC-32 calculations, and a table
190  * of powers of x for combining CRC-32s.
191  */
192 local z_crc_t FAR crc_table[256];
193 #ifdef W
194    local z_word_t FAR crc_big_table[256];
195    local z_crc_t FAR crc_braid_table[W][256];
196    local z_word_t FAR crc_braid_big_table[W][256];
197    local void braid(z_crc_t [][256], z_word_t [][256], int, int);
198 #endif
199 #ifdef MAKECRCH
200    local void write_table(FILE *, const z_crc_t FAR *, int);
201    local void write_table32hi(FILE *, const z_word_t FAR *, int);
202    local void write_table64(FILE *, const z_word_t FAR *, int);
203 #endif /* MAKECRCH */
204 
205 /*
206   Define a once() function depending on the availability of atomics. If this is
207   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
208   multiple threads, and if atomics are not available, then get_crc_table() must
209   be called to initialize the tables and must return before any threads are
210   allowed to compute or combine CRCs.
211  */
212 
213 /* Definition of once functionality. */
214 typedef struct once_s once_t;
215 
216 /* Check for the availability of atomics. */
217 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
218     !defined(__STDC_NO_ATOMICS__) && !defined(SMALL)
219 
220 #include <stdatomic.h>
221 
222 /* Structure for once(), which must be initialized with ONCE_INIT. */
223 struct once_s {
224     atomic_flag begun;
225     atomic_int done;
226 };
227 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
228 
229 /*
230   Run the provided init() function exactly once, even if multiple threads
231   invoke once() at the same time. The state must be a once_t initialized with
232   ONCE_INIT.
233  */
once(once_t * state,void (* init)(void))234 local void once(once_t *state, void (*init)(void)) {
235     if (!atomic_load(&state->done)) {
236         if (atomic_flag_test_and_set(&state->begun))
237             while (!atomic_load(&state->done))
238                 ;
239         else {
240             init();
241             atomic_store(&state->done, 1);
242         }
243     }
244 }
245 
246 #else   /* no atomics */
247 
248 /* Structure for once(), which must be initialized with ONCE_INIT. */
249 struct once_s {
250     volatile int begun;
251     volatile int done;
252 };
253 #define ONCE_INIT {0, 0}
254 
255 /* Test and set. Alas, not atomic, but tries to minimize the period of
256    vulnerability. */
test_and_set(int volatile * flag)257 local int test_and_set(int volatile *flag) {
258     int was;
259 
260     was = *flag;
261     *flag = 1;
262     return was;
263 }
264 
265 /* Run the provided init() function once. This is not thread-safe. */
once(once_t * state,void (* init)(void))266 local void once(once_t *state, void (*init)(void)) {
267     if (!state->done) {
268         if (test_and_set(&state->begun))
269             while (!state->done)
270                 ;
271         else {
272             init();
273             state->done = 1;
274         }
275     }
276 }
277 
278 #endif
279 
280 /* State for once(). */
281 local once_t made = ONCE_INIT;
282 
283 /*
284   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
285   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
286 
287   Polynomials over GF(2) are represented in binary, one bit per coefficient,
288   with the lowest powers in the most significant bit. Then adding polynomials
289   is just exclusive-or, and multiplying a polynomial by x is a right shift by
290   one. If we call the above polynomial p, and represent a byte as the
291   polynomial q, also with the lowest power in the most significant bit (so the
292   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
293   where a mod b means the remainder after dividing a by b.
294 
295   This calculation is done using the shift-register method of multiplying and
296   taking the remainder. The register is initialized to zero, and for each
297   incoming bit, x^32 is added mod p to the register if the bit is a one (where
298   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
299   (which is shifting right by one and adding x^32 mod p if the bit shifted out
300   is a one). We start with the highest power (least significant bit) of q and
301   repeat for all eight bits of q.
302 
303   The table is simply the CRC of all possible eight bit values. This is all the
304   information needed to generate CRCs on data a byte at a time for all
305   combinations of CRC register values and incoming bytes.
306  */
307 
make_crc_table(void)308 local void make_crc_table(void) {
309     unsigned i, j, n;
310     z_crc_t p;
311 
312     /* initialize the CRC of bytes tables */
313     for (i = 0; i < 256; i++) {
314         p = i;
315         for (j = 0; j < 8; j++)
316             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
317         crc_table[i] = p;
318 #ifdef W
319         crc_big_table[i] = byte_swap(p);
320 #endif
321     }
322 
323     /* initialize the x^2^n mod p(x) table */
324     p = (z_crc_t)1 << 30;         /* x^1 */
325     x2n_table[0] = p;
326     for (n = 1; n < 32; n++)
327         x2n_table[n] = p = multmodp(p, p);
328 
329 #ifdef W
330     /* initialize the braiding tables -- needs x2n_table[] */
331     braid(crc_braid_table, crc_braid_big_table, N, W);
332 #endif
333 
334 #ifdef MAKECRCH
335     {
336         /*
337           The crc32.h header file contains tables for both 32-bit and 64-bit
338           z_word_t's, and so requires a 64-bit type be available. In that case,
339           z_word_t must be defined to be 64-bits. This code then also generates
340           and writes out the tables for the case that z_word_t is 32 bits.
341          */
342 #if !defined(W) || W != 8
343 #  error Need a 64-bit integer type in order to generate crc32.h.
344 #endif
345         FILE *out;
346         int k, n;
347         z_crc_t ltl[8][256];
348         z_word_t big[8][256];
349 
350         out = fopen("crc32.h", "w");
351         if (out == NULL) return;
352 
353         /* write out little-endian CRC table to crc32.h */
354         fprintf(out,
355             "/* crc32.h -- tables for rapid CRC calculation\n"
356             " * Generated automatically by crc32.c\n */\n"
357             "\n"
358             "local const z_crc_t FAR crc_table[] = {\n"
359             "    ");
360         write_table(out, crc_table, 256);
361         fprintf(out,
362             "};\n");
363 
364         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
365         fprintf(out,
366             "\n"
367             "#ifdef W\n"
368             "\n"
369             "#if W == 8\n"
370             "\n"
371             "local const z_word_t FAR crc_big_table[] = {\n"
372             "    ");
373         write_table64(out, crc_big_table, 256);
374         fprintf(out,
375             "};\n");
376 
377         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
378         fprintf(out,
379             "\n"
380             "#else /* W == 4 */\n"
381             "\n"
382             "local const z_word_t FAR crc_big_table[] = {\n"
383             "    ");
384         write_table32hi(out, crc_big_table, 256);
385         fprintf(out,
386             "};\n"
387             "\n"
388             "#endif\n");
389 
390         /* write out braid tables for each value of N */
391         for (n = 1; n <= 6; n++) {
392             fprintf(out,
393             "\n"
394             "#if N == %d\n", n);
395 
396             /* compute braid tables for this N and 64-bit word_t */
397             braid(ltl, big, n, 8);
398 
399             /* write out braid tables for 64-bit z_word_t to crc32.h */
400             fprintf(out,
401             "\n"
402             "#if W == 8\n"
403             "\n"
404             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
405             for (k = 0; k < 8; k++) {
406                 fprintf(out, "   {");
407                 write_table(out, ltl[k], 256);
408                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
409             }
410             fprintf(out,
411             "};\n"
412             "\n"
413             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
414             for (k = 0; k < 8; k++) {
415                 fprintf(out, "   {");
416                 write_table64(out, big[k], 256);
417                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
418             }
419             fprintf(out,
420             "};\n");
421 
422             /* compute braid tables for this N and 32-bit word_t */
423             braid(ltl, big, n, 4);
424 
425             /* write out braid tables for 32-bit z_word_t to crc32.h */
426             fprintf(out,
427             "\n"
428             "#else /* W == 4 */\n"
429             "\n"
430             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
431             for (k = 0; k < 4; k++) {
432                 fprintf(out, "   {");
433                 write_table(out, ltl[k], 256);
434                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
435             }
436             fprintf(out,
437             "};\n"
438             "\n"
439             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
440             for (k = 0; k < 4; k++) {
441                 fprintf(out, "   {");
442                 write_table32hi(out, big[k], 256);
443                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
444             }
445             fprintf(out,
446             "};\n"
447             "\n"
448             "#endif\n"
449             "\n"
450             "#endif\n");
451         }
452         fprintf(out,
453             "\n"
454             "#endif\n");
455 
456         /* write out zeros operator table to crc32.h */
457         fprintf(out,
458             "\n"
459             "local const z_crc_t FAR x2n_table[] = {\n"
460             "    ");
461         write_table(out, x2n_table, 32);
462         fprintf(out,
463             "};\n");
464         fclose(out);
465     }
466 #endif /* MAKECRCH */
467 }
468 
469 #ifdef MAKECRCH
470 
471 /*
472    Write the 32-bit values in table[0..k-1] to out, five per line in
473    hexadecimal separated by commas.
474  */
write_table(FILE * out,const z_crc_t FAR * table,int k)475 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
476     int n;
477 
478     for (n = 0; n < k; n++)
479         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
480                 (unsigned long)(table[n]),
481                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
482 }
483 
484 /*
485    Write the high 32-bits of each value in table[0..k-1] to out, five per line
486    in hexadecimal separated by commas.
487  */
write_table32hi(FILE * out,const z_word_t FAR * table,int k)488 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
489     int n;
490 
491     for (n = 0; n < k; n++)
492         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
493                 (unsigned long)(table[n] >> 32),
494                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
495 }
496 
497 /*
498   Write the 64-bit values in table[0..k-1] to out, three per line in
499   hexadecimal separated by commas. This assumes that if there is a 64-bit
500   type, then there is also a long long integer type, and it is at least 64
501   bits. If not, then the type cast and format string can be adjusted
502   accordingly.
503  */
write_table64(FILE * out,const z_word_t FAR * table,int k)504 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
505     int n;
506 
507     for (n = 0; n < k; n++)
508         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
509                 (unsigned long long)(table[n]),
510                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
511 }
512 
513 /* Actually do the deed. */
main(void)514 int main(void) {
515     make_crc_table();
516     return 0;
517 }
518 
519 #endif /* MAKECRCH */
520 
521 #ifdef W
522 /*
523   Generate the little and big-endian braid tables for the given n and z_word_t
524   size w. Each array must have room for w blocks of 256 elements.
525  */
braid(z_crc_t ltl[][256],z_word_t big[][256],int n,int w)526 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
527     int k;
528     z_crc_t i, p, q;
529     for (k = 0; k < w; k++) {
530         p = x2nmodp((n * w + 3 - k) << 3, 0);
531         ltl[k][0] = 0;
532         big[w - 1 - k][0] = 0;
533         for (i = 1; i < 256; i++) {
534             ltl[k][i] = q = multmodp(i << 24, p);
535             big[w - 1 - k][i] = byte_swap(q);
536         }
537     }
538 }
539 #endif
540 
541 #endif /* DYNAMIC_CRC_TABLE */
542 
543 /* =========================================================================
544  * This function can be used by asm versions of crc32(), and to force the
545  * generation of the CRC tables in a threaded application.
546  */
get_crc_table(void)547 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
548 #ifdef DYNAMIC_CRC_TABLE
549     once(&made, make_crc_table);
550 #endif /* DYNAMIC_CRC_TABLE */
551     return (const z_crc_t FAR *)crc_table;
552 }
553 
554 /* =========================================================================
555  * Use ARM machine instructions if available. This will compute the CRC about
556  * ten times faster than the braided calculation. This code does not check for
557  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
558  * only be defined if the compilation specifies an ARM processor architecture
559  * that has the instructions. For example, compiling with -march=armv8.1-a or
560  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
561  * instructions.
562  */
563 #ifdef ARMCRC32
564 
565 /*
566    Constants empirically determined to maximize speed. These values are from
567    measurements on a Cortex-A57. Your mileage may vary.
568  */
569 #define Z_BATCH 3990                /* number of words in a batch */
570 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
571 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
572 
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)573 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
574                               z_size_t len) {
575     z_crc_t val;
576     z_word_t crc1, crc2;
577     const z_word_t *word;
578     z_word_t val0, val1, val2;
579     z_size_t last, last2, i;
580     z_size_t num;
581 
582     /* Return initial CRC, if requested. */
583     if (buf == Z_NULL) return 0;
584 
585 #ifdef DYNAMIC_CRC_TABLE
586     once(&made, make_crc_table);
587 #endif /* DYNAMIC_CRC_TABLE */
588 
589     /* Pre-condition the CRC */
590     crc = (~crc) & 0xffffffff;
591 
592     /* Compute the CRC up to a word boundary. */
593     while (len && ((z_size_t)buf & 7) != 0) {
594         len--;
595         val = *buf++;
596         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
597     }
598 
599     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
600     word = (z_word_t const *)buf;
601     num = len >> 3;
602     len &= 7;
603 
604     /* Do three interleaved CRCs to realize the throughput of one crc32x
605        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
606        three CRCs are combined into a single CRC after each set of batches. */
607     while (num >= 3 * Z_BATCH) {
608         crc1 = 0;
609         crc2 = 0;
610         for (i = 0; i < Z_BATCH; i++) {
611             val0 = word[i];
612             val1 = word[i + Z_BATCH];
613             val2 = word[i + 2 * Z_BATCH];
614             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
615             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
616             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
617         }
618         word += 3 * Z_BATCH;
619         num -= 3 * Z_BATCH;
620         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
621         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
622     }
623 
624     /* Do one last smaller batch with the remaining words, if there are enough
625        to pay for the combination of CRCs. */
626     last = num / 3;
627     if (last >= Z_BATCH_MIN) {
628         last2 = last << 1;
629         crc1 = 0;
630         crc2 = 0;
631         for (i = 0; i < last; i++) {
632             val0 = word[i];
633             val1 = word[i + last];
634             val2 = word[i + last2];
635             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
636             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
637             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
638         }
639         word += 3 * last;
640         num -= 3 * last;
641         val = x2nmodp(last, 6);
642         crc = multmodp(val, crc) ^ crc1;
643         crc = multmodp(val, crc) ^ crc2;
644     }
645 
646     /* Compute the CRC on any remaining words. */
647     for (i = 0; i < num; i++) {
648         val0 = word[i];
649         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
650     }
651     word += num;
652 
653     /* Complete the CRC on any remaining bytes. */
654     buf = (const unsigned char FAR *)word;
655     while (len) {
656         len--;
657         val = *buf++;
658         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
659     }
660 
661     /* Return the CRC, post-conditioned. */
662     return crc ^ 0xffffffff;
663 }
664 
665 #else
666 
667 #ifdef W
668 
669 /*
670   Return the CRC of the W bytes in the word_t data, taking the
671   least-significant byte of the word as the first byte of data, without any pre
672   or post conditioning. This is used to combine the CRCs of each braid.
673  */
crc_word(z_word_t data)674 local z_crc_t crc_word(z_word_t data) {
675     int k;
676     for (k = 0; k < W; k++)
677         data = (data >> 8) ^ crc_table[data & 0xff];
678     return (z_crc_t)data;
679 }
680 
crc_word_big(z_word_t data)681 local z_word_t crc_word_big(z_word_t data) {
682     int k;
683     for (k = 0; k < W; k++)
684         data = (data << 8) ^
685             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
686     return data;
687 }
688 
689 #endif
690 
691 /* ========================================================================= */
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)692 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
693                               z_size_t len) {
694     /* Return initial CRC, if requested. */
695     if (buf == Z_NULL) return 0;
696 
697 #ifdef DYNAMIC_CRC_TABLE
698     once(&made, make_crc_table);
699 #endif /* DYNAMIC_CRC_TABLE */
700 
701     /* Pre-condition the CRC */
702     crc = (~crc) & 0xffffffff;
703 
704 #ifdef W
705 
706     /* If provided enough bytes, do a braided CRC calculation. */
707     if (len >= N * W + W - 1) {
708         z_size_t blks;
709         z_word_t const *words;
710         unsigned endian;
711         int k;
712 
713         /* Compute the CRC up to a z_word_t boundary. */
714         while (len && ((z_size_t)buf & (W - 1)) != 0) {
715             len--;
716             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
717         }
718 
719         /* Compute the CRC on as many N z_word_t blocks as are available. */
720         blks = len / (N * W);
721         len -= blks * N * W;
722         words = (z_word_t const *)buf;
723 
724         /* Do endian check at execution time instead of compile time, since ARM
725            processors can change the endianness at execution time. If the
726            compiler knows what the endianness will be, it can optimize out the
727            check and the unused branch. */
728         endian = 1;
729         if (*(unsigned char *)&endian) {
730             /* Little endian. */
731 
732             z_crc_t crc0;
733             z_word_t word0;
734 #if N > 1
735             z_crc_t crc1;
736             z_word_t word1;
737 #if N > 2
738             z_crc_t crc2;
739             z_word_t word2;
740 #if N > 3
741             z_crc_t crc3;
742             z_word_t word3;
743 #if N > 4
744             z_crc_t crc4;
745             z_word_t word4;
746 #if N > 5
747             z_crc_t crc5;
748             z_word_t word5;
749 #endif
750 #endif
751 #endif
752 #endif
753 #endif
754 
755             /* Initialize the CRC for each braid. */
756             crc0 = crc;
757 #if N > 1
758             crc1 = 0;
759 #if N > 2
760             crc2 = 0;
761 #if N > 3
762             crc3 = 0;
763 #if N > 4
764             crc4 = 0;
765 #if N > 5
766             crc5 = 0;
767 #endif
768 #endif
769 #endif
770 #endif
771 #endif
772 
773             /*
774               Process the first blks-1 blocks, computing the CRCs on each braid
775               independently.
776              */
777             while (--blks) {
778                 /* Load the word for each braid into registers. */
779                 word0 = crc0 ^ words[0];
780 #if N > 1
781                 word1 = crc1 ^ words[1];
782 #if N > 2
783                 word2 = crc2 ^ words[2];
784 #if N > 3
785                 word3 = crc3 ^ words[3];
786 #if N > 4
787                 word4 = crc4 ^ words[4];
788 #if N > 5
789                 word5 = crc5 ^ words[5];
790 #endif
791 #endif
792 #endif
793 #endif
794 #endif
795                 words += N;
796 
797                 /* Compute and update the CRC for each word. The loop should
798                    get unrolled. */
799                 crc0 = crc_braid_table[0][word0 & 0xff];
800 #if N > 1
801                 crc1 = crc_braid_table[0][word1 & 0xff];
802 #if N > 2
803                 crc2 = crc_braid_table[0][word2 & 0xff];
804 #if N > 3
805                 crc3 = crc_braid_table[0][word3 & 0xff];
806 #if N > 4
807                 crc4 = crc_braid_table[0][word4 & 0xff];
808 #if N > 5
809                 crc5 = crc_braid_table[0][word5 & 0xff];
810 #endif
811 #endif
812 #endif
813 #endif
814 #endif
815                 for (k = 1; k < W; k++) {
816                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
817 #if N > 1
818                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
819 #if N > 2
820                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
821 #if N > 3
822                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
823 #if N > 4
824                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
825 #if N > 5
826                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
827 #endif
828 #endif
829 #endif
830 #endif
831 #endif
832                 }
833             }
834 
835             /*
836               Process the last block, combining the CRCs of the N braids at the
837               same time.
838              */
839             crc = crc_word(crc0 ^ words[0]);
840 #if N > 1
841             crc = crc_word(crc1 ^ words[1] ^ crc);
842 #if N > 2
843             crc = crc_word(crc2 ^ words[2] ^ crc);
844 #if N > 3
845             crc = crc_word(crc3 ^ words[3] ^ crc);
846 #if N > 4
847             crc = crc_word(crc4 ^ words[4] ^ crc);
848 #if N > 5
849             crc = crc_word(crc5 ^ words[5] ^ crc);
850 #endif
851 #endif
852 #endif
853 #endif
854 #endif
855             words += N;
856         }
857         else {
858             /* Big endian. */
859 
860             z_word_t crc0, word0, comb;
861 #if N > 1
862             z_word_t crc1, word1;
863 #if N > 2
864             z_word_t crc2, word2;
865 #if N > 3
866             z_word_t crc3, word3;
867 #if N > 4
868             z_word_t crc4, word4;
869 #if N > 5
870             z_word_t crc5, word5;
871 #endif
872 #endif
873 #endif
874 #endif
875 #endif
876 
877             /* Initialize the CRC for each braid. */
878             crc0 = byte_swap(crc);
879 #if N > 1
880             crc1 = 0;
881 #if N > 2
882             crc2 = 0;
883 #if N > 3
884             crc3 = 0;
885 #if N > 4
886             crc4 = 0;
887 #if N > 5
888             crc5 = 0;
889 #endif
890 #endif
891 #endif
892 #endif
893 #endif
894 
895             /*
896               Process the first blks-1 blocks, computing the CRCs on each braid
897               independently.
898              */
899             while (--blks) {
900                 /* Load the word for each braid into registers. */
901                 word0 = crc0 ^ words[0];
902 #if N > 1
903                 word1 = crc1 ^ words[1];
904 #if N > 2
905                 word2 = crc2 ^ words[2];
906 #if N > 3
907                 word3 = crc3 ^ words[3];
908 #if N > 4
909                 word4 = crc4 ^ words[4];
910 #if N > 5
911                 word5 = crc5 ^ words[5];
912 #endif
913 #endif
914 #endif
915 #endif
916 #endif
917                 words += N;
918 
919                 /* Compute and update the CRC for each word. The loop should
920                    get unrolled. */
921                 crc0 = crc_braid_big_table[0][word0 & 0xff];
922 #if N > 1
923                 crc1 = crc_braid_big_table[0][word1 & 0xff];
924 #if N > 2
925                 crc2 = crc_braid_big_table[0][word2 & 0xff];
926 #if N > 3
927                 crc3 = crc_braid_big_table[0][word3 & 0xff];
928 #if N > 4
929                 crc4 = crc_braid_big_table[0][word4 & 0xff];
930 #if N > 5
931                 crc5 = crc_braid_big_table[0][word5 & 0xff];
932 #endif
933 #endif
934 #endif
935 #endif
936 #endif
937                 for (k = 1; k < W; k++) {
938                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
939 #if N > 1
940                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
941 #if N > 2
942                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
943 #if N > 3
944                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
945 #if N > 4
946                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
947 #if N > 5
948                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
949 #endif
950 #endif
951 #endif
952 #endif
953 #endif
954                 }
955             }
956 
957             /*
958               Process the last block, combining the CRCs of the N braids at the
959               same time.
960              */
961             comb = crc_word_big(crc0 ^ words[0]);
962 #if N > 1
963             comb = crc_word_big(crc1 ^ words[1] ^ comb);
964 #if N > 2
965             comb = crc_word_big(crc2 ^ words[2] ^ comb);
966 #if N > 3
967             comb = crc_word_big(crc3 ^ words[3] ^ comb);
968 #if N > 4
969             comb = crc_word_big(crc4 ^ words[4] ^ comb);
970 #if N > 5
971             comb = crc_word_big(crc5 ^ words[5] ^ comb);
972 #endif
973 #endif
974 #endif
975 #endif
976 #endif
977             words += N;
978             crc = byte_swap(comb);
979         }
980 
981         /*
982           Update the pointer to the remaining bytes to process.
983          */
984         buf = (unsigned char const *)words;
985     }
986 
987 #endif /* W */
988 
989     /* Complete the computation of the CRC on any remaining bytes. */
990     while (len >= 8) {
991         len -= 8;
992         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
993         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
994         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
995         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
996         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
997         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
998         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
999         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1000     }
1001     while (len) {
1002         len--;
1003         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1004     }
1005 
1006     /* Return the CRC, post-conditioned. */
1007     return crc ^ 0xffffffff;
1008 }
1009 
1010 #endif
1011 
1012 /* ========================================================================= */
crc32(unsigned long crc,const unsigned char FAR * buf,uInt len)1013 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1014                             uInt len) {
1015     return crc32_z(crc, buf, len);
1016 }
1017 
1018 /* ========================================================================= */
crc32_combine64(uLong crc1,uLong crc2,z_off64_t len2)1019 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1020 #ifdef DYNAMIC_CRC_TABLE
1021     once(&made, make_crc_table);
1022 #endif /* DYNAMIC_CRC_TABLE */
1023     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1024 }
1025 
1026 /* ========================================================================= */
crc32_combine(uLong crc1,uLong crc2,z_off_t len2)1027 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1028     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1029 }
1030 
1031 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1032 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1033 #ifdef DYNAMIC_CRC_TABLE
1034     once(&made, make_crc_table);
1035 #endif /* DYNAMIC_CRC_TABLE */
1036     return x2nmodp(len2, 3);
1037 }
1038 
1039 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1040 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1041     return crc32_combine_gen64((z_off64_t)len2);
1042 }
1043 
1044 /* ========================================================================= */
crc32_combine_op(uLong crc1,uLong crc2,uLong op)1045 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1046     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1047 }
1048