1 /* $NetBSD: crc32.c,v 1.6 2022/10/15 19:49:32 christos Exp $ */ 2 3 /* crc32.c -- compute the CRC-32 of a data stream 4 * Copyright (C) 1995-2022 Mark Adler 5 * For conditions of distribution and use, see copyright notice in zlib.h 6 * 7 * This interleaved implementation of a CRC makes use of pipelined multiple 8 * arithmetic-logic units, commonly found in modern CPU cores. It is due to 9 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. 10 */ 11 12 /* @(#) Id */ 13 14 /* 15 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore 16 protection on the static variables used to control the first-use generation 17 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should 18 first call get_crc_table() to initialize the tables before allowing more than 19 one thread to use crc32(). 20 21 MAKECRCH can be #defined to write out crc32.h. A main() routine is also 22 produced, so that this one source file can be compiled to an executable. 23 */ 24 25 #ifdef MAKECRCH 26 # include <stdio.h> 27 # ifndef DYNAMIC_CRC_TABLE 28 # define DYNAMIC_CRC_TABLE 29 # endif /* !DYNAMIC_CRC_TABLE */ 30 #endif /* MAKECRCH */ 31 32 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ 33 34 /* 35 A CRC of a message is computed on N braids of words in the message, where 36 each word consists of W bytes (4 or 8). If N is 3, for example, then three 37 running sparse CRCs are calculated respectively on each braid, at these 38 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... 39 This is done starting at a word boundary, and continues until as many blocks 40 of N * W bytes as are available have been processed. The results are combined 41 into a single CRC at the end. For this code, N must be in the range 1..6 and 42 W must be 4 or 8. The upper limit on N can be increased if desired by adding 43 more #if blocks, extending the patterns apparent in the code. In addition, 44 crc32.h would need to be regenerated, if the maximum N value is increased. 45 46 N and W are chosen empirically by benchmarking the execution time on a given 47 processor. The choices for N and W below were based on testing on Intel Kaby 48 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64 49 Octeon II processors. The Intel, AMD, and ARM processors were all fastest 50 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4. 51 They were all tested with either gcc or clang, all using the -O3 optimization 52 level. Your mileage may vary. 53 */ 54 55 /* Define N */ 56 #ifdef Z_TESTN 57 # define N Z_TESTN 58 #else 59 # define N 5 60 #endif 61 #if N < 1 || N > 6 62 # error N must be in 1..6 63 #endif 64 65 /* 66 z_crc_t must be at least 32 bits. z_word_t must be at least as long as 67 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and 68 that bytes are eight bits. 69 */ 70 71 /* 72 Define W and the associated z_word_t type. If W is not defined, then a 73 braided calculation is not used, and the associated tables and code are not 74 compiled. 75 */ 76 #ifdef Z_TESTW 77 # if Z_TESTW-1 != -1 78 # define W Z_TESTW 79 # endif 80 #else 81 # ifdef MAKECRCH 82 # define W 8 /* required for MAKECRCH */ 83 # else 84 # if defined(__x86_64__) || defined(__aarch64__) 85 # define W 8 86 # else 87 # define W 4 88 # endif 89 # endif 90 #endif 91 #ifdef W 92 # if W == 8 && defined(Z_U8) 93 typedef Z_U8 z_word_t; 94 # elif defined(Z_U4) 95 # undef W 96 # define W 4 97 typedef Z_U4 z_word_t; 98 # else 99 # undef W 100 # endif 101 #endif 102 103 /* If available, use the ARM processor CRC32 instruction. */ 104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 105 # define ARMCRC32 106 #endif 107 108 /* Local functions. */ 109 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b)); 110 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k)); 111 112 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE)) 113 local z_word_t byte_swap OF((z_word_t word)); 114 #endif 115 116 #if defined(W) && !defined(ARMCRC32) 117 local z_crc_t crc_word OF((z_word_t data)); 118 local z_word_t crc_word_big OF((z_word_t data)); 119 #endif 120 121 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE)) 122 /* 123 Swap the bytes in a z_word_t to convert between little and big endian. Any 124 self-respecting compiler will optimize this to a single machine byte-swap 125 instruction, if one is available. This assumes that word_t is either 32 bits 126 or 64 bits. 127 */ 128 local z_word_t byte_swap(word) 129 z_word_t word; 130 { 131 # if W == 8 132 return 133 (word & 0xff00000000000000) >> 56 | 134 (word & 0xff000000000000) >> 40 | 135 (word & 0xff0000000000) >> 24 | 136 (word & 0xff00000000) >> 8 | 137 (word & 0xff000000) << 8 | 138 (word & 0xff0000) << 24 | 139 (word & 0xff00) << 40 | 140 (word & 0xff) << 56; 141 # else /* W == 4 */ 142 return 143 (word & 0xff000000) >> 24 | 144 (word & 0xff0000) >> 8 | 145 (word & 0xff00) << 8 | 146 (word & 0xff) << 24; 147 # endif 148 } 149 #endif 150 151 /* CRC polynomial. */ 152 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */ 153 154 #ifdef DYNAMIC_CRC_TABLE 155 156 local z_crc_t FAR crc_table[256]; 157 local z_crc_t FAR x2n_table[32]; 158 local void make_crc_table OF((void)); 159 #ifdef W 160 local z_word_t FAR crc_big_table[256]; 161 local z_crc_t FAR crc_braid_table[W][256]; 162 local z_word_t FAR crc_braid_big_table[W][256]; 163 local void braid OF((z_crc_t [][256], z_word_t [][256], int, int)); 164 #endif 165 #ifdef MAKECRCH 166 local void write_table OF((FILE *, const z_crc_t FAR *, int)); 167 local void write_table32hi OF((FILE *, const z_word_t FAR *, int)); 168 local void write_table64 OF((FILE *, const z_word_t FAR *, int)); 169 #endif /* MAKECRCH */ 170 171 /* 172 Define a once() function depending on the availability of atomics. If this is 173 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in 174 multiple threads, and if atomics are not available, then get_crc_table() must 175 be called to initialize the tables and must return before any threads are 176 allowed to compute or combine CRCs. 177 */ 178 179 /* Definition of once functionality. */ 180 typedef struct once_s once_t; 181 local void once OF((once_t *, void (*)(void))); 182 183 /* Check for the availability of atomics. */ 184 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \ 185 !defined(__STDC_NO_ATOMICS__) 186 187 #include <stdatomic.h> 188 189 /* Structure for once(), which must be initialized with ONCE_INIT. */ 190 struct once_s { 191 atomic_flag begun; 192 atomic_int done; 193 }; 194 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0} 195 196 /* 197 Run the provided init() function exactly once, even if multiple threads 198 invoke once() at the same time. The state must be a once_t initialized with 199 ONCE_INIT. 200 */ 201 local void once(state, init) 202 once_t *state; 203 void (*init)(void); 204 { 205 if (!atomic_load(&state->done)) { 206 if (atomic_flag_test_and_set(&state->begun)) 207 while (!atomic_load(&state->done)) 208 ; 209 else { 210 init(); 211 atomic_store(&state->done, 1); 212 } 213 } 214 } 215 216 #else /* no atomics */ 217 218 /* Structure for once(), which must be initialized with ONCE_INIT. */ 219 struct once_s { 220 volatile int begun; 221 volatile int done; 222 }; 223 #define ONCE_INIT {0, 0} 224 225 /* Test and set. Alas, not atomic, but tries to minimize the period of 226 vulnerability. */ 227 local int test_and_set OF((int volatile *)); 228 local int test_and_set(flag) 229 int volatile *flag; 230 { 231 int was; 232 233 was = *flag; 234 *flag = 1; 235 return was; 236 } 237 238 /* Run the provided init() function once. This is not thread-safe. */ 239 local void once(state, init) 240 once_t *state; 241 void (*init)(void); 242 { 243 if (!state->done) { 244 if (test_and_set(&state->begun)) 245 while (!state->done) 246 ; 247 else { 248 init(); 249 state->done = 1; 250 } 251 } 252 } 253 254 #endif 255 256 /* State for once(). */ 257 local once_t made = ONCE_INIT; 258 259 /* 260 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: 261 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. 262 263 Polynomials over GF(2) are represented in binary, one bit per coefficient, 264 with the lowest powers in the most significant bit. Then adding polynomials 265 is just exclusive-or, and multiplying a polynomial by x is a right shift by 266 one. If we call the above polynomial p, and represent a byte as the 267 polynomial q, also with the lowest power in the most significant bit (so the 268 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p, 269 where a mod b means the remainder after dividing a by b. 270 271 This calculation is done using the shift-register method of multiplying and 272 taking the remainder. The register is initialized to zero, and for each 273 incoming bit, x^32 is added mod p to the register if the bit is a one (where 274 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x 275 (which is shifting right by one and adding x^32 mod p if the bit shifted out 276 is a one). We start with the highest power (least significant bit) of q and 277 repeat for all eight bits of q. 278 279 The table is simply the CRC of all possible eight bit values. This is all the 280 information needed to generate CRCs on data a byte at a time for all 281 combinations of CRC register values and incoming bytes. 282 */ 283 284 local void make_crc_table() 285 { 286 unsigned i, j, n; 287 z_crc_t p; 288 289 /* initialize the CRC of bytes tables */ 290 for (i = 0; i < 256; i++) { 291 p = i; 292 for (j = 0; j < 8; j++) 293 p = p & 1 ? (p >> 1) ^ POLY : p >> 1; 294 crc_table[i] = p; 295 #ifdef W 296 crc_big_table[i] = byte_swap(p); 297 #endif 298 } 299 300 /* initialize the x^2^n mod p(x) table */ 301 p = (z_crc_t)1 << 30; /* x^1 */ 302 x2n_table[0] = p; 303 for (n = 1; n < 32; n++) 304 x2n_table[n] = p = multmodp(p, p); 305 306 #ifdef W 307 /* initialize the braiding tables -- needs x2n_table[] */ 308 braid(crc_braid_table, crc_braid_big_table, N, W); 309 #endif 310 311 #ifdef MAKECRCH 312 { 313 /* 314 The crc32.h header file contains tables for both 32-bit and 64-bit 315 z_word_t's, and so requires a 64-bit type be available. In that case, 316 z_word_t must be defined to be 64-bits. This code then also generates 317 and writes out the tables for the case that z_word_t is 32 bits. 318 */ 319 #if !defined(W) || W != 8 320 # error Need a 64-bit integer type in order to generate crc32.h. 321 #endif 322 FILE *out; 323 int k, n; 324 z_crc_t ltl[8][256]; 325 z_word_t big[8][256]; 326 327 out = fopen("crc32.h", "w"); 328 if (out == NULL) return; 329 330 /* write out little-endian CRC table to crc32.h */ 331 fprintf(out, 332 "/* crc32.h -- tables for rapid CRC calculation\n" 333 " * Generated automatically by crc32.c\n */\n" 334 "\n" 335 "local const z_crc_t FAR crc_table[] = {\n" 336 " "); 337 write_table(out, crc_table, 256); 338 fprintf(out, 339 "};\n"); 340 341 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */ 342 fprintf(out, 343 "\n" 344 "#ifdef W\n" 345 "\n" 346 "#if W == 8\n" 347 "\n" 348 "local const z_word_t FAR crc_big_table[] = {\n" 349 " "); 350 write_table64(out, crc_big_table, 256); 351 fprintf(out, 352 "};\n"); 353 354 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */ 355 fprintf(out, 356 "\n" 357 "#else /* W == 4 */\n" 358 "\n" 359 "local const z_word_t FAR crc_big_table[] = {\n" 360 " "); 361 write_table32hi(out, crc_big_table, 256); 362 fprintf(out, 363 "};\n" 364 "\n" 365 "#endif\n"); 366 367 /* write out braid tables for each value of N */ 368 for (n = 1; n <= 6; n++) { 369 fprintf(out, 370 "\n" 371 "#if N == %d\n", n); 372 373 /* compute braid tables for this N and 64-bit word_t */ 374 braid(ltl, big, n, 8); 375 376 /* write out braid tables for 64-bit z_word_t to crc32.h */ 377 fprintf(out, 378 "\n" 379 "#if W == 8\n" 380 "\n" 381 "local const z_crc_t FAR crc_braid_table[][256] = {\n"); 382 for (k = 0; k < 8; k++) { 383 fprintf(out, " {"); 384 write_table(out, ltl[k], 256); 385 fprintf(out, "}%s", k < 7 ? ",\n" : ""); 386 } 387 fprintf(out, 388 "};\n" 389 "\n" 390 "local const z_word_t FAR crc_braid_big_table[][256] = {\n"); 391 for (k = 0; k < 8; k++) { 392 fprintf(out, " {"); 393 write_table64(out, big[k], 256); 394 fprintf(out, "}%s", k < 7 ? ",\n" : ""); 395 } 396 fprintf(out, 397 "};\n"); 398 399 /* compute braid tables for this N and 32-bit word_t */ 400 braid(ltl, big, n, 4); 401 402 /* write out braid tables for 32-bit z_word_t to crc32.h */ 403 fprintf(out, 404 "\n" 405 "#else /* W == 4 */\n" 406 "\n" 407 "local const z_crc_t FAR crc_braid_table[][256] = {\n"); 408 for (k = 0; k < 4; k++) { 409 fprintf(out, " {"); 410 write_table(out, ltl[k], 256); 411 fprintf(out, "}%s", k < 3 ? ",\n" : ""); 412 } 413 fprintf(out, 414 "};\n" 415 "\n" 416 "local const z_word_t FAR crc_braid_big_table[][256] = {\n"); 417 for (k = 0; k < 4; k++) { 418 fprintf(out, " {"); 419 write_table32hi(out, big[k], 256); 420 fprintf(out, "}%s", k < 3 ? ",\n" : ""); 421 } 422 fprintf(out, 423 "};\n" 424 "\n" 425 "#endif\n" 426 "\n" 427 "#endif\n"); 428 } 429 fprintf(out, 430 "\n" 431 "#endif\n"); 432 433 /* write out zeros operator table to crc32.h */ 434 fprintf(out, 435 "\n" 436 "local const z_crc_t FAR x2n_table[] = {\n" 437 " "); 438 write_table(out, x2n_table, 32); 439 fprintf(out, 440 "};\n"); 441 fclose(out); 442 } 443 #endif /* MAKECRCH */ 444 } 445 446 #ifdef MAKECRCH 447 448 /* 449 Write the 32-bit values in table[0..k-1] to out, five per line in 450 hexadecimal separated by commas. 451 */ 452 local void write_table(out, table, k) 453 FILE *out; 454 const z_crc_t FAR *table; 455 int k; 456 { 457 int n; 458 459 for (n = 0; n < k; n++) 460 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ", 461 (unsigned long)(table[n]), 462 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", ")); 463 } 464 465 /* 466 Write the high 32-bits of each value in table[0..k-1] to out, five per line 467 in hexadecimal separated by commas. 468 */ 469 local void write_table32hi(out, table, k) 470 FILE *out; 471 const z_word_t FAR *table; 472 int k; 473 { 474 int n; 475 476 for (n = 0; n < k; n++) 477 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ", 478 (unsigned long)(table[n] >> 32), 479 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", ")); 480 } 481 482 /* 483 Write the 64-bit values in table[0..k-1] to out, three per line in 484 hexadecimal separated by commas. This assumes that if there is a 64-bit 485 type, then there is also a long long integer type, and it is at least 64 486 bits. If not, then the type cast and format string can be adjusted 487 accordingly. 488 */ 489 local void write_table64(out, table, k) 490 FILE *out; 491 const z_word_t FAR *table; 492 int k; 493 { 494 int n; 495 496 for (n = 0; n < k; n++) 497 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ", 498 (unsigned long long)(table[n]), 499 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", ")); 500 } 501 502 /* Actually do the deed. */ 503 int main() 504 { 505 make_crc_table(); 506 return 0; 507 } 508 509 #endif /* MAKECRCH */ 510 511 #ifdef W 512 /* 513 Generate the little and big-endian braid tables for the given n and z_word_t 514 size w. Each array must have room for w blocks of 256 elements. 515 */ 516 local void braid(ltl, big, n, w) 517 z_crc_t ltl[][256]; 518 z_word_t big[][256]; 519 int n; 520 int w; 521 { 522 int k; 523 z_crc_t i, p, q; 524 for (k = 0; k < w; k++) { 525 p = x2nmodp((n * w + 3 - k) << 3, 0); 526 ltl[k][0] = 0; 527 big[w - 1 - k][0] = 0; 528 for (i = 1; i < 256; i++) { 529 ltl[k][i] = q = multmodp(i << 24, p); 530 big[w - 1 - k][i] = byte_swap(q); 531 } 532 } 533 } 534 #endif 535 536 #else /* !DYNAMIC_CRC_TABLE */ 537 /* ======================================================================== 538 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers 539 * of x for combining CRC-32s, all made by make_crc_table(). 540 */ 541 #include "crc32.h" 542 #endif /* DYNAMIC_CRC_TABLE */ 543 544 /* ======================================================================== 545 * Routines used for CRC calculation. Some are also required for the table 546 * generation above. 547 */ 548 549 /* 550 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial, 551 reflected. For speed, this requires that a not be zero. 552 */ 553 local z_crc_t multmodp(a, b) 554 z_crc_t a; 555 z_crc_t b; 556 { 557 z_crc_t m, p; 558 559 m = (z_crc_t)1 << 31; 560 p = 0; 561 for (;;) { 562 if (a & m) { 563 p ^= b; 564 if ((a & (m - 1)) == 0) 565 break; 566 } 567 m >>= 1; 568 b = b & 1 ? (b >> 1) ^ POLY : b >> 1; 569 } 570 return p; 571 } 572 573 /* 574 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been 575 initialized. 576 */ 577 local z_crc_t x2nmodp(n, k) 578 z_off64_t n; 579 unsigned k; 580 { 581 z_crc_t p; 582 583 p = (z_crc_t)1 << 31; /* x^0 == 1 */ 584 while (n) { 585 if (n & 1) 586 p = multmodp(x2n_table[k & 31], p); 587 n >>= 1; 588 k++; 589 } 590 return p; 591 } 592 593 /* ========================================================================= 594 * This function can be used by asm versions of crc32(), and to force the 595 * generation of the CRC tables in a threaded application. 596 */ 597 const z_crc_t FAR * ZEXPORT get_crc_table() 598 { 599 #ifdef DYNAMIC_CRC_TABLE 600 once(&made, make_crc_table); 601 #endif /* DYNAMIC_CRC_TABLE */ 602 return (const z_crc_t FAR *)crc_table; 603 } 604 605 /* ========================================================================= 606 * Use ARM machine instructions if available. This will compute the CRC about 607 * ten times faster than the braided calculation. This code does not check for 608 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will 609 * only be defined if the compilation specifies an ARM processor architecture 610 * that has the instructions. For example, compiling with -march=armv8.1-a or 611 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32 612 * instructions. 613 */ 614 #ifdef ARMCRC32 615 616 /* 617 Constants empirically determined to maximize speed. These values are from 618 measurements on a Cortex-A57. Your mileage may vary. 619 */ 620 #define Z_BATCH 3990 /* number of words in a batch */ 621 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */ 622 #define Z_BATCH_MIN 800 /* fewest words in a final batch */ 623 624 unsigned long ZEXPORT crc32_z(crc, buf, len) 625 unsigned long crc; 626 const unsigned char FAR *buf; 627 z_size_t len; 628 { 629 z_crc_t val; 630 z_word_t crc1, crc2; 631 const z_word_t *word; 632 z_word_t val0, val1, val2; 633 z_size_t last, last2, i; 634 z_size_t num; 635 636 /* Return initial CRC, if requested. */ 637 if (buf == Z_NULL) return 0; 638 639 #ifdef DYNAMIC_CRC_TABLE 640 once(&made, make_crc_table); 641 #endif /* DYNAMIC_CRC_TABLE */ 642 643 /* Pre-condition the CRC */ 644 crc = (~crc) & 0xffffffff; 645 646 /* Compute the CRC up to a word boundary. */ 647 while (len && ((z_size_t)buf & 7) != 0) { 648 len--; 649 val = *buf++; 650 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); 651 } 652 653 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */ 654 word = (z_word_t const *)buf; 655 num = len >> 3; 656 len &= 7; 657 658 /* Do three interleaved CRCs to realize the throughput of one crc32x 659 instruction per cycle. Each CRC is calculated on Z_BATCH words. The 660 three CRCs are combined into a single CRC after each set of batches. */ 661 while (num >= 3 * Z_BATCH) { 662 crc1 = 0; 663 crc2 = 0; 664 for (i = 0; i < Z_BATCH; i++) { 665 val0 = word[i]; 666 val1 = word[i + Z_BATCH]; 667 val2 = word[i + 2 * Z_BATCH]; 668 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 669 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); 670 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); 671 } 672 word += 3 * Z_BATCH; 673 num -= 3 * Z_BATCH; 674 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1; 675 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2; 676 } 677 678 /* Do one last smaller batch with the remaining words, if there are enough 679 to pay for the combination of CRCs. */ 680 last = num / 3; 681 if (last >= Z_BATCH_MIN) { 682 last2 = last << 1; 683 crc1 = 0; 684 crc2 = 0; 685 for (i = 0; i < last; i++) { 686 val0 = word[i]; 687 val1 = word[i + last]; 688 val2 = word[i + last2]; 689 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 690 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); 691 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); 692 } 693 word += 3 * last; 694 num -= 3 * last; 695 val = x2nmodp(last, 6); 696 crc = multmodp(val, crc) ^ crc1; 697 crc = multmodp(val, crc) ^ crc2; 698 } 699 700 /* Compute the CRC on any remaining words. */ 701 for (i = 0; i < num; i++) { 702 val0 = word[i]; 703 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 704 } 705 word += num; 706 707 /* Complete the CRC on any remaining bytes. */ 708 buf = (const unsigned char FAR *)word; 709 while (len) { 710 len--; 711 val = *buf++; 712 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); 713 } 714 715 /* Return the CRC, post-conditioned. */ 716 return crc ^ 0xffffffff; 717 } 718 719 #else 720 721 #ifdef W 722 723 /* 724 Return the CRC of the W bytes in the word_t data, taking the 725 least-significant byte of the word as the first byte of data, without any pre 726 or post conditioning. This is used to combine the CRCs of each braid. 727 */ 728 local z_crc_t crc_word(data) 729 z_word_t data; 730 { 731 int k; 732 for (k = 0; k < W; k++) 733 data = (data >> 8) ^ crc_table[data & 0xff]; 734 return (z_crc_t)data; 735 } 736 737 local z_word_t crc_word_big(data) 738 z_word_t data; 739 { 740 int k; 741 for (k = 0; k < W; k++) 742 data = (data << 8) ^ 743 crc_big_table[(data >> ((W - 1) << 3)) & 0xff]; 744 return data; 745 } 746 747 #endif 748 749 /* ========================================================================= */ 750 unsigned long ZEXPORT crc32_z(crc, buf, len) 751 unsigned long crc; 752 const unsigned char FAR *buf; 753 z_size_t len; 754 { 755 /* Return initial CRC, if requested. */ 756 if (buf == Z_NULL) return 0; 757 758 #ifdef DYNAMIC_CRC_TABLE 759 once(&made, make_crc_table); 760 #endif /* DYNAMIC_CRC_TABLE */ 761 762 /* Pre-condition the CRC */ 763 crc = (~crc) & 0xffffffff; 764 765 #ifdef W 766 767 /* If provided enough bytes, do a braided CRC calculation. */ 768 if (len >= N * W + W - 1) { 769 z_size_t blks; 770 z_word_t const *words; 771 unsigned endian; 772 int k; 773 774 /* Compute the CRC up to a z_word_t boundary. */ 775 while (len && ((z_size_t)buf & (W - 1)) != 0) { 776 len--; 777 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 778 } 779 780 /* Compute the CRC on as many N z_word_t blocks as are available. */ 781 blks = len / (N * W); 782 len -= blks * N * W; 783 words = (z_word_t const *)buf; 784 785 /* Do endian check at execution time instead of compile time, since ARM 786 processors can change the endianess at execution time. If the 787 compiler knows what the endianess will be, it can optimize out the 788 check and the unused branch. */ 789 endian = 1; 790 if (*(unsigned char *)&endian) { 791 /* Little endian. */ 792 793 z_crc_t crc0; 794 z_word_t word0; 795 #if N > 1 796 z_crc_t crc1; 797 z_word_t word1; 798 #if N > 2 799 z_crc_t crc2; 800 z_word_t word2; 801 #if N > 3 802 z_crc_t crc3; 803 z_word_t word3; 804 #if N > 4 805 z_crc_t crc4; 806 z_word_t word4; 807 #if N > 5 808 z_crc_t crc5; 809 z_word_t word5; 810 #endif 811 #endif 812 #endif 813 #endif 814 #endif 815 816 /* Initialize the CRC for each braid. */ 817 crc0 = crc; 818 #if N > 1 819 crc1 = 0; 820 #if N > 2 821 crc2 = 0; 822 #if N > 3 823 crc3 = 0; 824 #if N > 4 825 crc4 = 0; 826 #if N > 5 827 crc5 = 0; 828 #endif 829 #endif 830 #endif 831 #endif 832 #endif 833 834 /* 835 Process the first blks-1 blocks, computing the CRCs on each braid 836 independently. 837 */ 838 while (--blks) { 839 /* Load the word for each braid into registers. */ 840 word0 = crc0 ^ words[0]; 841 #if N > 1 842 word1 = crc1 ^ words[1]; 843 #if N > 2 844 word2 = crc2 ^ words[2]; 845 #if N > 3 846 word3 = crc3 ^ words[3]; 847 #if N > 4 848 word4 = crc4 ^ words[4]; 849 #if N > 5 850 word5 = crc5 ^ words[5]; 851 #endif 852 #endif 853 #endif 854 #endif 855 #endif 856 words += N; 857 858 /* Compute and update the CRC for each word. The loop should 859 get unrolled. */ 860 crc0 = crc_braid_table[0][word0 & 0xff]; 861 #if N > 1 862 crc1 = crc_braid_table[0][word1 & 0xff]; 863 #if N > 2 864 crc2 = crc_braid_table[0][word2 & 0xff]; 865 #if N > 3 866 crc3 = crc_braid_table[0][word3 & 0xff]; 867 #if N > 4 868 crc4 = crc_braid_table[0][word4 & 0xff]; 869 #if N > 5 870 crc5 = crc_braid_table[0][word5 & 0xff]; 871 #endif 872 #endif 873 #endif 874 #endif 875 #endif 876 for (k = 1; k < W; k++) { 877 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff]; 878 #if N > 1 879 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff]; 880 #if N > 2 881 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff]; 882 #if N > 3 883 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff]; 884 #if N > 4 885 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff]; 886 #if N > 5 887 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff]; 888 #endif 889 #endif 890 #endif 891 #endif 892 #endif 893 } 894 } 895 896 /* 897 Process the last block, combining the CRCs of the N braids at the 898 same time. 899 */ 900 crc = crc_word(crc0 ^ words[0]); 901 #if N > 1 902 crc = crc_word(crc1 ^ words[1] ^ crc); 903 #if N > 2 904 crc = crc_word(crc2 ^ words[2] ^ crc); 905 #if N > 3 906 crc = crc_word(crc3 ^ words[3] ^ crc); 907 #if N > 4 908 crc = crc_word(crc4 ^ words[4] ^ crc); 909 #if N > 5 910 crc = crc_word(crc5 ^ words[5] ^ crc); 911 #endif 912 #endif 913 #endif 914 #endif 915 #endif 916 words += N; 917 } 918 else { 919 /* Big endian. */ 920 921 z_word_t crc0, word0, comb; 922 #if N > 1 923 z_word_t crc1, word1; 924 #if N > 2 925 z_word_t crc2, word2; 926 #if N > 3 927 z_word_t crc3, word3; 928 #if N > 4 929 z_word_t crc4, word4; 930 #if N > 5 931 z_word_t crc5, word5; 932 #endif 933 #endif 934 #endif 935 #endif 936 #endif 937 938 /* Initialize the CRC for each braid. */ 939 crc0 = byte_swap(crc); 940 #if N > 1 941 crc1 = 0; 942 #if N > 2 943 crc2 = 0; 944 #if N > 3 945 crc3 = 0; 946 #if N > 4 947 crc4 = 0; 948 #if N > 5 949 crc5 = 0; 950 #endif 951 #endif 952 #endif 953 #endif 954 #endif 955 956 /* 957 Process the first blks-1 blocks, computing the CRCs on each braid 958 independently. 959 */ 960 while (--blks) { 961 /* Load the word for each braid into registers. */ 962 word0 = crc0 ^ words[0]; 963 #if N > 1 964 word1 = crc1 ^ words[1]; 965 #if N > 2 966 word2 = crc2 ^ words[2]; 967 #if N > 3 968 word3 = crc3 ^ words[3]; 969 #if N > 4 970 word4 = crc4 ^ words[4]; 971 #if N > 5 972 word5 = crc5 ^ words[5]; 973 #endif 974 #endif 975 #endif 976 #endif 977 #endif 978 words += N; 979 980 /* Compute and update the CRC for each word. The loop should 981 get unrolled. */ 982 crc0 = crc_braid_big_table[0][word0 & 0xff]; 983 #if N > 1 984 crc1 = crc_braid_big_table[0][word1 & 0xff]; 985 #if N > 2 986 crc2 = crc_braid_big_table[0][word2 & 0xff]; 987 #if N > 3 988 crc3 = crc_braid_big_table[0][word3 & 0xff]; 989 #if N > 4 990 crc4 = crc_braid_big_table[0][word4 & 0xff]; 991 #if N > 5 992 crc5 = crc_braid_big_table[0][word5 & 0xff]; 993 #endif 994 #endif 995 #endif 996 #endif 997 #endif 998 for (k = 1; k < W; k++) { 999 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff]; 1000 #if N > 1 1001 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff]; 1002 #if N > 2 1003 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff]; 1004 #if N > 3 1005 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff]; 1006 #if N > 4 1007 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff]; 1008 #if N > 5 1009 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff]; 1010 #endif 1011 #endif 1012 #endif 1013 #endif 1014 #endif 1015 } 1016 } 1017 1018 /* 1019 Process the last block, combining the CRCs of the N braids at the 1020 same time. 1021 */ 1022 comb = crc_word_big(crc0 ^ words[0]); 1023 #if N > 1 1024 comb = crc_word_big(crc1 ^ words[1] ^ comb); 1025 #if N > 2 1026 comb = crc_word_big(crc2 ^ words[2] ^ comb); 1027 #if N > 3 1028 comb = crc_word_big(crc3 ^ words[3] ^ comb); 1029 #if N > 4 1030 comb = crc_word_big(crc4 ^ words[4] ^ comb); 1031 #if N > 5 1032 comb = crc_word_big(crc5 ^ words[5] ^ comb); 1033 #endif 1034 #endif 1035 #endif 1036 #endif 1037 #endif 1038 words += N; 1039 crc = byte_swap(comb); 1040 } 1041 1042 /* 1043 Update the pointer to the remaining bytes to process. 1044 */ 1045 buf = (unsigned char const *)words; 1046 } 1047 1048 #endif /* W */ 1049 1050 /* Complete the computation of the CRC on any remaining bytes. */ 1051 while (len >= 8) { 1052 len -= 8; 1053 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1054 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1055 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1056 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1057 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1058 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1059 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1060 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1061 } 1062 while (len) { 1063 len--; 1064 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1065 } 1066 1067 /* Return the CRC, post-conditioned. */ 1068 return crc ^ 0xffffffff; 1069 } 1070 1071 #endif 1072 1073 /* ========================================================================= */ 1074 unsigned long ZEXPORT crc32(crc, buf, len) 1075 unsigned long crc; 1076 const unsigned char FAR *buf; 1077 uInt len; 1078 { 1079 return crc32_z(crc, buf, len); 1080 } 1081 1082 /* ========================================================================= */ 1083 uLong ZEXPORT crc32_combine64(crc1, crc2, len2) 1084 uLong crc1; 1085 uLong crc2; 1086 z_off64_t len2; 1087 { 1088 #ifdef DYNAMIC_CRC_TABLE 1089 once(&made, make_crc_table); 1090 #endif /* DYNAMIC_CRC_TABLE */ 1091 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff); 1092 } 1093 1094 /* ========================================================================= */ 1095 uLong ZEXPORT crc32_combine(crc1, crc2, len2) 1096 uLong crc1; 1097 uLong crc2; 1098 z_off_t len2; 1099 { 1100 return crc32_combine64(crc1, crc2, (z_off64_t)len2); 1101 } 1102 1103 /* ========================================================================= */ 1104 uLong ZEXPORT crc32_combine_gen64(len2) 1105 z_off64_t len2; 1106 { 1107 #ifdef DYNAMIC_CRC_TABLE 1108 once(&made, make_crc_table); 1109 #endif /* DYNAMIC_CRC_TABLE */ 1110 return x2nmodp(len2, 3); 1111 } 1112 1113 /* ========================================================================= */ 1114 uLong ZEXPORT crc32_combine_gen(len2) 1115 z_off_t len2; 1116 { 1117 return crc32_combine_gen64((z_off64_t)len2); 1118 } 1119 1120 /* ========================================================================= */ 1121 uLong ZEXPORT crc32_combine_op(crc1, crc2, op) 1122 uLong crc1; 1123 uLong crc2; 1124 uLong op; 1125 { 1126 return multmodp(op, crc1) ^ (crc2 & 0xffffffff); 1127 } 1128