/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>

#include "net_crc.h"

#include <x86intrin.h>

/* VPCLMULQDQ CRC computation context structure */
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

static alignas(64) struct crc_vpclmulqdq_ctx crc32_eth;
static alignas(64) struct crc_vpclmulqdq_ctx crc16_ccitt;

static uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};

static const alignas(16) uint8_t shf_table[32] = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const alignas(16) uint32_t mask[4] = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const alignas(16) uint32_t mask2[4] = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};

/* Fold the running 512-bit remainder forward and XOR in the next data block */
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}

/* Reduce two 512-bit fold accumulators to a single 128-bit remainder */
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}

/* Combine the final, possibly partial, 16-byte block with the 128-bit remainder */
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const alignas(16) uint32_t mask3[4] = {
		0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}

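/* Fold the 128-bit remainder down to 64 bits (via rk5/rk6) ahead of the
 * Barrett reduction step.
 */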
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}

/* Barrett reduction: derive the final 32-bit CRC from the folded remainder */
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}

/* Fold the remainder forward by 16 bytes and absorb the next 16 data bytes */
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}

/*
 * Core CRC computation: 512-bit VPCLMULQDQ folding over full 256-byte
 * blocks, followed by successive reductions of the tail. The polynomial
 * is selected through the constants in params.
 */
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);

	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n+64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n+128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n+192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);

		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n,
					params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res,
					params);
	} else {
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);

			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}

static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Reset the MMX state, as subsequent calculations may
	 * use other data types such as float or double.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt);
}

uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth);
}
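
/*
 * Usage sketch (illustrative only, not part of this file's API): once
 * rte_net_crc_avx512_init() has loaded the fold constants, the handlers
 * defined above can be called directly on a byte buffer. The buffer name
 * and length below are hypothetical.
 *
 *	static uint8_t frame[128];
 *	uint32_t crc32;
 *	uint16_t crc16;
 *
 *	rte_net_crc_avx512_init();
 *	crc32 = rte_crc32_eth_avx512_handler(frame, sizeof(frame));
 *	crc16 = (uint16_t)rte_crc16_ccitt_avx512_handler(frame, sizeof(frame));
 */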