/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/* VPCLMULQDQ CRC computation context structure */
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);

/* Load masks with the low n bits set, indexed by byte count (0..16) */
static const uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff
};

/*
 * PSHUFB control patterns for shifting a partial last block into place
 * (a set high bit in a control byte zeroes that output lane)
 */
static const uint8_t shf_table[32] __rte_aligned(16) = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const uint32_t mask[4] __rte_aligned(16) = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const uint32_t mask2[4] __rte_aligned(16) = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};

static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}

static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}
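
/*
 * Background note (added commentary, not from the original sources): the
 * folding rounds above rely on the standard CLMUL-CRC congruence (cf. Gopal
 * et al., "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
 * Instruction"), where a 128-bit value M = hi64(M) * x^64 + lo64(M) is
 * advanced T bits via
 *
 *	M(x) * x^T mod P(x) =
 *		[hi64(M) * (x^(T+64) mod P) xor
 *		 lo64(M) * (x^T mod P)] mod P(x)
 *
 * so two 64x64 carry-less multiplies against a precomputed constant pair
 * replace a long division over GF(2).  In crcr32_folding_round(), the
 * VPCLMULQDQ immediates 0x01 and 0x10 pair one half of each accumulator
 * lane with the opposite half of the constant register, and the VPTERNLOG
 * immediate 0x96 encodes the truth table of a three-way XOR (A ^ B ^ C),
 * merging both partial products with the next data block in one instruction.
 */
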
/*
 * Handle the final 1..16 tail bytes left after 16-byte folding: shift the
 * accumulated remainder apart, blend in the buffer's last (unaligned)
 * 16 bytes, and perform one more folding round.
 */
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const uint32_t mask3[4] __rte_aligned(16) = {
		0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}

/*
 * Reduce the folded 128-bit remainder to the 64-bit form expected by the
 * Barrett step, using the rk5/rk6 constants.
 */
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}

/* Final Barrett reduction from the 64-bit remainder to the 32-bit CRC */
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}
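
/*
 * Background note (added commentary, not from the original sources):
 * Barrett reduction obtains the final CRC from the folded remainder R(x)
 * without a division, using the precomputed quotient mu = floor(x^64 / P(x)):
 *
 *	T1(x)  = floor(R(x) / x^32) * mu(x)
 *	T2(x)  = floor(T1(x) / x^32) * P(x)
 *	CRC(x) = (R(x) xor T2(x)) mod x^32
 *
 * rk7_rk8 holds the (mu, P) pair in the bit order this implementation
 * expects.  The VPTERNLOG immediate 0x28 used in barrett_reduction() is
 * the truth table of (A ^ B) & C, fusing the post-multiply XOR with the
 * masking step, and 0x96 is again a three-way XOR.
 */
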
/* Fold a single 16-byte block into the running remainder */
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}

static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);

	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop, 256 bytes per iteration */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&data[n+64]);
			qw2 = _mm512_loadu_si512((const __m512i *)
					&data[n+128]);
			qw3 = _mm512_loadu_si512((const __m512i *)
					&data[n+192]);
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 byte fold: collapse four accumulators to two */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);

		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n, params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res, params);
	} else {
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);

			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}
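
/*
 * Background note (added commentary, not from the original sources): the
 * rk1..rk6 and fold_* constants loaded below are each x^k mod P(x) for a
 * shift distance k matching how far the corresponding folding step advances
 * the data: rk1_rk2 (broadcast to all four 128-bit lanes) drives the
 * 256-byte main loop, rk3_rk4 folds four ZMM accumulators down to two,
 * fold_7x128b/fold_3x128b collapse the remaining 128-bit lanes in
 * crc32_fold_128(), fold_1x128b advances a single 16-byte block, and
 * rk5_rk6 reduces 128 bits to 64.  rk7_rk8 is the Barrett (mu, P) pair.
 */
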
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Clear the MMX state: the _mm_cvtsi64_m64() conversions above use
	 * MMX registers, which alias the x87 floating-point stack, so the
	 * state must be emptied before any later float/double code runs.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
			data_len,
			0xffff,
			&crc16_ccitt);
}

uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
			data_len,
			0xffffffffUL,
			&crc32_eth);
}
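
/*
 * Usage sketch (added commentary; illustrative only): these handlers are
 * normally reached through the rte_net_crc dispatch layer rather than being
 * called directly.  Assuming CPU support for AVX-512/VPCLMULQDQ has been
 * verified and the constants initialized:
 *
 *	rte_net_crc_avx512_init();
 *	uint32_t crc32 = rte_crc32_eth_avx512_handler(buf, len);
 *	uint16_t crc16 = (uint16_t)rte_crc16_ccitt_avx512_handler(buf, len);
 *
 * As the code above shows, both handlers seed the computation with an
 * all-ones value and return the bit-inverted result; the CRC-16 value fits
 * in the low 16 bits of the returned word.
 */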