/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *	DATA = READ_NEXT_16BYTES();
 *	F1 = LSB8(FOLD)
 *	F2 = MSB8(FOLD)
 *	T1 = CLMUL(F1, RK1)
 *	T2 = CLMUL(F2, RK2)
 *	FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block, __m128i precomp, __m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits of data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits of reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits of data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32 bits of data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
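
/*
 * The table above drives a PSHUFB-based variable byte shift: loading a
 * 16 byte mask from crc_xmm_shift_tab + 16 - num yields num leading
 * 0xff bytes (PSHUFB zeroes any lane whose mask byte has its top bit
 * set) followed by the byte indices 0x00, 0x01, ... For example,
 * num == 3 selects the mask
 *
 *	ff ff ff 00 01 02 03 04 05 06 07 08 09 0a 0b 0c
 *
 * so result[i] = 0 for i < 3 and result[i] = reg[i - 3] otherwise,
 * i.e. reg shifted left by 3 bytes. xmm_shift_left() below relies on
 * exactly this property.
 */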

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Fold all data into a single 16 byte data block.
	 * Assumes: fold holds the first 16 bytes of data.
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)
			&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k still holds rk1 and rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/** Reduction 128 -> 32; assumes fold holds 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
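
/*
 * Illustrative cross-check (a minimal sketch, not part of the original
 * file; the helper name is hypothetical): a bit-at-a-time CRC-32 over
 * the reflected Ethernet polynomial 0xedb88320, using the same
 * 0xffffffff init value and final inversion as
 * rte_crc32_eth_sse42_handler(), whose output it should therefore
 * match. Useful as a unit-test oracle for the vector path above.
 */
static __rte_unused uint32_t
crc32_eth_bitwise_ref(const uint8_t *data, uint32_t data_len)
{
	uint32_t crc = 0xffffffff;

	while (data_len--) {
		unsigned int i;

		crc ^= *data++;
		for (i = 0; i < 8; i++)
			/* conditionally XOR in the reflected polynomial */
			crc = (crc >> 1) ^ (0xedb88320 & (0U - (crc & 1)));
	}
	return ~crc;
}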

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/** Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q = 0x11c581910LLU;
	p = 0x10811LLU;

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/** Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q = 0x1f7011640LLU;
	p = 0x1db710641LLU;

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/**
	 * Reset the MMX state: the _mm_cvtsi64_m64() conversions above
	 * use MMX registers, and following calculations may use other
	 * data types such as float, double, etc.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
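
/*
 * Usage sketch (an assumption about how callers reach these handlers;
 * the API calls below are illustrative, from the rte_net_crc API of
 * this era, rather than confirmed by this file):
 *
 *	rte_net_crc_set_alg(RTE_NET_CRC_SSE42);
 *	uint32_t crc = rte_net_crc_calc(buf, buf_len, RTE_NET_CRC32_ETH);
 *
 * Alternatively, once rte_net_crc_sse42_init() has populated the
 * PCLMULQDQ contexts, the handlers above can be called directly:
 *
 *	uint32_t crc = rte_crc32_eth_sse42_handler(buf, buf_len);
 */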