/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <stdalign.h>
#include <string.h>

#include <rte_common.h>
#include <rte_vect.h>
#include <rte_branch_prediction.h>

#include "net_crc.h"

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};

static alignas(16) struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq;
static alignas(16) struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq;
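
/*
 * rk1/rk2 are the per-round folding multipliers, rk5/rk6 drive the final
 * 128 -> 64 bit reduction, and rk7/rk8 hold the Barrett reduction pair
 * (quotient approximation and polynomial). The per-CRC values are filled
 * in by rte_net_crc_sse42_init().
 */
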
/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *   DATA = READ_NEXT_16BYTES();
 *   F1 = LSB8(FOLD)
 *   F2 = MSB8(FOLD)
 *   T1 = CLMUL(F1, RK1)
 *   T2 = CLMUL(F2, RK2)
 *   FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
        __m128i precomp,
        __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
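
/*
 * Usage sketch (mirrors the main loop in crc32_eth_calc_pclmulqdq below):
 * with rk1/rk2 preloaded into "k", each 16 new input bytes are folded
 * into the running remainder:
 *
 *     fold = crcr32_folding_round(
 *             _mm_loadu_si128((const __m128i *)&data[n]), k, fold);
 */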

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */

static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32 bits data
 */

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const alignas(16) uint32_t mask1[4] = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const alignas(16) uint32_t mask2[4] = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}
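
/*
 * The two CLMULs above implement Barrett reduction: the first multiplies
 * by rk7 (an approximation of x^64 / P(x)) to estimate the quotient, the
 * second multiplies that estimate by the polynomial rk8, and the XORs
 * leave the 32-bit remainder, i.e. the CRC, in lane 2.
 */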

static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
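
/*
 * How the table drives PSHUFB: loading 16 bytes at offset (16 - num)
 * yields a shuffle mask whose 0xff entries zero a destination byte and
 * whose 0x00-0x0f entries select a source byte. For example, num == 2
 * loads {0xff, 0xff, 0x00, 0x01, ..., 0x0d}, which moves byte i of the
 * source to byte i + 2 and clears the two low bytes, i.e. a left shift
 * by two bytes.
 */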

/**
 * Shifts a 128 bit register left by the specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */

static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Fold all data into a single 16 byte data block.
         * Assumes: fold holds the first 16 bytes of data.
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        alignas(16) uint8_t buffer[16];

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

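        /*
         * A note on the partial block handling below: the last 16 input
         * bytes are reloaded as "last16" (overlapping data already folded
         * above). The shuffle/blend pair then splits the running fold so
         * that "a" carries the fold bytes still needing one more folding
         * round and "b" holds the remaining fold bytes merged with the
         * final (data_len & 15) new input bytes, letting a single extra
         * round finish without a scalar tail loop.
         */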
partial_bytes:
        if (likely(n < data_len)) {

                const alignas(16) uint32_t mask3[4] = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const alignas(16) uint8_t shf_table[32] = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k = rk1 & rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32; assumes fold holds 128 bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}

void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

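        /*
         * The constants below follow Intel's "Fast CRC Computation for
         * Generic Polynomials Using PCLMULQDQ Instruction" white paper:
         * k1/k2 and k5/k6 are x^n mod P(x) folding multipliers for the
         * respective polynomial, while q and p are the Barrett pair
         * (quotient approximation of x^64/P and the polynomial itself).
         */
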
        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q = 0x11c581910LLU;
        p = 0x10811LLU;

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
        crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
        crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q = 0x1f7011640LLU;
        p = 0x1db710641LLU;

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
        crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
        crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}

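/*
 * Minimal usage sketch (hypothetical standalone caller; in DPDK these
 * handlers are normally reached through the rte_net_crc API rather than
 * called directly):
 *
 *     rte_net_crc_sse42_init();
 *     uint32_t crc = rte_crc32_eth_sse42_handler(pkt, pkt_len);
 */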