/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>
#include <rte_vect.h>

#include "net_crc.h"

/* VPCLMULQDQ CRC computation context structure */
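/*
 * rk1_rk2 and rk3_rk4 hold the folding constants for the 256-byte main loop
 * and the 256->128 byte fold; fold_7x128b, fold_3x128b and fold_1x128b
 * collapse the remaining state down to a single 128-bit remainder; rk5_rk6
 * and rk7_rk8 drive the final 64-bit fold and the Barrett reduction.
 */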
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

static alignas(64) struct crc_vpclmulqdq_ctx crc32_eth;
static alignas(64) struct crc_vpclmulqdq_ctx crc16_ccitt;
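/* Map a byte count (0..16) to a load mask with that many low bits set */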
static uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};
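/*
 * pshufb table used to shift partial blocks into place; entries with the
 * top bit set zero the corresponding output byte.
 */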
static const alignas(16) uint8_t shf_table[32] = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const alignas(16) uint32_t mask[4] = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const alignas(16) uint32_t mask2[4] = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};
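/*
 * Fold one 64-byte block into the running remainder: both halves of each
 * 128-bit lane are carry-less multiplied by the folding constants and XORed
 * with the new data (ternary-logic immediate 0x96 is a three-input XOR).
 */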
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}
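/*
 * Reduce the two 512-bit accumulators to a single 128-bit value: each
 * 128-bit lane is folded by its own distance constant (fold_7x128b and
 * fold_3x128b) and the resulting lanes are XOR-combined.
 */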
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}
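/*
 * Fold the final partial block: load the last 16 bytes of the buffer, align
 * them with the current remainder using a pshufb-based shift and blend, and
 * fold once more with fold_1x128b.
 */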
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const alignas(16) uint32_t mask3[4] = {
		   0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}
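/*
 * Fold the 128-bit remainder down with rk5 and rk6 in preparation for the
 * final Barrett reduction.
 */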
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}
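/*
 * Barrett reduction: one carry-less multiply by the quotient constant (rk7)
 * and one by the polynomial constant (rk8) yield the final 32-bit CRC,
 * which is read from the third 32-bit lane.
 */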
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 =  _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}
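/* Fold one 16-byte block of input into the running 128-bit remainder */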
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}
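/*
 * Compute the CRC of data_len bytes of data, seeded with crc. Buffers longer
 * than 255 bytes go through the 256-bytes-per-iteration folding loop with
 * four 512-bit accumulators; shorter buffers are folded in 16-byte steps,
 * and lengths below 16 bytes use a masked load plus byte shifts.
 */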
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);

	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n+64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n+128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n+192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);
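		/*
		 * Bytes still to fold in whole 16-byte steps; the final
		 * (up to 16-byte) tail is left for last_two_xmm().
		 */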
		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n,
					params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res,
					params);
	} else {
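		/* At most 255 bytes: fold in 16-byte steps where possible */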
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
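			/*
			 * Fewer than 16 bytes: masked load, then shift the
			 * value into position for the reduction.
			 */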
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);

			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}
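/*
 * Precomputed folding and reduction constants for Ethernet CRC-32
 * (polynomial 0x04c11db7), loaded once into the crc32_eth context.
 */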
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_set_epi64x(c17, c16);

	crc32_eth.rk5_rk6 = _mm_set_epi64x(c19, c18);
	crc32_eth.rk7_rk8 = _mm_set_epi64x(c21, c20);
}
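/*
 * Precomputed folding and reduction constants for CRC-16 CCITT
 * (polynomial 0x1021), loaded once into the crc16_ccitt context.
 */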
static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_set_epi64x(c17, c16);

	crc16_ccitt.rk5_rk6 = _mm_set_epi64x(c19, c18);
	crc16_ccitt.rk7_rk8 = _mm_set_epi64x(c21, c20);
}

void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();
}
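/*
 * The handlers below seed the CRC with all ones and return the one's
 * complement of the folded result.
 */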
uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt);
}

uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth);
}
409