/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/* VPCLMULQDQ CRC computation context structure */
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

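/*
 * One context per supported CRC polynomial. The folding and reduction
 * constants below are filled in by crc32_load_init_constants() and
 * crc16_load_init_constants() at initialization time.
 */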
static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);

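/*
 * Maps a residual byte count (0..16) to a load mask with that many low
 * bits set; used with _mm_maskz_loadu_epi8() to safely read short buffers.
 */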
static uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};

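/*
 * PSHUFB shift table: loading 16 bytes at an offset equal to the residual
 * length yields a byte-shuffle control that moves the register contents
 * into place; entries with the top bit set (0x8x) zero the resulting byte.
 */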
static const uint8_t shf_table[32] __rte_aligned(16) = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

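/* Dword masks used by barrett_reduction() to isolate the relevant words */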
static const uint32_t mask[4] __rte_aligned(16) = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const uint32_t mask2[4] __rte_aligned(16) = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};

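/*
 * One folding round over a 512-bit accumulator: carry-less multiply both
 * 64-bit halves of every 128-bit lane of 'fold' by the folding constants
 * in 'precomp' and combine the two products with the next 64 bytes of
 * data. The ternary-logic immediate 0x96 implements a three-way XOR.
 */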
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}

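/*
 * Fold the two remaining 512-bit accumulators (eight 128-bit lanes in
 * total) down to a single 128-bit value using the 7x128b and 3x128b
 * folding constants.
 */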
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}

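/*
 * Handle a tail shorter than 16 bytes: reload the last 16 bytes of the
 * buffer, use shf_table to shift the current remainder and the reloaded
 * data into alignment, blend them together and fold once more.
 */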
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const uint32_t mask3[4] __rte_aligned(16) = {
		0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}

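/*
 * Final folding step: shrink the 128-bit folded value using the rk5/rk6
 * constants before handing it to barrett_reduction().
 */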
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}

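/*
 * Barrett reduction: derive the final 32-bit CRC value from the folded
 * remainder using the precomputed rk7/rk8 constants.
 */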
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}

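/*
 * Fold one additional 16-byte block of data into the 128-bit remainder
 * and advance the running offset and remaining length.
 */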
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}

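/*
 * Generic VPCLMULQDQ CRC computation used by both the CRC-32 (Ethernet)
 * and CRC-16-CCITT handlers. Buffers larger than 255 bytes are processed
 * 256 bytes per iteration with four 512-bit accumulators, which are then
 * folded to 128 bits; any remaining 16-byte blocks and the final tail are
 * folded in afterwards. Shorter buffers take progressively simpler paths.
 * The result is then reduced with done_128() and barrett_reduction()
 * (inputs of one to three bytes jump straight to the Barrett step).
 */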
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);

	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n+64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n+128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n+192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);

		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n,
					params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res,
					params);
	} else {
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);

			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}

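/*
 * Load the precomputed folding and Barrett reduction constants for the
 * Ethernet CRC-32 polynomial into the crc32_eth context.
 */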
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

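/*
 * Load the precomputed folding and Barrett reduction constants for the
 * CRC-16-CCITT polynomial into the crc16_ccitt context.
 */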
static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

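/*
 * Populate both constant tables. Must be called once before either of
 * the AVX-512 CRC handlers below is used.
 */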
void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Reset the MMX state, as subsequent calculations may
	 * use other data types such as float and double.
	 */
	_mm_empty();
}

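/*
 * Exported handlers declared in net_crc.h. Each one runs the common
 * VPCLMULQDQ routine with its polynomial-specific context and seed value
 * and returns the bit-inverted result; the CRC-16 handler additionally
 * truncates the value to 16 bits.
 */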
uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt);
}

uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth);
}