/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>

#include "net_crc.h"

#include <x86intrin.h>

/* VPCLMULQDQ CRC computation context structure */
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

static alignas(64) struct crc_vpclmulqdq_ctx crc32_eth;
static alignas(64) struct crc_vpclmulqdq_ctx crc16_ccitt;

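/* Map a byte count (0..16) to the load mask used with
 * _mm_maskz_loadu_epi8() when fewer than 16 input bytes remain.
 */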
static uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};

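/* PSHUFB shuffle patterns used to shift/align the bytes of a partial 16-byte block */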
static const alignas(16) uint8_t shf_table[32] = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const alignas(16) uint32_t mask[4] = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const alignas(16) uint32_t mask2[4] = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};

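/*
 * Single folding round: carry-less multiply the running fold value by the
 * two 64-bit halves of the precomputed constant and XOR both products with
 * the next data block in one ternary-logic instruction.
 */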
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}

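/*
 * Fold the eight 128-bit lanes held in two 512-bit accumulators down to a
 * single 128-bit value using the per-lane fold constants.
 */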
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}

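/*
 * Handle the final 1..15 trailing bytes: align the previous 128-bit
 * remainder and the last (unaligned) 16 bytes of the buffer with PSHUFB,
 * blend them and perform one more folding step.
 */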
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const alignas(16) uint32_t mask3[4] = {
		0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}

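/*
 * Fold the remaining 128-bit value using rk5/rk6 in preparation for the
 * final Barrett reduction.
 */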
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}

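/*
 * Barrett reduction: derive the final 32-bit CRC from the folded remainder
 * using the rk7/rk8 constants.
 */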
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}

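/* Fold one further 16-byte block of input into the running 128-bit remainder */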
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}

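/*
 * Compute the CRC using VPCLMULQDQ folding: buffers longer than 255 bytes
 * are folded 256 bytes per iteration across four 512-bit accumulators,
 * shorter buffers take the 128-bit and partial-block paths, and the folded
 * remainder is then reduced to the final CRC value.
 */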
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);

	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n+64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n+128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n+192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);

		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n,
					params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res,
					params);
	} else {
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);

			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}

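/* Load the precomputed folding and Barrett reduction constants for the
 * CRC32 (Ethernet) polynomial.
 */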
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

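/* Load the precomputed folding and Barrett reduction constants for the
 * CRC16-CCITT polynomial.
 */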
static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}

void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Empty the MMX state, as the following calculations may use
	 * other data types such as float, double, etc.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt);
}

uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth);
}