/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
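
/*
 * Note on the rk constants (a sketch of the usual derivation from the
 * Intel "Fast CRC Computation Using PCLMULQDQ" white paper; the exact
 * exponents below are an assumption, not stated in this file):
 *   rk1, rk2 - 16 byte folding constants for the main loop, roughly
 *              x^(128+64) mod P(x) and x^128 mod P(x) (bit-reflected);
 *   rk5, rk6 - constants for the 128- to 64-bit reduction;
 *   rk7, rk8 - Barrett constants: mu = floor(x^64 / P(x)) and P(x).
 * rte_net_crc_sse42_init() below fills both contexts with the values
 * precomputed for CRC-16/CCITT and CRC-32/Ethernet.
 */
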
/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
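
/*
 * For reference, a minimal scalar model of what one lane of
 * _mm_clmulepi64_si128() computes: a carry-less (GF(2)) multiplication
 * of two 64-bit operands, truncated here to the low 64 bits of the
 * 127-bit product. Illustrative only; the guard macro is hypothetical
 * and the function is not part of the build.
 */
#ifdef NET_CRC_SSE_SELFTEST
static uint64_t
clmul64_lo(uint64_t a, uint64_t b)
{
	uint64_t r = 0;
	unsigned int i;

	/* XOR (carry-less add) a shifted copy of 'a' for each set bit of 'b' */
	for (i = 0; i < 64; i++)
		if (b & (1ULL << i))
			r ^= a << i;

	return r;
}
#endif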

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}
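
/*
 * In the function above, the "64b fold" multiplies the low 8 bytes of
 * data128 by rk5 and XORs in the high 8 bytes; the "32b fold" then
 * multiplies the shifted intermediate by rk6. The result is a 64-bit
 * value with the same remainder modulo P(x), ready for the Barrett
 * step below.
 */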

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 and rk8 precomputed constants
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}
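
/*
 * Barrett's reduction over GF(2), as a sketch of the standard
 * construction: with mu = floor(x^64 / P(x)) and the polynomial P(x)
 * precomputed in rk7 and rk8,
 *   T1 = floor(R(x) / x^32) * mu
 *   T2 = floor(T1 / x^32) * P(x)
 *   CRC = (R(x) xor T2) mod x^32
 * The mask1/mask2 loads above isolate the 32-bit slices that stand in
 * for the divisions by x^32.
 */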

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
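
/*
 * How the table trick works: PSHUFB zeroes every destination byte whose
 * control byte has its MSB set, so the 0xff entries produce zero fill
 * while the 0x00-0x0f entries select bytes of 'reg'. For example, with
 * num = 3 the control vector loaded is
 * { 0xff, 0xff, 0xff, 0x00, 0x01, ..., 0x0c }, i.e. 'reg' shifted left
 * by 3 bytes with zeros shifted in.
 */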

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/*
	 * Fold all data into a single 16 byte data block.
	 * Assumes: fold holds the first 16 bytes of data.
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/* At least 32 bytes in the buffer */
	/* Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/* Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}
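
	/*
	 * Tail handling above: 'a' is the part of 'fold' that must be
	 * folded once more, while 'b' blends the remaining bytes of
	 * 'fold' with the (possibly overlapping) load of the last 16
	 * input bytes. shf_table indexed by data_len & 15, plus the
	 * mask3 XOR, build the two complementary PSHUFB/blend control
	 * vectors.
	 */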

	/* Reduction 128 -> 32; assumes fold holds 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/* Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q =  0x11c581910LLU;
	p =  0x10811LLU;

	/* Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/* Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q =  0x1f7011640LLU;
	p =  0x1db710641LLU;

	/* Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/*
	 * Clear the MMX state (_mm_cvtsi64_m64() above touches MMX
	 * registers), as following calculations may use other data
	 * types such as float, double, etc.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
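
/*
 * Minimal usage sketch (the guard macro is hypothetical, not part of
 * the build): initialize the contexts once, then hand any buffer to a
 * handler. The scalar loop is a plain bit-by-bit CRC-32 with the
 * reflected Ethernet polynomial 0xedb88320, init 0xffffffff and final
 * inversion, which the PCLMULQDQ result can be checked against.
 */
#ifdef NET_CRC_SSE_SELFTEST
#include <assert.h>

static uint32_t
crc32_eth_scalar(const uint8_t *data, uint32_t data_len)
{
	uint32_t crc = 0xffffffff;
	uint32_t i;
	int bit;

	for (i = 0; i < data_len; i++) {
		crc ^= data[i];
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
	}

	return ~crc;
}

static void
net_crc_sse_selftest(void)
{
	uint8_t buf[73];
	uint32_t i;

	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 7 + 1);

	rte_net_crc_sse42_init();
	assert(rte_crc32_eth_sse42_handler(buf, sizeof(buf)) ==
		crc32_eth_scalar(buf, sizeof(buf)));
}
#endif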