/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <stdalign.h>
#include <string.h>

#include <rte_common.h>
#include <rte_vect.h>
#include <rte_branch_prediction.h>

#include "net_crc.h"

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static alignas(16) struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq;
static alignas(16) struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq;
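
/*
 * Per-polynomial constant contexts, populated by rte_net_crc_sse42_init():
 * rk1_rk2 drives the 16 byte folding loop, rk5_rk6 the 128 -> 64 bit
 * reduction and rk7_rk8 the final Barrett reduction.
 */
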
/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
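	/*
	 * PCLMULQDQ immediate: bit 0 selects the 64-bit half of "fold",
	 * bit 4 the half of "precomp" (rk1 sits in the low qword, rk2 in
	 * the high qword, see rte_net_crc_sse42_init()).  Both products
	 * and the next data block are XORed together, so the running
	 * remainder stays within a single XMM register per round.
	 */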
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *  64 bits reduced data
 */

static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
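	/*
	 * Two-step fold: the low 64 bits are multiplied by rk5 and XORed
	 * onto the high 64 bits, then the remaining low 32 bits of that
	 * result are folded in with rk6, so at most 64 significant bits
	 * are left for the Barrett reduction step.
	 */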
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32 bits data
 */

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const alignas(16) uint32_t mask1[4] = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const alignas(16) uint32_t mask2[4] = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;
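
	/*
	 * Barrett reduction: multiply by the precomputed quotient constant
	 * rk7 to estimate the quotient, multiply that estimate by the
	 * reflected polynomial rk8 and XOR it back to cancel the high
	 * bits; the 32-bit remainder ends up in dword 2 of tmp2.
	 */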
	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}

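/*
 * PSHUFB control bytes used by xmm_shift_left(): loading 16 bytes at
 * offset (16 - num) yields "num" leading bytes with the top bit set
 * (which _mm_shuffle_epi8() turns into zeros) followed by ascending
 * indices, i.e. a byte-wise left shift of the source register.
 */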
static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts a 128 bit register left by the specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */

static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			alignas(16) uint8_t buffer[16];

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
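				/*
				 * Fewer than 4 bytes: shifting by
				 * (16 - data_len) would push part of the
				 * seed-extended value out of the register,
				 * so use a smaller shift and go straight
				 * to Barrett reduction, skipping the
				 * 128 -> 64 fold.
				 */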
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
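		/*
		 * 1 to 15 trailing bytes remain.  The last 16 bytes of the
		 * buffer (overlapping data that has already been folded)
		 * are loaded, and PSHUFB masks derived from shf_table
		 * split the current fold into a part that needs one more
		 * folding round (a) and a part that lines up with those
		 * trailing bytes (b, merged in via the blend), so a single
		 * extra round finishes the job.
		 */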

		const alignas(16) uint32_t mask3[4] = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const alignas(16) uint8_t shf_table[32] = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

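	/*
	 * k1/k2 are the folding constants used by the 16 byte loop, k5/k6
	 * the constants for the 128 -> 64 bit reduction, p the
	 * bit-reflected CRC polynomial (including the implicit top bit)
	 * and q the corresponding Barrett quotient constant.
	 */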
	/** Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q =  0x11c581910LLU;
	p =  0x10811LLU;

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
	crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
	crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);

	/** Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q =  0x1f7011640LLU;
	p =  0x1db710641LLU;

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
	crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
	crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
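	/* Seed with all ones and complement the result, per Ethernet CRC-32 */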
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}