/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <stdalign.h>
#include <string.h>

#include <rte_common.h>
#include <rte_vect.h>
#include <rte_branch_prediction.h>

#include "net_crc.h"

1499a2dd95SBruce Richardson /** PCLMULQDQ CRC computation context structure */
1599a2dd95SBruce Richardson struct crc_pclmulqdq_ctx {
1699a2dd95SBruce Richardson 	__m128i rk1_rk2;
1799a2dd95SBruce Richardson 	__m128i rk5_rk6;
1899a2dd95SBruce Richardson 	__m128i rk7_rk8;
1999a2dd95SBruce Richardson };
2099a2dd95SBruce Richardson 
21e9fd1ebfSTyler Retzlaff static alignas(16) struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq;
22e9fd1ebfSTyler Retzlaff static alignas(16) struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq;
2399a2dd95SBruce Richardson /**
2499a2dd95SBruce Richardson  * @brief Performs one folding round
2599a2dd95SBruce Richardson  *
2699a2dd95SBruce Richardson  * Logically function operates as follows:
2799a2dd95SBruce Richardson  *     DATA = READ_NEXT_16BYTES();
2899a2dd95SBruce Richardson  *     F1 = LSB8(FOLD)
2999a2dd95SBruce Richardson  *     F2 = MSB8(FOLD)
3099a2dd95SBruce Richardson  *     T1 = CLMUL(F1, RK1)
3199a2dd95SBruce Richardson  *     T2 = CLMUL(F2, RK2)
3299a2dd95SBruce Richardson  *     FOLD = XOR(T1, T2, DATA)
3399a2dd95SBruce Richardson  *
3499a2dd95SBruce Richardson  * @param data_block
3599a2dd95SBruce Richardson  *   16 byte data block
3699a2dd95SBruce Richardson  * @param precomp
3799a2dd95SBruce Richardson  *   Precomputed rk1 constant
3899a2dd95SBruce Richardson  * @param fold
3999a2dd95SBruce Richardson  *   Current16 byte folded data
4099a2dd95SBruce Richardson  *
4199a2dd95SBruce Richardson  * @return
4299a2dd95SBruce Richardson  *   New 16 byte folded data
4399a2dd95SBruce Richardson  */
4499a2dd95SBruce Richardson static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,__m128i precomp,__m128i fold)4599a2dd95SBruce Richardson crcr32_folding_round(__m128i data_block,
4699a2dd95SBruce Richardson 		__m128i precomp,
4799a2dd95SBruce Richardson 		__m128i fold)
4899a2dd95SBruce Richardson {
4999a2dd95SBruce Richardson 	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
5099a2dd95SBruce Richardson 	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
5199a2dd95SBruce Richardson 
5299a2dd95SBruce Richardson 	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
5399a2dd95SBruce Richardson }
5499a2dd95SBruce Richardson 
5599a2dd95SBruce Richardson /**
5699a2dd95SBruce Richardson  * Performs reduction from 128 bits to 64 bits
5799a2dd95SBruce Richardson  *
5899a2dd95SBruce Richardson  * @param data128
5999a2dd95SBruce Richardson  *   128 bits data to be reduced
6099a2dd95SBruce Richardson  * @param precomp
6199a2dd95SBruce Richardson  *   precomputed constants rk5, rk6
6299a2dd95SBruce Richardson  *
6399a2dd95SBruce Richardson  * @return
6499a2dd95SBruce Richardson  *  64 bits reduced data
6599a2dd95SBruce Richardson  */
6699a2dd95SBruce Richardson 
6799a2dd95SBruce Richardson static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128,__m128i precomp)6899a2dd95SBruce Richardson crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
6999a2dd95SBruce Richardson {
7099a2dd95SBruce Richardson 	__m128i tmp0, tmp1, tmp2;
7199a2dd95SBruce Richardson 
7299a2dd95SBruce Richardson 	/* 64b fold */
7399a2dd95SBruce Richardson 	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
7499a2dd95SBruce Richardson 	tmp1 = _mm_srli_si128(data128, 8);
7599a2dd95SBruce Richardson 	tmp0 = _mm_xor_si128(tmp0, tmp1);
7699a2dd95SBruce Richardson 
7799a2dd95SBruce Richardson 	/* 32b fold */
7899a2dd95SBruce Richardson 	tmp2 = _mm_slli_si128(tmp0, 4);
7999a2dd95SBruce Richardson 	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
8099a2dd95SBruce Richardson 
8199a2dd95SBruce Richardson 	return _mm_xor_si128(tmp1, tmp0);
8299a2dd95SBruce Richardson }
8399a2dd95SBruce Richardson 
8499a2dd95SBruce Richardson /**
8599a2dd95SBruce Richardson  * Performs Barret's reduction from 64 bits to 32 bits
8699a2dd95SBruce Richardson  *
8799a2dd95SBruce Richardson  * @param data64
8899a2dd95SBruce Richardson  *   64 bits data to be reduced
8999a2dd95SBruce Richardson  * @param precomp
9099a2dd95SBruce Richardson  *   rk7 precomputed constant
9199a2dd95SBruce Richardson  *
9299a2dd95SBruce Richardson  * @return
9399a2dd95SBruce Richardson  *   reduced 32 bits data
9499a2dd95SBruce Richardson  */
9599a2dd95SBruce Richardson 
9699a2dd95SBruce Richardson static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64,__m128i precomp)9799a2dd95SBruce Richardson crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
9899a2dd95SBruce Richardson {
99e9fd1ebfSTyler Retzlaff 	static const alignas(16) uint32_t mask1[4] = {
10099a2dd95SBruce Richardson 		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
10199a2dd95SBruce Richardson 	};
10299a2dd95SBruce Richardson 
103e9fd1ebfSTyler Retzlaff 	static const alignas(16) uint32_t mask2[4] = {
10499a2dd95SBruce Richardson 		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
10599a2dd95SBruce Richardson 	};
10699a2dd95SBruce Richardson 	__m128i tmp0, tmp1, tmp2;
10799a2dd95SBruce Richardson 
10899a2dd95SBruce Richardson 	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
10999a2dd95SBruce Richardson 
11099a2dd95SBruce Richardson 	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
11199a2dd95SBruce Richardson 	tmp1 = _mm_xor_si128(tmp1, tmp0);
11299a2dd95SBruce Richardson 	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
11399a2dd95SBruce Richardson 
11499a2dd95SBruce Richardson 	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
11599a2dd95SBruce Richardson 	tmp2 = _mm_xor_si128(tmp2, tmp1);
11699a2dd95SBruce Richardson 	tmp2 = _mm_xor_si128(tmp2, tmp0);
11799a2dd95SBruce Richardson 
11899a2dd95SBruce Richardson 	return _mm_extract_epi32(tmp2, 2);
11999a2dd95SBruce Richardson }
12099a2dd95SBruce Richardson 
/*
 * Byte shuffle masks used by xmm_shift_left().
 * The middle 16 bytes are the identity indices 0x00..0x0f; the 16 bytes
 * on either side are 0xff. A 16 byte window is loaded from offset
 * (16 - num), so the identity indices slide within the mask and the
 * remaining positions hold 0xff.
 */
static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

13099a2dd95SBruce Richardson /**
13199a2dd95SBruce Richardson  * Shifts left 128 bit register by specified number of bytes
13299a2dd95SBruce Richardson  *
13399a2dd95SBruce Richardson  * @param reg
13499a2dd95SBruce Richardson  *   128 bit value
13599a2dd95SBruce Richardson  * @param num
13699a2dd95SBruce Richardson  *   number of bytes to shift left reg by (0-16)
13799a2dd95SBruce Richardson  *
13899a2dd95SBruce Richardson  * @return
13999a2dd95SBruce Richardson  *   reg << (num * 8)
14099a2dd95SBruce Richardson  */
14199a2dd95SBruce Richardson 
14299a2dd95SBruce Richardson static __rte_always_inline __m128i
xmm_shift_left(__m128i reg,const unsigned int num)14399a2dd95SBruce Richardson xmm_shift_left(__m128i reg, const unsigned int num)
14499a2dd95SBruce Richardson {
14599a2dd95SBruce Richardson 	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
14699a2dd95SBruce Richardson 
14799a2dd95SBruce Richardson 	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
14899a2dd95SBruce Richardson }
14999a2dd95SBruce Richardson 
15099a2dd95SBruce Richardson static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(const uint8_t * data,uint32_t data_len,uint32_t crc,const struct crc_pclmulqdq_ctx * params)15199a2dd95SBruce Richardson crc32_eth_calc_pclmulqdq(
15299a2dd95SBruce Richardson 	const uint8_t *data,
15399a2dd95SBruce Richardson 	uint32_t data_len,
15499a2dd95SBruce Richardson 	uint32_t crc,
15599a2dd95SBruce Richardson 	const struct crc_pclmulqdq_ctx *params)
15699a2dd95SBruce Richardson {
15799a2dd95SBruce Richardson 	__m128i temp, fold, k;
15899a2dd95SBruce Richardson 	uint32_t n;
15999a2dd95SBruce Richardson 
16099a2dd95SBruce Richardson 	/* Get CRC init value */
16199a2dd95SBruce Richardson 	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
16299a2dd95SBruce Richardson 
16399a2dd95SBruce Richardson 	/**
16499a2dd95SBruce Richardson 	 * Folding all data into single 16 byte data block
16599a2dd95SBruce Richardson 	 * Assumes: fold holds first 16 bytes of data
16699a2dd95SBruce Richardson 	 */
16799a2dd95SBruce Richardson 
16899a2dd95SBruce Richardson 	if (unlikely(data_len < 32)) {
16999a2dd95SBruce Richardson 		if (unlikely(data_len == 16)) {
17099a2dd95SBruce Richardson 			/* 16 bytes */
17199a2dd95SBruce Richardson 			fold = _mm_loadu_si128((const __m128i *)data);
17299a2dd95SBruce Richardson 			fold = _mm_xor_si128(fold, temp);
17399a2dd95SBruce Richardson 			goto reduction_128_64;
17499a2dd95SBruce Richardson 		}
17599a2dd95SBruce Richardson 
17699a2dd95SBruce Richardson 		if (unlikely(data_len < 16)) {
17799a2dd95SBruce Richardson 			/* 0 to 15 bytes */
178e9fd1ebfSTyler Retzlaff 			alignas(16) uint8_t buffer[16];
17999a2dd95SBruce Richardson 
18099a2dd95SBruce Richardson 			memset(buffer, 0, sizeof(buffer));
18199a2dd95SBruce Richardson 			memcpy(buffer, data, data_len);
18299a2dd95SBruce Richardson 
18399a2dd95SBruce Richardson 			fold = _mm_load_si128((const __m128i *)buffer);
18499a2dd95SBruce Richardson 			fold = _mm_xor_si128(fold, temp);
18599a2dd95SBruce Richardson 			if (unlikely(data_len < 4)) {
18699a2dd95SBruce Richardson 				fold = xmm_shift_left(fold, 8 - data_len);
18799a2dd95SBruce Richardson 				goto barret_reduction;
18899a2dd95SBruce Richardson 			}
18999a2dd95SBruce Richardson 			fold = xmm_shift_left(fold, 16 - data_len);
19099a2dd95SBruce Richardson 			goto reduction_128_64;
19199a2dd95SBruce Richardson 		}
19299a2dd95SBruce Richardson 		/* 17 to 31 bytes */
19399a2dd95SBruce Richardson 		fold = _mm_loadu_si128((const __m128i *)data);
19499a2dd95SBruce Richardson 		fold = _mm_xor_si128(fold, temp);
19599a2dd95SBruce Richardson 		n = 16;
19699a2dd95SBruce Richardson 		k = params->rk1_rk2;
19799a2dd95SBruce Richardson 		goto partial_bytes;
19899a2dd95SBruce Richardson 	}
19999a2dd95SBruce Richardson 
20099a2dd95SBruce Richardson 	/** At least 32 bytes in the buffer */
20199a2dd95SBruce Richardson 	/** Apply CRC initial value */
20299a2dd95SBruce Richardson 	fold = _mm_loadu_si128((const __m128i *)data);
20399a2dd95SBruce Richardson 	fold = _mm_xor_si128(fold, temp);
20499a2dd95SBruce Richardson 
20599a2dd95SBruce Richardson 	/** Main folding loop - the last 16 bytes is processed separately */
20699a2dd95SBruce Richardson 	k = params->rk1_rk2;
20799a2dd95SBruce Richardson 	for (n = 16; (n + 16) <= data_len; n += 16) {
20899a2dd95SBruce Richardson 		temp = _mm_loadu_si128((const __m128i *)&data[n]);
20999a2dd95SBruce Richardson 		fold = crcr32_folding_round(temp, k, fold);
21099a2dd95SBruce Richardson 	}
21199a2dd95SBruce Richardson 
21299a2dd95SBruce Richardson partial_bytes:
21399a2dd95SBruce Richardson 	if (likely(n < data_len)) {
21499a2dd95SBruce Richardson 
215e9fd1ebfSTyler Retzlaff 		const alignas(16) uint32_t mask3[4] = {
21699a2dd95SBruce Richardson 			0x80808080, 0x80808080, 0x80808080, 0x80808080
21799a2dd95SBruce Richardson 		};
21899a2dd95SBruce Richardson 
219e9fd1ebfSTyler Retzlaff 		const alignas(16) uint8_t shf_table[32] = {
22099a2dd95SBruce Richardson 			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
22199a2dd95SBruce Richardson 			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
22299a2dd95SBruce Richardson 			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
22399a2dd95SBruce Richardson 			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
22499a2dd95SBruce Richardson 		};
22599a2dd95SBruce Richardson 
22699a2dd95SBruce Richardson 		__m128i last16, a, b;
22799a2dd95SBruce Richardson 
22899a2dd95SBruce Richardson 		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
22999a2dd95SBruce Richardson 
23099a2dd95SBruce Richardson 		temp = _mm_loadu_si128((const __m128i *)
23199a2dd95SBruce Richardson 			&shf_table[data_len & 15]);
23299a2dd95SBruce Richardson 		a = _mm_shuffle_epi8(fold, temp);
23399a2dd95SBruce Richardson 
23499a2dd95SBruce Richardson 		temp = _mm_xor_si128(temp,
23599a2dd95SBruce Richardson 			_mm_load_si128((const __m128i *)mask3));
23699a2dd95SBruce Richardson 		b = _mm_shuffle_epi8(fold, temp);
23799a2dd95SBruce Richardson 		b = _mm_blendv_epi8(b, last16, temp);
23899a2dd95SBruce Richardson 
23999a2dd95SBruce Richardson 		/* k = rk1 & rk2 */
24099a2dd95SBruce Richardson 		temp = _mm_clmulepi64_si128(a, k, 0x01);
24199a2dd95SBruce Richardson 		fold = _mm_clmulepi64_si128(a, k, 0x10);
24299a2dd95SBruce Richardson 
24399a2dd95SBruce Richardson 		fold = _mm_xor_si128(fold, temp);
24499a2dd95SBruce Richardson 		fold = _mm_xor_si128(fold, b);
24599a2dd95SBruce Richardson 	}
24699a2dd95SBruce Richardson 
24799a2dd95SBruce Richardson 	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
24899a2dd95SBruce Richardson reduction_128_64:
24999a2dd95SBruce Richardson 	k = params->rk5_rk6;
25099a2dd95SBruce Richardson 	fold = crcr32_reduce_128_to_64(fold, k);
25199a2dd95SBruce Richardson 
25299a2dd95SBruce Richardson barret_reduction:
25399a2dd95SBruce Richardson 	k = params->rk7_rk8;
25499a2dd95SBruce Richardson 	n = crcr32_reduce_64_to_32(fold, k);
25599a2dd95SBruce Richardson 
25699a2dd95SBruce Richardson 	return n;
25799a2dd95SBruce Richardson }
25899a2dd95SBruce Richardson 
25999a2dd95SBruce Richardson void
rte_net_crc_sse42_init(void)26099a2dd95SBruce Richardson rte_net_crc_sse42_init(void)
26199a2dd95SBruce Richardson {
26299a2dd95SBruce Richardson 	uint64_t k1, k2, k5, k6;
26399a2dd95SBruce Richardson 	uint64_t p = 0, q = 0;
26499a2dd95SBruce Richardson 
26599a2dd95SBruce Richardson 	/** Initialize CRC16 data */
26699a2dd95SBruce Richardson 	k1 = 0x189aeLLU;
26799a2dd95SBruce Richardson 	k2 = 0x8e10LLU;
26899a2dd95SBruce Richardson 	k5 = 0x189aeLLU;
26999a2dd95SBruce Richardson 	k6 = 0x114aaLLU;
27099a2dd95SBruce Richardson 	q =  0x11c581910LLU;
27199a2dd95SBruce Richardson 	p =  0x10811LLU;
27299a2dd95SBruce Richardson 
27399a2dd95SBruce Richardson 	/** Save the params in context structure */
274*df2c51a9STyler Retzlaff 	crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
275*df2c51a9STyler Retzlaff 	crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
276*df2c51a9STyler Retzlaff 	crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
27799a2dd95SBruce Richardson 
27899a2dd95SBruce Richardson 	/** Initialize CRC32 data */
27999a2dd95SBruce Richardson 	k1 = 0xccaa009eLLU;
28099a2dd95SBruce Richardson 	k2 = 0x1751997d0LLU;
28199a2dd95SBruce Richardson 	k5 = 0xccaa009eLLU;
28299a2dd95SBruce Richardson 	k6 = 0x163cd6124LLU;
28399a2dd95SBruce Richardson 	q =  0x1f7011640LLU;
28499a2dd95SBruce Richardson 	p =  0x1db710641LLU;
28599a2dd95SBruce Richardson 
28699a2dd95SBruce Richardson 	/** Save the params in context structure */
287*df2c51a9STyler Retzlaff 	crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
288*df2c51a9STyler Retzlaff 	crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
289*df2c51a9STyler Retzlaff 	crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
29099a2dd95SBruce Richardson }
29199a2dd95SBruce Richardson 
29299a2dd95SBruce Richardson uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t * data,uint32_t data_len)29399a2dd95SBruce Richardson rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
29499a2dd95SBruce Richardson {
29599a2dd95SBruce Richardson 	/** return 16-bit CRC value */
29699a2dd95SBruce Richardson 	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
29799a2dd95SBruce Richardson 		data_len,
29899a2dd95SBruce Richardson 		0xffff,
29999a2dd95SBruce Richardson 		&crc16_ccitt_pclmulqdq);
30099a2dd95SBruce Richardson }
30199a2dd95SBruce Richardson 
30299a2dd95SBruce Richardson uint32_t
rte_crc32_eth_sse42_handler(const uint8_t * data,uint32_t data_len)30399a2dd95SBruce Richardson rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
30499a2dd95SBruce Richardson {
30599a2dd95SBruce Richardson 	return ~crc32_eth_calc_pclmulqdq(data,
30699a2dd95SBruce Richardson 		data_len,
30799a2dd95SBruce Richardson 		0xffffffffUL,
30899a2dd95SBruce Richardson 		&crc32_eth_pclmulqdq);
30999a2dd95SBruce Richardson }
310