199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson * Copyright(c) 2017-2020 Intel Corporation
399a2dd95SBruce Richardson */
499a2dd95SBruce Richardson
5e9fd1ebfSTyler Retzlaff #include <stdalign.h>
699a2dd95SBruce Richardson #include <string.h>
799a2dd95SBruce Richardson
899a2dd95SBruce Richardson #include <rte_common.h>
9*df2c51a9STyler Retzlaff #include <rte_vect.h>
1099a2dd95SBruce Richardson #include <rte_branch_prediction.h>
1199a2dd95SBruce Richardson
1299a2dd95SBruce Richardson #include "net_crc.h"
1399a2dd95SBruce Richardson
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;	/**< rk1 (low qword) / rk2 (high qword): 16-byte folding constants */
	__m128i rk5_rk6;	/**< rk5 (low) / rk6 (high): 128-bit -> 64-bit reduction constants */
	__m128i rk7_rk8;	/**< rk7 (low) / rk8 (high): Barrett 64-bit -> 32-bit reduction constants */
};

/* Per-polynomial constant contexts; filled in by rte_net_crc_sse42_init() */
static alignas(16) struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq;
static alignas(16) struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq;
2399a2dd95SBruce Richardson /**
2499a2dd95SBruce Richardson * @brief Performs one folding round
2599a2dd95SBruce Richardson *
2699a2dd95SBruce Richardson * Logically function operates as follows:
2799a2dd95SBruce Richardson * DATA = READ_NEXT_16BYTES();
2899a2dd95SBruce Richardson * F1 = LSB8(FOLD)
2999a2dd95SBruce Richardson * F2 = MSB8(FOLD)
3099a2dd95SBruce Richardson * T1 = CLMUL(F1, RK1)
3199a2dd95SBruce Richardson * T2 = CLMUL(F2, RK2)
3299a2dd95SBruce Richardson * FOLD = XOR(T1, T2, DATA)
3399a2dd95SBruce Richardson *
3499a2dd95SBruce Richardson * @param data_block
3599a2dd95SBruce Richardson * 16 byte data block
3699a2dd95SBruce Richardson * @param precomp
3799a2dd95SBruce Richardson * Precomputed rk1 constant
3899a2dd95SBruce Richardson * @param fold
3999a2dd95SBruce Richardson * Current16 byte folded data
4099a2dd95SBruce Richardson *
4199a2dd95SBruce Richardson * @return
4299a2dd95SBruce Richardson * New 16 byte folded data
4399a2dd95SBruce Richardson */
4499a2dd95SBruce Richardson static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,__m128i precomp,__m128i fold)4599a2dd95SBruce Richardson crcr32_folding_round(__m128i data_block,
4699a2dd95SBruce Richardson __m128i precomp,
4799a2dd95SBruce Richardson __m128i fold)
4899a2dd95SBruce Richardson {
4999a2dd95SBruce Richardson __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
5099a2dd95SBruce Richardson __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
5199a2dd95SBruce Richardson
5299a2dd95SBruce Richardson return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
5399a2dd95SBruce Richardson }
5499a2dd95SBruce Richardson
5599a2dd95SBruce Richardson /**
5699a2dd95SBruce Richardson * Performs reduction from 128 bits to 64 bits
5799a2dd95SBruce Richardson *
5899a2dd95SBruce Richardson * @param data128
5999a2dd95SBruce Richardson * 128 bits data to be reduced
6099a2dd95SBruce Richardson * @param precomp
6199a2dd95SBruce Richardson * precomputed constants rk5, rk6
6299a2dd95SBruce Richardson *
6399a2dd95SBruce Richardson * @return
6499a2dd95SBruce Richardson * 64 bits reduced data
6599a2dd95SBruce Richardson */
6699a2dd95SBruce Richardson
6799a2dd95SBruce Richardson static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128,__m128i precomp)6899a2dd95SBruce Richardson crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
6999a2dd95SBruce Richardson {
7099a2dd95SBruce Richardson __m128i tmp0, tmp1, tmp2;
7199a2dd95SBruce Richardson
7299a2dd95SBruce Richardson /* 64b fold */
7399a2dd95SBruce Richardson tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
7499a2dd95SBruce Richardson tmp1 = _mm_srli_si128(data128, 8);
7599a2dd95SBruce Richardson tmp0 = _mm_xor_si128(tmp0, tmp1);
7699a2dd95SBruce Richardson
7799a2dd95SBruce Richardson /* 32b fold */
7899a2dd95SBruce Richardson tmp2 = _mm_slli_si128(tmp0, 4);
7999a2dd95SBruce Richardson tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
8099a2dd95SBruce Richardson
8199a2dd95SBruce Richardson return _mm_xor_si128(tmp1, tmp0);
8299a2dd95SBruce Richardson }
8399a2dd95SBruce Richardson
8499a2dd95SBruce Richardson /**
8599a2dd95SBruce Richardson * Performs Barret's reduction from 64 bits to 32 bits
8699a2dd95SBruce Richardson *
8799a2dd95SBruce Richardson * @param data64
8899a2dd95SBruce Richardson * 64 bits data to be reduced
8999a2dd95SBruce Richardson * @param precomp
9099a2dd95SBruce Richardson * rk7 precomputed constant
9199a2dd95SBruce Richardson *
9299a2dd95SBruce Richardson * @return
9399a2dd95SBruce Richardson * reduced 32 bits data
9499a2dd95SBruce Richardson */
9599a2dd95SBruce Richardson
9699a2dd95SBruce Richardson static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64,__m128i precomp)9799a2dd95SBruce Richardson crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
9899a2dd95SBruce Richardson {
99e9fd1ebfSTyler Retzlaff static const alignas(16) uint32_t mask1[4] = {
10099a2dd95SBruce Richardson 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
10199a2dd95SBruce Richardson };
10299a2dd95SBruce Richardson
103e9fd1ebfSTyler Retzlaff static const alignas(16) uint32_t mask2[4] = {
10499a2dd95SBruce Richardson 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
10599a2dd95SBruce Richardson };
10699a2dd95SBruce Richardson __m128i tmp0, tmp1, tmp2;
10799a2dd95SBruce Richardson
10899a2dd95SBruce Richardson tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
10999a2dd95SBruce Richardson
11099a2dd95SBruce Richardson tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
11199a2dd95SBruce Richardson tmp1 = _mm_xor_si128(tmp1, tmp0);
11299a2dd95SBruce Richardson tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
11399a2dd95SBruce Richardson
11499a2dd95SBruce Richardson tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
11599a2dd95SBruce Richardson tmp2 = _mm_xor_si128(tmp2, tmp1);
11699a2dd95SBruce Richardson tmp2 = _mm_xor_si128(tmp2, tmp0);
11799a2dd95SBruce Richardson
11899a2dd95SBruce Richardson return _mm_extract_epi32(tmp2, 2);
11999a2dd95SBruce Richardson }
12099a2dd95SBruce Richardson
/*
 * PSHUFB mask table for xmm_shift_left(): loading 16 bytes starting at
 * offset (16 - num) yields a shuffle mask that shifts a register left by
 * num bytes; 0xff entries (top bit set) select zero bytes.
 */
static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
12999a2dd95SBruce Richardson
13099a2dd95SBruce Richardson /**
13199a2dd95SBruce Richardson * Shifts left 128 bit register by specified number of bytes
13299a2dd95SBruce Richardson *
13399a2dd95SBruce Richardson * @param reg
13499a2dd95SBruce Richardson * 128 bit value
13599a2dd95SBruce Richardson * @param num
13699a2dd95SBruce Richardson * number of bytes to shift left reg by (0-16)
13799a2dd95SBruce Richardson *
13899a2dd95SBruce Richardson * @return
13999a2dd95SBruce Richardson * reg << (num * 8)
14099a2dd95SBruce Richardson */
14199a2dd95SBruce Richardson
14299a2dd95SBruce Richardson static __rte_always_inline __m128i
xmm_shift_left(__m128i reg,const unsigned int num)14399a2dd95SBruce Richardson xmm_shift_left(__m128i reg, const unsigned int num)
14499a2dd95SBruce Richardson {
14599a2dd95SBruce Richardson const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
14699a2dd95SBruce Richardson
14799a2dd95SBruce Richardson return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
14899a2dd95SBruce Richardson }
14999a2dd95SBruce Richardson
/**
 * Computes a 32-bit CRC over @p data using PCLMULQDQ carry-less
 * multiplication: 16-byte folding rounds followed by 128->64 and
 * Barrett 64->32 reductions.
 *
 * @param data
 *   Pointer to the input buffer.
 * @param data_len
 *   Length of the buffer in bytes.
 * @param crc
 *   Initial CRC value, XOR-ed into the first data bytes.
 * @param params
 *   Precomputed folding/reduction constants for the polynomial.
 *
 * @return
 *   Raw 32-bit CRC remainder (callers apply final complement/truncation).
 */
static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value into the low 32 bits of a zeroed register */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* Exactly 16 bytes: single block, no folding rounds */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes: stage into a zero-padded aligned buffer */
			alignas(16) uint8_t buffer[16];

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				/* Under 4 bytes: position data and skip the
				 * 128->64 reduction, go straight to Barrett.
				 */
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			/* Left-justify the valid bytes in the register */
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes: one full block plus a partial tail */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes is processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	/* Fold in the trailing 1-15 bytes: split the current fold between
	 * two registers so the tail bytes slot in without over-reading.
	 */
	if (likely(n < data_len)) {

		/* Top bit set in every byte: flips PSHUFB selectors between
		 * "keep byte" and "zero byte", also drives the blend below.
		 */
		const alignas(16) uint32_t mask3[4] = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		/* Indexed by (data_len & 15) to build the tail shuffle mask */
		const alignas(16) uint8_t shf_table[32] = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		/* Last 16 bytes of input (overlaps already-folded data) */
		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		/* Blend new tail bytes into the shifted fold */
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
25899a2dd95SBruce Richardson
25999a2dd95SBruce Richardson void
rte_net_crc_sse42_init(void)26099a2dd95SBruce Richardson rte_net_crc_sse42_init(void)
26199a2dd95SBruce Richardson {
26299a2dd95SBruce Richardson uint64_t k1, k2, k5, k6;
26399a2dd95SBruce Richardson uint64_t p = 0, q = 0;
26499a2dd95SBruce Richardson
26599a2dd95SBruce Richardson /** Initialize CRC16 data */
26699a2dd95SBruce Richardson k1 = 0x189aeLLU;
26799a2dd95SBruce Richardson k2 = 0x8e10LLU;
26899a2dd95SBruce Richardson k5 = 0x189aeLLU;
26999a2dd95SBruce Richardson k6 = 0x114aaLLU;
27099a2dd95SBruce Richardson q = 0x11c581910LLU;
27199a2dd95SBruce Richardson p = 0x10811LLU;
27299a2dd95SBruce Richardson
27399a2dd95SBruce Richardson /** Save the params in context structure */
274*df2c51a9STyler Retzlaff crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
275*df2c51a9STyler Retzlaff crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
276*df2c51a9STyler Retzlaff crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
27799a2dd95SBruce Richardson
27899a2dd95SBruce Richardson /** Initialize CRC32 data */
27999a2dd95SBruce Richardson k1 = 0xccaa009eLLU;
28099a2dd95SBruce Richardson k2 = 0x1751997d0LLU;
28199a2dd95SBruce Richardson k5 = 0xccaa009eLLU;
28299a2dd95SBruce Richardson k6 = 0x163cd6124LLU;
28399a2dd95SBruce Richardson q = 0x1f7011640LLU;
28499a2dd95SBruce Richardson p = 0x1db710641LLU;
28599a2dd95SBruce Richardson
28699a2dd95SBruce Richardson /** Save the params in context structure */
287*df2c51a9STyler Retzlaff crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
288*df2c51a9STyler Retzlaff crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
289*df2c51a9STyler Retzlaff crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
29099a2dd95SBruce Richardson }
29199a2dd95SBruce Richardson
29299a2dd95SBruce Richardson uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t * data,uint32_t data_len)29399a2dd95SBruce Richardson rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
29499a2dd95SBruce Richardson {
29599a2dd95SBruce Richardson /** return 16-bit CRC value */
29699a2dd95SBruce Richardson return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
29799a2dd95SBruce Richardson data_len,
29899a2dd95SBruce Richardson 0xffff,
29999a2dd95SBruce Richardson &crc16_ccitt_pclmulqdq);
30099a2dd95SBruce Richardson }
30199a2dd95SBruce Richardson
30299a2dd95SBruce Richardson uint32_t
rte_crc32_eth_sse42_handler(const uint8_t * data,uint32_t data_len)30399a2dd95SBruce Richardson rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
30499a2dd95SBruce Richardson {
30599a2dd95SBruce Richardson return ~crc32_eth_calc_pclmulqdq(data,
30699a2dd95SBruce Richardson data_len,
30799a2dd95SBruce Richardson 0xffffffffUL,
30899a2dd95SBruce Richardson &crc32_eth_pclmulqdq);
30999a2dd95SBruce Richardson }
310