/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Cavium, Inc
 */
499a2dd95SBruce Richardson
5*e9fd1ebfSTyler Retzlaff #include <stdalign.h>
699a2dd95SBruce Richardson #include <string.h>
799a2dd95SBruce Richardson
899a2dd95SBruce Richardson #include <rte_common.h>
999a2dd95SBruce Richardson #include <rte_branch_prediction.h>
1099a2dd95SBruce Richardson #include <rte_net_crc.h>
1199a2dd95SBruce Richardson #include <rte_vect.h>
1299a2dd95SBruce Richardson #include <rte_cpuflags.h>
1399a2dd95SBruce Richardson
1499a2dd95SBruce Richardson #include "net_crc.h"
1599a2dd95SBruce Richardson
1699a2dd95SBruce Richardson /** PMULL CRC computation context structure */
1799a2dd95SBruce Richardson struct crc_pmull_ctx {
1899a2dd95SBruce Richardson uint64x2_t rk1_rk2;
1999a2dd95SBruce Richardson uint64x2_t rk5_rk6;
2099a2dd95SBruce Richardson uint64x2_t rk7_rk8;
2199a2dd95SBruce Richardson };
2299a2dd95SBruce Richardson
23*e9fd1ebfSTyler Retzlaff alignas(16) struct crc_pmull_ctx crc32_eth_pmull;
24*e9fd1ebfSTyler Retzlaff alignas(16) struct crc_pmull_ctx crc16_ccitt_pmull;
2599a2dd95SBruce Richardson
2699a2dd95SBruce Richardson /**
2799a2dd95SBruce Richardson * @brief Performs one folding round
2899a2dd95SBruce Richardson *
2999a2dd95SBruce Richardson * Logically function operates as follows:
3099a2dd95SBruce Richardson * DATA = READ_NEXT_16BYTES();
3199a2dd95SBruce Richardson * F1 = LSB8(FOLD)
3299a2dd95SBruce Richardson * F2 = MSB8(FOLD)
3399a2dd95SBruce Richardson * T1 = CLMUL(F1, RK1)
3499a2dd95SBruce Richardson * T2 = CLMUL(F2, RK2)
3599a2dd95SBruce Richardson * FOLD = XOR(T1, T2, DATA)
3699a2dd95SBruce Richardson *
3799a2dd95SBruce Richardson * @param data_block 16 byte data block
3899a2dd95SBruce Richardson * @param precomp precomputed rk1 constant
3999a2dd95SBruce Richardson * @param fold running 16 byte folded data
4099a2dd95SBruce Richardson *
4199a2dd95SBruce Richardson * @return New 16 byte folded data
4299a2dd95SBruce Richardson */
4399a2dd95SBruce Richardson static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block,uint64x2_t precomp,uint64x2_t fold)4499a2dd95SBruce Richardson crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
4599a2dd95SBruce Richardson uint64x2_t fold)
4699a2dd95SBruce Richardson {
4799a2dd95SBruce Richardson uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
4899a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
4999a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
5099a2dd95SBruce Richardson
5199a2dd95SBruce Richardson uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
5299a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
5399a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
5499a2dd95SBruce Richardson
5599a2dd95SBruce Richardson return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
5699a2dd95SBruce Richardson }
5799a2dd95SBruce Richardson
5899a2dd95SBruce Richardson /**
5999a2dd95SBruce Richardson * Performs reduction from 128 bits to 64 bits
6099a2dd95SBruce Richardson *
6199a2dd95SBruce Richardson * @param data128 128 bits data to be reduced
6299a2dd95SBruce Richardson * @param precomp rk5 and rk6 precomputed constants
6399a2dd95SBruce Richardson *
6499a2dd95SBruce Richardson * @return data reduced to 64 bits
6599a2dd95SBruce Richardson */
6699a2dd95SBruce Richardson static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,uint64x2_t precomp)6799a2dd95SBruce Richardson crcr32_reduce_128_to_64(uint64x2_t data128,
6899a2dd95SBruce Richardson uint64x2_t precomp)
6999a2dd95SBruce Richardson {
7099a2dd95SBruce Richardson uint64x2_t tmp0, tmp1, tmp2;
7199a2dd95SBruce Richardson
7299a2dd95SBruce Richardson /* 64b fold */
7399a2dd95SBruce Richardson tmp0 = vreinterpretq_u64_p128(vmull_p64(
7499a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
7599a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
7699a2dd95SBruce Richardson tmp1 = vshift_bytes_right(data128, 8);
7799a2dd95SBruce Richardson tmp0 = veorq_u64(tmp0, tmp1);
7899a2dd95SBruce Richardson
7999a2dd95SBruce Richardson /* 32b fold */
8099a2dd95SBruce Richardson tmp2 = vshift_bytes_left(tmp0, 4);
8199a2dd95SBruce Richardson tmp1 = vreinterpretq_u64_p128(vmull_p64(
8299a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
8399a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
8499a2dd95SBruce Richardson
8599a2dd95SBruce Richardson return veorq_u64(tmp1, tmp0);
8699a2dd95SBruce Richardson }
8799a2dd95SBruce Richardson
8899a2dd95SBruce Richardson /**
8999a2dd95SBruce Richardson * Performs Barret's reduction from 64 bits to 32 bits
9099a2dd95SBruce Richardson *
9199a2dd95SBruce Richardson * @param data64 64 bits data to be reduced
9299a2dd95SBruce Richardson * @param precomp rk7 precomputed constant
9399a2dd95SBruce Richardson *
9499a2dd95SBruce Richardson * @return data reduced to 32 bits
9599a2dd95SBruce Richardson */
9699a2dd95SBruce Richardson static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,uint64x2_t precomp)9799a2dd95SBruce Richardson crcr32_reduce_64_to_32(uint64x2_t data64,
9899a2dd95SBruce Richardson uint64x2_t precomp)
9999a2dd95SBruce Richardson {
100*e9fd1ebfSTyler Retzlaff static alignas(16) uint32_t mask1[4] = {
10199a2dd95SBruce Richardson 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
10299a2dd95SBruce Richardson };
103*e9fd1ebfSTyler Retzlaff static alignas(16) uint32_t mask2[4] = {
10499a2dd95SBruce Richardson 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
10599a2dd95SBruce Richardson };
10699a2dd95SBruce Richardson uint64x2_t tmp0, tmp1, tmp2;
10799a2dd95SBruce Richardson
10899a2dd95SBruce Richardson tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
10999a2dd95SBruce Richardson
11099a2dd95SBruce Richardson tmp1 = vreinterpretq_u64_p128(vmull_p64(
11199a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
11299a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
11399a2dd95SBruce Richardson tmp1 = veorq_u64(tmp1, tmp0);
11499a2dd95SBruce Richardson tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
11599a2dd95SBruce Richardson
11699a2dd95SBruce Richardson tmp2 = vreinterpretq_u64_p128(vmull_p64(
11799a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
11899a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
11999a2dd95SBruce Richardson tmp2 = veorq_u64(tmp2, tmp1);
12099a2dd95SBruce Richardson tmp2 = veorq_u64(tmp2, tmp0);
12199a2dd95SBruce Richardson
12299a2dd95SBruce Richardson return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
12399a2dd95SBruce Richardson }
12499a2dd95SBruce Richardson
12599a2dd95SBruce Richardson static inline uint32_t
crc32_eth_calc_pmull(const uint8_t * data,uint32_t data_len,uint32_t crc,const struct crc_pmull_ctx * params)12699a2dd95SBruce Richardson crc32_eth_calc_pmull(
12799a2dd95SBruce Richardson const uint8_t *data,
12899a2dd95SBruce Richardson uint32_t data_len,
12999a2dd95SBruce Richardson uint32_t crc,
13099a2dd95SBruce Richardson const struct crc_pmull_ctx *params)
13199a2dd95SBruce Richardson {
13299a2dd95SBruce Richardson uint64x2_t temp, fold, k;
13399a2dd95SBruce Richardson uint32_t n;
13499a2dd95SBruce Richardson
13599a2dd95SBruce Richardson /* Get CRC init value */
13699a2dd95SBruce Richardson temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
13799a2dd95SBruce Richardson
13899a2dd95SBruce Richardson /**
13999a2dd95SBruce Richardson * Folding all data into single 16 byte data block
14099a2dd95SBruce Richardson * Assumes: fold holds first 16 bytes of data
14199a2dd95SBruce Richardson */
14299a2dd95SBruce Richardson if (unlikely(data_len < 32)) {
14399a2dd95SBruce Richardson if (unlikely(data_len == 16)) {
14499a2dd95SBruce Richardson /* 16 bytes */
14599a2dd95SBruce Richardson fold = vld1q_u64((const uint64_t *)data);
14699a2dd95SBruce Richardson fold = veorq_u64(fold, temp);
14799a2dd95SBruce Richardson goto reduction_128_64;
14899a2dd95SBruce Richardson }
14999a2dd95SBruce Richardson
15099a2dd95SBruce Richardson if (unlikely(data_len < 16)) {
15199a2dd95SBruce Richardson /* 0 to 15 bytes */
152*e9fd1ebfSTyler Retzlaff alignas(16) uint8_t buffer[16];
15399a2dd95SBruce Richardson
15499a2dd95SBruce Richardson memset(buffer, 0, sizeof(buffer));
15599a2dd95SBruce Richardson memcpy(buffer, data, data_len);
15699a2dd95SBruce Richardson
15799a2dd95SBruce Richardson fold = vld1q_u64((uint64_t *)buffer);
15899a2dd95SBruce Richardson fold = veorq_u64(fold, temp);
15999a2dd95SBruce Richardson if (unlikely(data_len < 4)) {
16099a2dd95SBruce Richardson fold = vshift_bytes_left(fold, 8 - data_len);
16199a2dd95SBruce Richardson goto barret_reduction;
16299a2dd95SBruce Richardson }
16399a2dd95SBruce Richardson fold = vshift_bytes_left(fold, 16 - data_len);
16499a2dd95SBruce Richardson goto reduction_128_64;
16599a2dd95SBruce Richardson }
16699a2dd95SBruce Richardson /* 17 to 31 bytes */
16799a2dd95SBruce Richardson fold = vld1q_u64((const uint64_t *)data);
16899a2dd95SBruce Richardson fold = veorq_u64(fold, temp);
16999a2dd95SBruce Richardson n = 16;
17099a2dd95SBruce Richardson k = params->rk1_rk2;
17199a2dd95SBruce Richardson goto partial_bytes;
17299a2dd95SBruce Richardson }
17399a2dd95SBruce Richardson
17499a2dd95SBruce Richardson /** At least 32 bytes in the buffer */
17599a2dd95SBruce Richardson /** Apply CRC initial value */
17699a2dd95SBruce Richardson fold = vld1q_u64((const uint64_t *)data);
17799a2dd95SBruce Richardson fold = veorq_u64(fold, temp);
17899a2dd95SBruce Richardson
17999a2dd95SBruce Richardson /** Main folding loop - the last 16 bytes is processed separately */
18099a2dd95SBruce Richardson k = params->rk1_rk2;
18199a2dd95SBruce Richardson for (n = 16; (n + 16) <= data_len; n += 16) {
18299a2dd95SBruce Richardson temp = vld1q_u64((const uint64_t *)&data[n]);
18399a2dd95SBruce Richardson fold = crcr32_folding_round(temp, k, fold);
18499a2dd95SBruce Richardson }
18599a2dd95SBruce Richardson
18699a2dd95SBruce Richardson partial_bytes:
18799a2dd95SBruce Richardson if (likely(n < data_len)) {
18899a2dd95SBruce Richardson uint64x2_t last16, a, b, mask;
18999a2dd95SBruce Richardson uint32_t rem = data_len & 15;
19099a2dd95SBruce Richardson
19199a2dd95SBruce Richardson last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
19299a2dd95SBruce Richardson a = vshift_bytes_left(fold, 16 - rem);
19399a2dd95SBruce Richardson b = vshift_bytes_right(fold, rem);
19499a2dd95SBruce Richardson mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
19599a2dd95SBruce Richardson b = vorrq_u64(b, vandq_u64(mask, last16));
19699a2dd95SBruce Richardson
19799a2dd95SBruce Richardson /* k = rk1 & rk2 */
19899a2dd95SBruce Richardson temp = vreinterpretq_u64_p128(vmull_p64(
19999a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
20099a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
20199a2dd95SBruce Richardson fold = vreinterpretq_u64_p128(vmull_p64(
20299a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
20399a2dd95SBruce Richardson vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
20499a2dd95SBruce Richardson fold = veorq_u64(fold, temp);
20599a2dd95SBruce Richardson fold = veorq_u64(fold, b);
20699a2dd95SBruce Richardson }
20799a2dd95SBruce Richardson
20899a2dd95SBruce Richardson /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
20999a2dd95SBruce Richardson reduction_128_64:
21099a2dd95SBruce Richardson k = params->rk5_rk6;
21199a2dd95SBruce Richardson fold = crcr32_reduce_128_to_64(fold, k);
21299a2dd95SBruce Richardson
21399a2dd95SBruce Richardson barret_reduction:
21499a2dd95SBruce Richardson k = params->rk7_rk8;
21599a2dd95SBruce Richardson n = crcr32_reduce_64_to_32(fold, k);
21699a2dd95SBruce Richardson
21799a2dd95SBruce Richardson return n;
21899a2dd95SBruce Richardson }
21999a2dd95SBruce Richardson
22099a2dd95SBruce Richardson void
rte_net_crc_neon_init(void)22199a2dd95SBruce Richardson rte_net_crc_neon_init(void)
22299a2dd95SBruce Richardson {
22399a2dd95SBruce Richardson /* Initialize CRC16 data */
22499a2dd95SBruce Richardson uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
22599a2dd95SBruce Richardson uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
22699a2dd95SBruce Richardson uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
22799a2dd95SBruce Richardson
22899a2dd95SBruce Richardson /* Initialize CRC32 data */
22999a2dd95SBruce Richardson uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
23099a2dd95SBruce Richardson uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
23199a2dd95SBruce Richardson uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
23299a2dd95SBruce Richardson
23399a2dd95SBruce Richardson /** Save the params in context structure */
23499a2dd95SBruce Richardson crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
23599a2dd95SBruce Richardson crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
23699a2dd95SBruce Richardson crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
23799a2dd95SBruce Richardson
23899a2dd95SBruce Richardson /** Save the params in context structure */
23999a2dd95SBruce Richardson crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
24099a2dd95SBruce Richardson crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
24199a2dd95SBruce Richardson crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
24299a2dd95SBruce Richardson }
24399a2dd95SBruce Richardson
24499a2dd95SBruce Richardson uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t * data,uint32_t data_len)24599a2dd95SBruce Richardson rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
24699a2dd95SBruce Richardson {
24799a2dd95SBruce Richardson return (uint16_t)~crc32_eth_calc_pmull(data,
24899a2dd95SBruce Richardson data_len,
24999a2dd95SBruce Richardson 0xffff,
25099a2dd95SBruce Richardson &crc16_ccitt_pmull);
25199a2dd95SBruce Richardson }
25299a2dd95SBruce Richardson
25399a2dd95SBruce Richardson uint32_t
rte_crc32_eth_neon_handler(const uint8_t * data,uint32_t data_len)25499a2dd95SBruce Richardson rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
25599a2dd95SBruce Richardson {
25699a2dd95SBruce Richardson return ~crc32_eth_calc_pmull(data,
25799a2dd95SBruce Richardson data_len,
25899a2dd95SBruce Richardson 0xffffffffUL,
25999a2dd95SBruce Richardson &crc32_eth_pmull);
26099a2dd95SBruce Richardson }
261