/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Cavium, Inc
 */

#include <stdalign.h>
#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_net_crc.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
	uint64x2_t rk1_rk2; /**< folding constants for the main loop */
	uint64x2_t rk5_rk6; /**< constants for the 128- to 64-bit reduction */
	uint64x2_t rk7_rk8; /**< constants for Barrett's 64- to 32-bit reduction */
};

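/* Both contexts below are filled in by rte_net_crc_neon_init() before first use. */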
alignas(16) struct crc_pmull_ctx crc32_eth_pmull;
alignas(16) struct crc_pmull_ctx crc16_ccitt_pmull;

/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 and rk2 constants
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
	uint64x2_t fold)
{
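	/*
	 * Lane 1 (the upper 64 bits) of the running fold is multiplied
	 * by rk1 and lane 0 (the lower 64 bits) by rk2; both products
	 * are then XORed with the incoming data block.
	 */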
	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));

	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128-bit data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,
	uint64x2_t precomp)
{
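	/*
	 * Two folding steps: the lower 64 bits are folded with rk5,
	 * then the intermediate result is folded once more with rk6,
	 * leaving a 64-bit remainder.
	 */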
	uint64x2_t tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = vshift_bytes_right(data128, 8);
	tmp0 = veorq_u64(tmp0, tmp1);

	/* 32b fold */
	tmp2 = vshift_bytes_left(tmp0, 4);
	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64 64-bit data to be reduced
 * @param precomp rk7 and rk8 precomputed constants
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,
	uint64x2_t precomp)
{
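	/*
	 * mask1 keeps the lower 64 bits of a vector; mask2 clears its
	 * lowest 32 bits.
	 */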
	static alignas(16) uint32_t mask1[4] = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static alignas(16) uint32_t mask2[4] = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	uint64x2_t tmp0, tmp1, tmp2;

	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));

	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = veorq_u64(tmp1, tmp0);
	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));

	tmp2 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
	tmp2 = veorq_u64(tmp2, tmp1);
	tmp2 = veorq_u64(tmp2, tmp0);

	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
}

static inline uint32_t
crc32_eth_calc_pmull(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pmull_ctx *params)
{
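	/*
	 * Fold-by-16: the whole buffer is folded into a single 128-bit
	 * value, which is then reduced to 64 bits and finally, via
	 * Barrett's reduction, to the 32-bit CRC.
	 */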
	uint64x2_t temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));

	/**
	 * Folding all data into a single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = vld1q_u64((const uint64_t *)data);
			fold = veorq_u64(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			alignas(16) uint8_t buffer[16];

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = vld1q_u64((uint64_t *)buffer);
			fold = veorq_u64(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = vshift_bytes_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = vshift_bytes_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = vld1q_u64((const uint64_t *)data);
		fold = veorq_u64(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = vld1q_u64((const uint64_t *)data);
	fold = veorq_u64(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = vld1q_u64((const uint64_t *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		uint64x2_t last16, a, b, mask;
		uint32_t rem = data_len & 15;

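		/*
		 * Merge the running fold with the final 16 bytes of the
		 * buffer, accounting for the rem-byte overlap between the
		 * last full block and the tail of the data.
		 */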
		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
		a = vshift_bytes_left(fold, 16 - rem);
		b = vshift_bytes_right(fold, rem);
		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
		b = vorrq_u64(b, vandq_u64(mask, last16));

		/* k holds rk1 and rk2 */
		temp = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
		fold = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
		fold = veorq_u64(fold, temp);
		fold = veorq_u64(fold, b);
	}

	/** Reduction 128 -> 32. Assumes: fold holds 128-bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_neon_init(void)
{
	/* Initialize CRC16 data */
	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};

	/* Initialize CRC32 data */
	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};

	/** Save the CRC16 params in the context structure */
	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);

	/** Save the CRC32 params in the context structure */
	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}

uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return (uint16_t)~crc32_eth_calc_pmull(data,
		data_len,
		0xffff,
		&crc16_ccitt_pmull);
}

uint32_t
rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pmull(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pmull);
}
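
/*
 * Usage sketch (a minimal example, assuming PMULL support has already
 * been verified, e.g. via rte_cpu_get_flag_enabled()):
 *
 *	rte_net_crc_neon_init();
 *	uint32_t crc = rte_crc32_eth_neon_handler(buf, buf_len);
 *
 * In practice these handlers are invoked through the generic rte_net_crc
 * API rather than called directly.
 */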