xref: /dpdk/lib/acl/acl_run_avx2.h (revision 4a6672c2d301c105189ae74de73260af204c5ee8)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #include "acl_run_sse.h"
699a2dd95SBruce Richardson 
799a2dd95SBruce Richardson static const rte_ymm_t ymm_match_mask = {
899a2dd95SBruce Richardson 	.u32 = {
999a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1099a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1199a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1299a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1399a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1499a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1599a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1699a2dd95SBruce Richardson 		RTE_ACL_NODE_MATCH,
1799a2dd95SBruce Richardson 	},
1899a2dd95SBruce Richardson };
1999a2dd95SBruce Richardson 
2099a2dd95SBruce Richardson static const rte_ymm_t ymm_index_mask = {
2199a2dd95SBruce Richardson 	.u32 = {
2299a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2399a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2499a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2599a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2699a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2799a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2899a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
2999a2dd95SBruce Richardson 		RTE_ACL_NODE_INDEX,
3099a2dd95SBruce Richardson 	},
3199a2dd95SBruce Richardson };
3299a2dd95SBruce Richardson 
3399a2dd95SBruce Richardson static const rte_ymm_t ymm_shuffle_input = {
3499a2dd95SBruce Richardson 	.u32 = {
3599a2dd95SBruce Richardson 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
3699a2dd95SBruce Richardson 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
3799a2dd95SBruce Richardson 	},
3899a2dd95SBruce Richardson };
3999a2dd95SBruce Richardson 
4099a2dd95SBruce Richardson static const rte_ymm_t ymm_ones_16 = {
4199a2dd95SBruce Richardson 	.u16 = {
4299a2dd95SBruce Richardson 		1, 1, 1, 1, 1, 1, 1, 1,
4399a2dd95SBruce Richardson 		1, 1, 1, 1, 1, 1, 1, 1,
4499a2dd95SBruce Richardson 	},
4599a2dd95SBruce Richardson };
4699a2dd95SBruce Richardson 
4799a2dd95SBruce Richardson static const rte_ymm_t ymm_range_base = {
4899a2dd95SBruce Richardson 	.u32 = {
4999a2dd95SBruce Richardson 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
5099a2dd95SBruce Richardson 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
5199a2dd95SBruce Richardson 	},
5299a2dd95SBruce Richardson };
5399a2dd95SBruce Richardson 
5499a2dd95SBruce Richardson /*
5599a2dd95SBruce Richardson  * Process 8 transitions in parallel.
5699a2dd95SBruce Richardson  * tr_lo contains low 32 bits for 8 transition.
5799a2dd95SBruce Richardson  * tr_hi contains high 32 bits for 8 transition.
5899a2dd95SBruce Richardson  * next_input contains up to 4 input bytes for 8 flows.
5999a2dd95SBruce Richardson  */
6099a2dd95SBruce Richardson static __rte_always_inline ymm_t
transition8(ymm_t next_input,const uint64_t * trans,ymm_t * tr_lo,ymm_t * tr_hi)6199a2dd95SBruce Richardson transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
6299a2dd95SBruce Richardson {
6399a2dd95SBruce Richardson 	const int32_t *tr;
6499a2dd95SBruce Richardson 	ymm_t addr;
6599a2dd95SBruce Richardson 
6699a2dd95SBruce Richardson 	tr = (const int32_t *)(uintptr_t)trans;
6799a2dd95SBruce Richardson 
6899a2dd95SBruce Richardson 	/* Calculate the address (array index) for all 8 transitions. */
6999a2dd95SBruce Richardson 	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
7099a2dd95SBruce Richardson 		ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
7199a2dd95SBruce Richardson 		*tr_lo, *tr_hi);
7299a2dd95SBruce Richardson 
7399a2dd95SBruce Richardson 	/* load lower 32 bits of 8 transactions at once. */
7499a2dd95SBruce Richardson 	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
7599a2dd95SBruce Richardson 
7699a2dd95SBruce Richardson 	next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
7799a2dd95SBruce Richardson 
7899a2dd95SBruce Richardson 	/* load high 32 bits of 8 transactions at once. */
7999a2dd95SBruce Richardson 	*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
8099a2dd95SBruce Richardson 
8199a2dd95SBruce Richardson 	return next_input;
8299a2dd95SBruce Richardson }
8399a2dd95SBruce Richardson 
8499a2dd95SBruce Richardson /*
8599a2dd95SBruce Richardson  * Process matches for  8 flows.
8699a2dd95SBruce Richardson  * tr_lo contains low 32 bits for 8 transition.
8799a2dd95SBruce Richardson  * tr_hi contains high 32 bits for 8 transition.
8899a2dd95SBruce Richardson  */
8999a2dd95SBruce Richardson static inline void
acl_process_matches_avx2x8(const struct rte_acl_ctx * ctx,struct parms * parms,struct acl_flow_data * flows,uint32_t slot,ymm_t matches,ymm_t * tr_lo,ymm_t * tr_hi)9099a2dd95SBruce Richardson acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
9199a2dd95SBruce Richardson 	struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
9299a2dd95SBruce Richardson 	ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
9399a2dd95SBruce Richardson {
9499a2dd95SBruce Richardson 	ymm_t t0, t1;
9599a2dd95SBruce Richardson 	ymm_t lo, hi;
9699a2dd95SBruce Richardson 	xmm_t l0, l1;
9799a2dd95SBruce Richardson 	uint32_t i;
9899a2dd95SBruce Richardson 	uint64_t tr[MAX_SEARCHES_SSE8];
9999a2dd95SBruce Richardson 
10099a2dd95SBruce Richardson 	l1 = _mm256_extracti128_si256(*tr_lo, 1);
10199a2dd95SBruce Richardson 	l0 = _mm256_castsi256_si128(*tr_lo);
10299a2dd95SBruce Richardson 
10399a2dd95SBruce Richardson 	for (i = 0; i != RTE_DIM(tr) / 2; i++) {
10499a2dd95SBruce Richardson 
10599a2dd95SBruce Richardson 		/*
10699a2dd95SBruce Richardson 		 * Extract low 32bits of each transition.
10799a2dd95SBruce Richardson 		 * That's enough to process the match.
10899a2dd95SBruce Richardson 		 */
10999a2dd95SBruce Richardson 		tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
11099a2dd95SBruce Richardson 		tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
11199a2dd95SBruce Richardson 
11299a2dd95SBruce Richardson 		l0 = _mm_srli_si128(l0, sizeof(uint32_t));
11399a2dd95SBruce Richardson 		l1 = _mm_srli_si128(l1, sizeof(uint32_t));
11499a2dd95SBruce Richardson 
11599a2dd95SBruce Richardson 		tr[i] = acl_match_check(tr[i], slot + i,
11699a2dd95SBruce Richardson 			ctx, parms, flows, resolve_priority_sse);
11799a2dd95SBruce Richardson 		tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
11899a2dd95SBruce Richardson 			ctx, parms, flows, resolve_priority_sse);
11999a2dd95SBruce Richardson 	}
12099a2dd95SBruce Richardson 
12199a2dd95SBruce Richardson 	/* Collect new transitions into 2 YMM registers. */
12299a2dd95SBruce Richardson 	t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
12399a2dd95SBruce Richardson 	t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
12499a2dd95SBruce Richardson 
12599a2dd95SBruce Richardson 	/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
12699a2dd95SBruce Richardson 	ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
12799a2dd95SBruce Richardson 
128*4a6672c2SStephen Hemminger 	/* Keep transitions with NOMATCH intact. */
12999a2dd95SBruce Richardson 	*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
13099a2dd95SBruce Richardson 	*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
13199a2dd95SBruce Richardson }
13299a2dd95SBruce Richardson 
13399a2dd95SBruce Richardson static inline void
acl_match_check_avx2x8(const struct rte_acl_ctx * ctx,struct parms * parms,struct acl_flow_data * flows,uint32_t slot,ymm_t * tr_lo,ymm_t * tr_hi,ymm_t match_mask)13499a2dd95SBruce Richardson acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
13599a2dd95SBruce Richardson 	struct acl_flow_data *flows, uint32_t slot,
13699a2dd95SBruce Richardson 	ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
13799a2dd95SBruce Richardson {
13899a2dd95SBruce Richardson 	uint32_t msk;
13999a2dd95SBruce Richardson 	ymm_t matches, temp;
14099a2dd95SBruce Richardson 
14199a2dd95SBruce Richardson 	/* test for match node */
14299a2dd95SBruce Richardson 	temp = _mm256_and_si256(match_mask, *tr_lo);
14399a2dd95SBruce Richardson 	matches = _mm256_cmpeq_epi32(temp, match_mask);
14499a2dd95SBruce Richardson 	msk = _mm256_movemask_epi8(matches);
14599a2dd95SBruce Richardson 
14699a2dd95SBruce Richardson 	while (msk != 0) {
14799a2dd95SBruce Richardson 
14899a2dd95SBruce Richardson 		acl_process_matches_avx2x8(ctx, parms, flows, slot,
14999a2dd95SBruce Richardson 			matches, tr_lo, tr_hi);
15099a2dd95SBruce Richardson 		temp = _mm256_and_si256(match_mask, *tr_lo);
15199a2dd95SBruce Richardson 		matches = _mm256_cmpeq_epi32(temp, match_mask);
15299a2dd95SBruce Richardson 		msk = _mm256_movemask_epi8(matches);
15399a2dd95SBruce Richardson 	}
15499a2dd95SBruce Richardson }
15599a2dd95SBruce Richardson 
15699a2dd95SBruce Richardson /*
15799a2dd95SBruce Richardson  * Execute trie traversal for up to 16 flows in parallel.
15899a2dd95SBruce Richardson  */
15999a2dd95SBruce Richardson static inline int
search_avx2x16(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t total_packets,uint32_t categories)16099a2dd95SBruce Richardson search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
16199a2dd95SBruce Richardson 	uint32_t *results, uint32_t total_packets, uint32_t categories)
16299a2dd95SBruce Richardson {
16399a2dd95SBruce Richardson 	uint32_t n;
16499a2dd95SBruce Richardson 	struct acl_flow_data flows;
16599a2dd95SBruce Richardson 	uint64_t index_array[MAX_SEARCHES_AVX16];
16699a2dd95SBruce Richardson 	struct completion cmplt[MAX_SEARCHES_AVX16];
16799a2dd95SBruce Richardson 	struct parms parms[MAX_SEARCHES_AVX16];
16899a2dd95SBruce Richardson 	ymm_t input[2], tr_lo[2], tr_hi[2];
16999a2dd95SBruce Richardson 	ymm_t t0, t1;
17099a2dd95SBruce Richardson 
17199a2dd95SBruce Richardson 	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
17299a2dd95SBruce Richardson 		total_packets, categories, ctx->trans_table);
17399a2dd95SBruce Richardson 
17499a2dd95SBruce Richardson 	for (n = 0; n < RTE_DIM(cmplt); n++) {
17599a2dd95SBruce Richardson 		cmplt[n].count = 0;
17699a2dd95SBruce Richardson 		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
17799a2dd95SBruce Richardson 	}
17899a2dd95SBruce Richardson 
17999a2dd95SBruce Richardson 	t0 = _mm256_set_epi64x(index_array[5], index_array[4],
18099a2dd95SBruce Richardson 		index_array[1], index_array[0]);
18199a2dd95SBruce Richardson 	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
18299a2dd95SBruce Richardson 		index_array[3], index_array[2]);
18399a2dd95SBruce Richardson 
18499a2dd95SBruce Richardson 	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
18599a2dd95SBruce Richardson 
18699a2dd95SBruce Richardson 	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
18799a2dd95SBruce Richardson 		index_array[9], index_array[8]);
18899a2dd95SBruce Richardson 	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
18999a2dd95SBruce Richardson 		index_array[11], index_array[10]);
19099a2dd95SBruce Richardson 
19199a2dd95SBruce Richardson 	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
19299a2dd95SBruce Richardson 
19399a2dd95SBruce Richardson 	 /* Check for any matches. */
19499a2dd95SBruce Richardson 	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
19599a2dd95SBruce Richardson 		ymm_match_mask.y);
19699a2dd95SBruce Richardson 	acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
19799a2dd95SBruce Richardson 		ymm_match_mask.y);
19899a2dd95SBruce Richardson 
19999a2dd95SBruce Richardson 	while (flows.started > 0) {
20099a2dd95SBruce Richardson 
20199a2dd95SBruce Richardson 		uint32_t in[MAX_SEARCHES_SSE8];
20299a2dd95SBruce Richardson 
20399a2dd95SBruce Richardson 		/* Gather 4 bytes of input data for first 8 flows. */
20499a2dd95SBruce Richardson 		in[0] = GET_NEXT_4BYTES(parms, 0);
20599a2dd95SBruce Richardson 		in[4] = GET_NEXT_4BYTES(parms, 4);
20699a2dd95SBruce Richardson 		in[1] = GET_NEXT_4BYTES(parms, 1);
20799a2dd95SBruce Richardson 		in[5] = GET_NEXT_4BYTES(parms, 5);
20899a2dd95SBruce Richardson 		in[2] = GET_NEXT_4BYTES(parms, 2);
20999a2dd95SBruce Richardson 		in[6] = GET_NEXT_4BYTES(parms, 6);
21099a2dd95SBruce Richardson 		in[3] = GET_NEXT_4BYTES(parms, 3);
21199a2dd95SBruce Richardson 		in[7] = GET_NEXT_4BYTES(parms, 7);
21299a2dd95SBruce Richardson 		input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
21399a2dd95SBruce Richardson 			in[3], in[2], in[1], in[0]);
21499a2dd95SBruce Richardson 
21599a2dd95SBruce Richardson 		/* Gather 4 bytes of input data for last 8 flows. */
21699a2dd95SBruce Richardson 		in[0] = GET_NEXT_4BYTES(parms, 8);
21799a2dd95SBruce Richardson 		in[4] = GET_NEXT_4BYTES(parms, 12);
21899a2dd95SBruce Richardson 		in[1] = GET_NEXT_4BYTES(parms, 9);
21999a2dd95SBruce Richardson 		in[5] = GET_NEXT_4BYTES(parms, 13);
22099a2dd95SBruce Richardson 		in[2] = GET_NEXT_4BYTES(parms, 10);
22199a2dd95SBruce Richardson 		in[6] = GET_NEXT_4BYTES(parms, 14);
22299a2dd95SBruce Richardson 		in[3] = GET_NEXT_4BYTES(parms, 11);
22399a2dd95SBruce Richardson 		in[7] = GET_NEXT_4BYTES(parms, 15);
22499a2dd95SBruce Richardson 		input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
22599a2dd95SBruce Richardson 			in[3], in[2], in[1], in[0]);
22699a2dd95SBruce Richardson 
22799a2dd95SBruce Richardson 		input[0] = transition8(input[0], flows.trans,
22899a2dd95SBruce Richardson 			&tr_lo[0], &tr_hi[0]);
22999a2dd95SBruce Richardson 		input[1] = transition8(input[1], flows.trans,
23099a2dd95SBruce Richardson 			&tr_lo[1], &tr_hi[1]);
23199a2dd95SBruce Richardson 
23299a2dd95SBruce Richardson 		input[0] = transition8(input[0], flows.trans,
23399a2dd95SBruce Richardson 			&tr_lo[0], &tr_hi[0]);
23499a2dd95SBruce Richardson 		input[1] = transition8(input[1], flows.trans,
23599a2dd95SBruce Richardson 			&tr_lo[1], &tr_hi[1]);
23699a2dd95SBruce Richardson 
23799a2dd95SBruce Richardson 		input[0] = transition8(input[0], flows.trans,
23899a2dd95SBruce Richardson 			&tr_lo[0], &tr_hi[0]);
23999a2dd95SBruce Richardson 		input[1] = transition8(input[1], flows.trans,
24099a2dd95SBruce Richardson 			&tr_lo[1], &tr_hi[1]);
24199a2dd95SBruce Richardson 
24299a2dd95SBruce Richardson 		input[0] = transition8(input[0], flows.trans,
24399a2dd95SBruce Richardson 			&tr_lo[0], &tr_hi[0]);
24499a2dd95SBruce Richardson 		input[1] = transition8(input[1], flows.trans,
24599a2dd95SBruce Richardson 			&tr_lo[1], &tr_hi[1]);
24699a2dd95SBruce Richardson 
24799a2dd95SBruce Richardson 		 /* Check for any matches. */
24899a2dd95SBruce Richardson 		acl_match_check_avx2x8(ctx, parms, &flows, 0,
24999a2dd95SBruce Richardson 			&tr_lo[0], &tr_hi[0], ymm_match_mask.y);
25099a2dd95SBruce Richardson 		acl_match_check_avx2x8(ctx, parms, &flows, 8,
25199a2dd95SBruce Richardson 			&tr_lo[1], &tr_hi[1], ymm_match_mask.y);
25299a2dd95SBruce Richardson 	}
25399a2dd95SBruce Richardson 
25499a2dd95SBruce Richardson 	return 0;
25599a2dd95SBruce Richardson }
256