xref: /dpdk/lib/acl/acl_run_avx512_common.h (revision 3d4e27fd7ff050d565c7450930c92fb945706518)
199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson  * Copyright(c) 2020 Intel Corporation
399a2dd95SBruce Richardson  */
499a2dd95SBruce Richardson 
/*
 * WARNING: It is not recommended to include this file directly.
 * Please include "acl_run_avx512x*.h" instead.
 * For this file to generate proper code, the includer has to
 * define several macros; refer to "acl_run_avx512x*.h" for more details.
 */
1199a2dd95SBruce Richardson 
1299a2dd95SBruce Richardson /*
1399a2dd95SBruce Richardson  * Calculate the address of the next transition for
1499a2dd95SBruce Richardson  * all types of nodes. Note that only DFA nodes and range
1599a2dd95SBruce Richardson  * nodes actually transition to another node. Match
1699a2dd95SBruce Richardson  * nodes not supposed to be encountered here.
1799a2dd95SBruce Richardson  * For quad range nodes:
1899a2dd95SBruce Richardson  * Calculate number of range boundaries that are less than the
1999a2dd95SBruce Richardson  * input value. Range boundaries for each node are in signed 8 bit,
2099a2dd95SBruce Richardson  * ordered from -128 to 127.
2199a2dd95SBruce Richardson  * This is effectively a popcnt of bytes that are greater than the
2299a2dd95SBruce Richardson  * input byte.
2399a2dd95SBruce Richardson  * Single nodes are processed in the same ways as quad range nodes.
2499a2dd95SBruce Richardson  */
2599a2dd95SBruce Richardson static __rte_always_inline _T_simd
_F_(calc_addr)2699a2dd95SBruce Richardson _F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
2799a2dd95SBruce Richardson 	_T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
2899a2dd95SBruce Richardson {
2999a2dd95SBruce Richardson 	__mmask64 qm;
3099a2dd95SBruce Richardson 	_T_mask dfa_msk;
3199a2dd95SBruce Richardson 	_T_simd addr, in, node_type, r, t;
3299a2dd95SBruce Richardson 	_T_simd dfa_ofs, quad_ofs;
3399a2dd95SBruce Richardson 
3499a2dd95SBruce Richardson 	t = _M_SI_(xor)(index_mask, index_mask);
3599a2dd95SBruce Richardson 	in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
3699a2dd95SBruce Richardson 
3799a2dd95SBruce Richardson 	/* Calc node type and node addr */
3899a2dd95SBruce Richardson 	node_type = _M_SI_(andnot)(index_mask, tr_lo);
3999a2dd95SBruce Richardson 	addr = _M_SI_(and)(index_mask, tr_lo);
4099a2dd95SBruce Richardson 
4199a2dd95SBruce Richardson 	/* mask for DFA type(0) nodes */
4299a2dd95SBruce Richardson 	dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
4399a2dd95SBruce Richardson 
4499a2dd95SBruce Richardson 	/* DFA calculations. */
4599a2dd95SBruce Richardson 	r = _M_I_(srli_epi32)(in, 30);
4699a2dd95SBruce Richardson 	r = _M_I_(add_epi8)(r, range_base);
4799a2dd95SBruce Richardson 	t = _M_I_(srli_epi32)(in, 24);
4899a2dd95SBruce Richardson 	r = _M_I_(shuffle_epi8)(tr_hi, r);
4999a2dd95SBruce Richardson 
5099a2dd95SBruce Richardson 	dfa_ofs = _M_I_(sub_epi32)(t, r);
5199a2dd95SBruce Richardson 
5299a2dd95SBruce Richardson 	/* QUAD/SINGLE calculations. */
5399a2dd95SBruce Richardson 	qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
5499a2dd95SBruce Richardson 	t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
5599a2dd95SBruce Richardson 	t = _M_I_(lzcnt_epi32)(t);
5699a2dd95SBruce Richardson 	t = _M_I_(srli_epi32)(t, 3);
5799a2dd95SBruce Richardson 	quad_ofs = _M_I_(sub_epi32)(four_32, t);
5899a2dd95SBruce Richardson 
5999a2dd95SBruce Richardson 	/* blend DFA and QUAD/SINGLE. */
6099a2dd95SBruce Richardson 	t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
6199a2dd95SBruce Richardson 
6299a2dd95SBruce Richardson 	/* calculate address for next transitions. */
6399a2dd95SBruce Richardson 	addr = _M_I_(add_epi32)(addr, t);
6499a2dd95SBruce Richardson 	return addr;
6599a2dd95SBruce Richardson }
6699a2dd95SBruce Richardson 
6799a2dd95SBruce Richardson /*
6899a2dd95SBruce Richardson  * Process _N_ transitions in parallel.
6999a2dd95SBruce Richardson  * tr_lo contains low 32 bits for _N_ transition.
7099a2dd95SBruce Richardson  * tr_hi contains high 32 bits for _N_ transition.
7199a2dd95SBruce Richardson  * next_input contains up to 4 input bytes for _N_ flows.
7299a2dd95SBruce Richardson  */
7399a2dd95SBruce Richardson static __rte_always_inline _T_simd
_F_(trans)7499a2dd95SBruce Richardson _F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
7599a2dd95SBruce Richardson 	_T_simd *tr_hi)
7699a2dd95SBruce Richardson {
7799a2dd95SBruce Richardson 	const int32_t *tr;
7899a2dd95SBruce Richardson 	_T_simd addr;
7999a2dd95SBruce Richardson 
8099a2dd95SBruce Richardson 	tr = (const int32_t *)(uintptr_t)trans;
8199a2dd95SBruce Richardson 
8299a2dd95SBruce Richardson 	/* Calculate the address (array index) for all _N_ transitions. */
8399a2dd95SBruce Richardson 	addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
8499a2dd95SBruce Richardson 		_SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);
8599a2dd95SBruce Richardson 
8699a2dd95SBruce Richardson 	/* load lower 32 bits of _N_ transactions at once. */
8799a2dd95SBruce Richardson 	*tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));
8899a2dd95SBruce Richardson 
8999a2dd95SBruce Richardson 	next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
9099a2dd95SBruce Richardson 
9199a2dd95SBruce Richardson 	/* load high 32 bits of _N_ transactions at once. */
9299a2dd95SBruce Richardson 	*tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));
9399a2dd95SBruce Richardson 
9499a2dd95SBruce Richardson 	return next_input;
9599a2dd95SBruce Richardson }
9699a2dd95SBruce Richardson 
9799a2dd95SBruce Richardson /*
9899a2dd95SBruce Richardson  * Execute first transition for up to _N_ flows in parallel.
9999a2dd95SBruce Richardson  * next_input should contain one input byte for up to _N_ flows.
10099a2dd95SBruce Richardson  * msk - mask of active flows.
10199a2dd95SBruce Richardson  * tr_lo contains low 32 bits for up to _N_ transitions.
10299a2dd95SBruce Richardson  * tr_hi contains high 32 bits for up to _N_ transitions.
10399a2dd95SBruce Richardson  */
10499a2dd95SBruce Richardson static __rte_always_inline void
_F_(first_trans)10599a2dd95SBruce Richardson _F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
10699a2dd95SBruce Richardson 	_T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
10799a2dd95SBruce Richardson {
10899a2dd95SBruce Richardson 	const int32_t *tr;
10999a2dd95SBruce Richardson 	_T_simd addr, root;
11099a2dd95SBruce Richardson 
11199a2dd95SBruce Richardson 	tr = (const int32_t *)(uintptr_t)flow->trans;
11299a2dd95SBruce Richardson 
11399a2dd95SBruce Richardson 	addr = _M_I_(set1_epi32)(UINT8_MAX);
11499a2dd95SBruce Richardson 	root = _M_I_(set1_epi32)(flow->root_index);
11599a2dd95SBruce Richardson 
11699a2dd95SBruce Richardson 	addr = _M_SI_(and)(next_input, addr);
11799a2dd95SBruce Richardson 	addr = _M_I_(add_epi32)(root, addr);
11899a2dd95SBruce Richardson 
11999a2dd95SBruce Richardson 	/* load lower 32 bits of _N_ transactions at once. */
12099a2dd95SBruce Richardson 	*tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
12199a2dd95SBruce Richardson 		sizeof(flow->trans[0]));
12299a2dd95SBruce Richardson 
12399a2dd95SBruce Richardson 	/* load high 32 bits of _N_ transactions at once. */
12499a2dd95SBruce Richardson 	*tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
12599a2dd95SBruce Richardson 		sizeof(flow->trans[0]));
12699a2dd95SBruce Richardson }
12799a2dd95SBruce Richardson 
12899a2dd95SBruce Richardson /*
12999a2dd95SBruce Richardson  * Load and return next 4 input bytes for up to _N_ flows in parallel.
13099a2dd95SBruce Richardson  * pdata - 8x2 pointers to flow input data
13199a2dd95SBruce Richardson  * mask - mask of active flows.
13299a2dd95SBruce Richardson  * di - data indexes for these _N_ flows.
13399a2dd95SBruce Richardson  */
13499a2dd95SBruce Richardson static inline _T_simd
_F_(get_next_bytes)13599a2dd95SBruce Richardson _F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
13699a2dd95SBruce Richardson 	uint32_t msk, _T_simd *di, uint32_t bnum)
13799a2dd95SBruce Richardson {
13899a2dd95SBruce Richardson 	const int32_t *div;
13999a2dd95SBruce Richardson 	uint32_t m[2];
14099a2dd95SBruce Richardson 	_T_simd one, zero, t, p[2];
14199a2dd95SBruce Richardson 
14299a2dd95SBruce Richardson 	div = (const int32_t *)flow->data_index;
14399a2dd95SBruce Richardson 
14499a2dd95SBruce Richardson 	one = _M_I_(set1_epi32)(1);
14599a2dd95SBruce Richardson 	zero = _M_SI_(xor)(one, one);
14699a2dd95SBruce Richardson 
14799a2dd95SBruce Richardson 	/* load data offsets for given indexes */
14899a2dd95SBruce Richardson 	t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));
14999a2dd95SBruce Richardson 
15099a2dd95SBruce Richardson 	/* increment data indexes */
15199a2dd95SBruce Richardson 	*di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
15299a2dd95SBruce Richardson 
15399a2dd95SBruce Richardson 	/*
15499a2dd95SBruce Richardson 	 * unsigned expand 32-bit indexes to 64-bit
15599a2dd95SBruce Richardson 	 * (for later pointer arithmetic), i.e:
15699a2dd95SBruce Richardson 	 * for (i = 0; i != _N_; i++)
15799a2dd95SBruce Richardson 	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
15899a2dd95SBruce Richardson 	 */
15999a2dd95SBruce Richardson 	p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
16099a2dd95SBruce Richardson 			t);
16199a2dd95SBruce Richardson 	p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
16299a2dd95SBruce Richardson 			t);
16399a2dd95SBruce Richardson 
16499a2dd95SBruce Richardson 	p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
16599a2dd95SBruce Richardson 	p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
16699a2dd95SBruce Richardson 
16799a2dd95SBruce Richardson 	/* load input byte(s), either one or four */
16899a2dd95SBruce Richardson 
16999a2dd95SBruce Richardson 	m[0] = msk & _SIMD_PTR_MSK_;
17099a2dd95SBruce Richardson 	m[1] = msk >> _SIMD_PTR_NUM_;
17199a2dd95SBruce Richardson 
17299a2dd95SBruce Richardson 	return _F_(gather_bytes)(zero, p, m, bnum);
17399a2dd95SBruce Richardson }
17499a2dd95SBruce Richardson 
17599a2dd95SBruce Richardson /*
17699a2dd95SBruce Richardson  * Start up to _N_ new flows.
17799a2dd95SBruce Richardson  * num - number of flows to start
17899a2dd95SBruce Richardson  * msk - mask of new flows.
17999a2dd95SBruce Richardson  * pdata - pointers to flow input data
18099a2dd95SBruce Richardson  * idx - match indexed for given flows
18199a2dd95SBruce Richardson  * di - data indexes for these flows.
18299a2dd95SBruce Richardson  */
18399a2dd95SBruce Richardson static inline void
_F_(start_flow)18499a2dd95SBruce Richardson _F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
18599a2dd95SBruce Richardson 	_T_simd pdata[2], _T_simd *idx, _T_simd *di)
18699a2dd95SBruce Richardson {
18799a2dd95SBruce Richardson 	uint32_t n, m[2], nm[2];
18899a2dd95SBruce Richardson 	_T_simd ni, nd[2];
18999a2dd95SBruce Richardson 
19099a2dd95SBruce Richardson 	/* split mask into two - one for each pdata[] */
19199a2dd95SBruce Richardson 	m[0] = msk & _SIMD_PTR_MSK_;
19299a2dd95SBruce Richardson 	m[1] = msk >> _SIMD_PTR_NUM_;
19399a2dd95SBruce Richardson 
19499a2dd95SBruce Richardson 	/* calculate masks for new flows */
195*3d4e27fdSDavid Marchand 	n = rte_popcount32(m[0]);
19699a2dd95SBruce Richardson 	nm[0] = (1 << n) - 1;
19799a2dd95SBruce Richardson 	nm[1] = (1 << (num - n)) - 1;
19899a2dd95SBruce Richardson 
19999a2dd95SBruce Richardson 	/* load input data pointers for new flows */
20099a2dd95SBruce Richardson 	nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
20199a2dd95SBruce Richardson 			flow->idata + flow->num_packets);
20299a2dd95SBruce Richardson 	nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
20399a2dd95SBruce Richardson 			flow->idata + flow->num_packets + n);
20499a2dd95SBruce Richardson 
20599a2dd95SBruce Richardson 	/* calculate match indexes of new flows */
20699a2dd95SBruce Richardson 	ni = _M_I_(set1_epi32)(flow->num_packets);
20799a2dd95SBruce Richardson 	ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
20899a2dd95SBruce Richardson 
20999a2dd95SBruce Richardson 	/* merge new and existing flows data */
21099a2dd95SBruce Richardson 	pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
21199a2dd95SBruce Richardson 	pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
21299a2dd95SBruce Richardson 
21399a2dd95SBruce Richardson 	/* update match and data indexes */
21499a2dd95SBruce Richardson 	*idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
21599a2dd95SBruce Richardson 	*di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
21699a2dd95SBruce Richardson 
21799a2dd95SBruce Richardson 	flow->num_packets += num;
21899a2dd95SBruce Richardson }
21999a2dd95SBruce Richardson 
22099a2dd95SBruce Richardson /*
22199a2dd95SBruce Richardson  * Process found matches for up to _N_ flows.
22299a2dd95SBruce Richardson  * fmsk - mask of active flows
22399a2dd95SBruce Richardson  * rmsk - mask of found matches
22499a2dd95SBruce Richardson  * pdata - pointers to flow input data
22599a2dd95SBruce Richardson  * di - data indexes for these flows
22699a2dd95SBruce Richardson  * idx - match indexed for given flows
22799a2dd95SBruce Richardson  * tr_lo contains low 32 bits for up to _N_ transitions.
22899a2dd95SBruce Richardson  * tr_hi contains high 32 bits for up to _N_ transitions.
22999a2dd95SBruce Richardson  */
23099a2dd95SBruce Richardson static inline uint32_t
_F_(match_process)23199a2dd95SBruce Richardson _F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
23299a2dd95SBruce Richardson 	uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
23399a2dd95SBruce Richardson 	_T_simd *tr_lo, _T_simd *tr_hi)
23499a2dd95SBruce Richardson {
23599a2dd95SBruce Richardson 	uint32_t n;
23699a2dd95SBruce Richardson 	_T_simd res;
23799a2dd95SBruce Richardson 
23899a2dd95SBruce Richardson 	if (rmsk[0] == 0)
23999a2dd95SBruce Richardson 		return 0;
24099a2dd95SBruce Richardson 
24199a2dd95SBruce Richardson 	/* extract match indexes */
24299a2dd95SBruce Richardson 	res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
24399a2dd95SBruce Richardson 
24499a2dd95SBruce Richardson 	/* mask  matched transitions to nop */
24599a2dd95SBruce Richardson 	tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
24699a2dd95SBruce Richardson 	tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
24799a2dd95SBruce Richardson 
24899a2dd95SBruce Richardson 	/* save found match indexes */
24999a2dd95SBruce Richardson 	_M_I_(mask_i32scatter_epi32)((void *)flow->matches, rmsk[0], idx[0],
25099a2dd95SBruce Richardson 			res, sizeof(flow->matches[0]));
25199a2dd95SBruce Richardson 
25299a2dd95SBruce Richardson 	/* update masks and start new flows for matches */
25399a2dd95SBruce Richardson 	n = update_flow_mask(flow, fmsk, rmsk);
25499a2dd95SBruce Richardson 	_F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
25599a2dd95SBruce Richardson 
25699a2dd95SBruce Richardson 	return n;
25799a2dd95SBruce Richardson }
25899a2dd95SBruce Richardson 
25999a2dd95SBruce Richardson /*
26099a2dd95SBruce Richardson  * Test for matches ut to (2 * _N_) flows at once,
26199a2dd95SBruce Richardson  * if matches exist - process them and start new flows.
26299a2dd95SBruce Richardson  */
26399a2dd95SBruce Richardson static inline void
_F_(match_check_process)26499a2dd95SBruce Richardson _F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
26599a2dd95SBruce Richardson 	_T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
26699a2dd95SBruce Richardson 	_T_simd tr_lo[2], _T_simd tr_hi[2])
26799a2dd95SBruce Richardson {
26899a2dd95SBruce Richardson 	uint32_t n[2];
26999a2dd95SBruce Richardson 	uint32_t rm[2];
27099a2dd95SBruce Richardson 
27199a2dd95SBruce Richardson 	/* check for matches */
27299a2dd95SBruce Richardson 	rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
27399a2dd95SBruce Richardson 	rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
27499a2dd95SBruce Richardson 
27599a2dd95SBruce Richardson 	/* till unprocessed matches exist */
27699a2dd95SBruce Richardson 	while ((rm[0] | rm[1]) != 0) {
27799a2dd95SBruce Richardson 
27899a2dd95SBruce Richardson 		/* process matches and start new flows */
27999a2dd95SBruce Richardson 		n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
28099a2dd95SBruce Richardson 			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
28199a2dd95SBruce Richardson 		n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
28299a2dd95SBruce Richardson 			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
28399a2dd95SBruce Richardson 
28499a2dd95SBruce Richardson 		/* execute first transition for new flows, if any */
28599a2dd95SBruce Richardson 
28699a2dd95SBruce Richardson 		if (n[0] != 0) {
28799a2dd95SBruce Richardson 			inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
28899a2dd95SBruce Richardson 					rm[0], &di[0], flow->first_load_sz);
28999a2dd95SBruce Richardson 			_F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
29099a2dd95SBruce Richardson 					&tr_hi[0]);
29199a2dd95SBruce Richardson 			rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
29299a2dd95SBruce Richardson 					_SV_(match_mask));
29399a2dd95SBruce Richardson 		}
29499a2dd95SBruce Richardson 
29599a2dd95SBruce Richardson 		if (n[1] != 0) {
29699a2dd95SBruce Richardson 			inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
29799a2dd95SBruce Richardson 					rm[1], &di[1], flow->first_load_sz);
29899a2dd95SBruce Richardson 			_F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
29999a2dd95SBruce Richardson 					&tr_hi[1]);
30099a2dd95SBruce Richardson 			rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
30199a2dd95SBruce Richardson 					_SV_(match_mask));
30299a2dd95SBruce Richardson 		}
30399a2dd95SBruce Richardson 	}
30499a2dd95SBruce Richardson }
30599a2dd95SBruce Richardson 
3068e2dd74fSKonstantin Ananyev static inline void
_F_(reset_flow_vars)3078e2dd74fSKonstantin Ananyev _F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4],
3088e2dd74fSKonstantin Ananyev 	_T_simd tr_lo[2], _T_simd tr_hi[2])
3098e2dd74fSKonstantin Ananyev {
3108e2dd74fSKonstantin Ananyev 	di[0] = _M_SI_(setzero)();
3118e2dd74fSKonstantin Ananyev 	di[1] = _M_SI_(setzero)();
3128e2dd74fSKonstantin Ananyev 
3138e2dd74fSKonstantin Ananyev 	idx[0] = _M_SI_(setzero)();
3148e2dd74fSKonstantin Ananyev 	idx[1] = _M_SI_(setzero)();
3158e2dd74fSKonstantin Ananyev 
3168e2dd74fSKonstantin Ananyev 	pdata[0] = _M_SI_(setzero)();
3178e2dd74fSKonstantin Ananyev 	pdata[1] = _M_SI_(setzero)();
3188e2dd74fSKonstantin Ananyev 	pdata[2] = _M_SI_(setzero)();
3198e2dd74fSKonstantin Ananyev 	pdata[3] = _M_SI_(setzero)();
3208e2dd74fSKonstantin Ananyev 
3218e2dd74fSKonstantin Ananyev 	tr_lo[0] = _M_SI_(setzero)();
3228e2dd74fSKonstantin Ananyev 	tr_lo[1] = _M_SI_(setzero)();
3238e2dd74fSKonstantin Ananyev 
3248e2dd74fSKonstantin Ananyev 	tr_hi[0] = _M_SI_(setzero)();
3258e2dd74fSKonstantin Ananyev 	tr_hi[1] = _M_SI_(setzero)();
3268e2dd74fSKonstantin Ananyev }
3278e2dd74fSKonstantin Ananyev 
32899a2dd95SBruce Richardson /*
32999a2dd95SBruce Richardson  * Perform search for up to (2 * _N_) flows in parallel.
33099a2dd95SBruce Richardson  * Use two sets of metadata, each serves _N_ flows max.
33199a2dd95SBruce Richardson  */
33299a2dd95SBruce Richardson static inline void
_F_(search_trie)33399a2dd95SBruce Richardson _F_(search_trie)(struct acl_flow_avx512 *flow)
33499a2dd95SBruce Richardson {
33599a2dd95SBruce Richardson 	uint32_t fm[2];
33699a2dd95SBruce Richardson 	_T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
33799a2dd95SBruce Richardson 
3388e2dd74fSKonstantin Ananyev 	_F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi);
3398e2dd74fSKonstantin Ananyev 
34099a2dd95SBruce Richardson 	/* first 1B load */
34199a2dd95SBruce Richardson 	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
34299a2dd95SBruce Richardson 			&pdata[0], &idx[0], &di[0]);
34399a2dd95SBruce Richardson 	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
34499a2dd95SBruce Richardson 			&pdata[2], &idx[1], &di[1]);
34599a2dd95SBruce Richardson 
34699a2dd95SBruce Richardson 	in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
34799a2dd95SBruce Richardson 			flow->first_load_sz);
34899a2dd95SBruce Richardson 	in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
34999a2dd95SBruce Richardson 			flow->first_load_sz);
35099a2dd95SBruce Richardson 
35199a2dd95SBruce Richardson 	_F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
35299a2dd95SBruce Richardson 	_F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
35399a2dd95SBruce Richardson 
35499a2dd95SBruce Richardson 	fm[0] = _SIMD_MASK_MAX_;
35599a2dd95SBruce Richardson 	fm[1] = _SIMD_MASK_MAX_;
35699a2dd95SBruce Richardson 
35799a2dd95SBruce Richardson 	/* match check */
35899a2dd95SBruce Richardson 	_F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
35999a2dd95SBruce Richardson 
36099a2dd95SBruce Richardson 	while ((fm[0] | fm[1]) != 0) {
36199a2dd95SBruce Richardson 
36299a2dd95SBruce Richardson 		/* load next 4B */
36399a2dd95SBruce Richardson 
36499a2dd95SBruce Richardson 		in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
36599a2dd95SBruce Richardson 				&di[0], sizeof(uint32_t));
36699a2dd95SBruce Richardson 		in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
36799a2dd95SBruce Richardson 				&di[1], sizeof(uint32_t));
36899a2dd95SBruce Richardson 
36999a2dd95SBruce Richardson 		/* main 4B loop */
37099a2dd95SBruce Richardson 
37199a2dd95SBruce Richardson 		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
37299a2dd95SBruce Richardson 		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
37399a2dd95SBruce Richardson 
37499a2dd95SBruce Richardson 		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
37599a2dd95SBruce Richardson 		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
37699a2dd95SBruce Richardson 
37799a2dd95SBruce Richardson 		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
37899a2dd95SBruce Richardson 		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
37999a2dd95SBruce Richardson 
38099a2dd95SBruce Richardson 		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
38199a2dd95SBruce Richardson 		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
38299a2dd95SBruce Richardson 
38399a2dd95SBruce Richardson 		/* check for matches */
38499a2dd95SBruce Richardson 		_F_(match_check_process)(flow, fm, pdata, di, idx, in,
38599a2dd95SBruce Richardson 			tr_lo, tr_hi);
38699a2dd95SBruce Richardson 	}
38799a2dd95SBruce Richardson }
38899a2dd95SBruce Richardson 
38999a2dd95SBruce Richardson /*
39099a2dd95SBruce Richardson  * resolve match index to actual result/priority offset.
39199a2dd95SBruce Richardson  */
39299a2dd95SBruce Richardson static inline _T_simd
_F_(resolve_match_idx)39399a2dd95SBruce Richardson _F_(resolve_match_idx)(_T_simd mi)
39499a2dd95SBruce Richardson {
39599a2dd95SBruce Richardson 	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
396b3b36f0fSKonstantin Ananyev 		1 << (ACL_MATCH_LOG + 2));
397b3b36f0fSKonstantin Ananyev 	return _M_I_(slli_epi32)(mi, ACL_MATCH_LOG);
39899a2dd95SBruce Richardson }
39999a2dd95SBruce Richardson 
40099a2dd95SBruce Richardson /*
40199a2dd95SBruce Richardson  * Resolve multiple matches for the same flow based on priority.
40299a2dd95SBruce Richardson  */
40399a2dd95SBruce Richardson static inline _T_simd
_F_(resolve_pri)40499a2dd95SBruce Richardson _F_(resolve_pri)(const int32_t res[], const int32_t pri[],
40599a2dd95SBruce Richardson 	const uint32_t match[], _T_mask msk, uint32_t nb_trie,
40699a2dd95SBruce Richardson 	uint32_t nb_skip)
40799a2dd95SBruce Richardson {
40899a2dd95SBruce Richardson 	uint32_t i;
40999a2dd95SBruce Richardson 	const uint32_t *pm;
41099a2dd95SBruce Richardson 	_T_mask m;
41199a2dd95SBruce Richardson 	_T_simd cp, cr, np, nr, mch;
41299a2dd95SBruce Richardson 
41399a2dd95SBruce Richardson 	const _T_simd zero = _M_I_(set1_epi32)(0);
41499a2dd95SBruce Richardson 
41599a2dd95SBruce Richardson 	/* get match indexes */
41699a2dd95SBruce Richardson 	mch = _M_I_(maskz_loadu_epi32)(msk, match);
41799a2dd95SBruce Richardson 	mch = _F_(resolve_match_idx)(mch);
41899a2dd95SBruce Richardson 
41999a2dd95SBruce Richardson 	/* read result and priority values for first trie */
42099a2dd95SBruce Richardson 	cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
42199a2dd95SBruce Richardson 	cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));
42299a2dd95SBruce Richardson 
42399a2dd95SBruce Richardson 	/*
42499a2dd95SBruce Richardson 	 * read result and priority values for next tries and select one
42599a2dd95SBruce Richardson 	 * with highest priority.
42699a2dd95SBruce Richardson 	 */
42799a2dd95SBruce Richardson 	for (i = 1, pm = match + nb_skip; i != nb_trie;
42899a2dd95SBruce Richardson 			i++, pm += nb_skip) {
42999a2dd95SBruce Richardson 
43099a2dd95SBruce Richardson 		mch = _M_I_(maskz_loadu_epi32)(msk, pm);
43199a2dd95SBruce Richardson 		mch = _F_(resolve_match_idx)(mch);
43299a2dd95SBruce Richardson 
43399a2dd95SBruce Richardson 		nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
43499a2dd95SBruce Richardson 				sizeof(res[0]));
43599a2dd95SBruce Richardson 		np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
43699a2dd95SBruce Richardson 				sizeof(pri[0]));
43799a2dd95SBruce Richardson 
43899a2dd95SBruce Richardson 		m = _M_I_(cmpgt_epi32_mask)(cp, np);
43999a2dd95SBruce Richardson 		cr = _M_I_(mask_mov_epi32)(nr, m, cr);
44099a2dd95SBruce Richardson 		cp = _M_I_(mask_mov_epi32)(np, m, cp);
44199a2dd95SBruce Richardson 	}
44299a2dd95SBruce Richardson 
44399a2dd95SBruce Richardson 	return cr;
44499a2dd95SBruce Richardson }
44599a2dd95SBruce Richardson 
44699a2dd95SBruce Richardson /*
44799a2dd95SBruce Richardson  * Resolve num (<= _N_) matches for single category
44899a2dd95SBruce Richardson  */
44999a2dd95SBruce Richardson static inline void
_F_(resolve_sc)45099a2dd95SBruce Richardson _F_(resolve_sc)(uint32_t result[], const int32_t res[],
45199a2dd95SBruce Richardson 	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
45299a2dd95SBruce Richardson 	uint32_t nb_trie, uint32_t nb_skip)
45399a2dd95SBruce Richardson {
45499a2dd95SBruce Richardson 	_T_mask msk;
45599a2dd95SBruce Richardson 	_T_simd cr;
45699a2dd95SBruce Richardson 
45799a2dd95SBruce Richardson 	msk = (1 << nb_pkt) - 1;
45899a2dd95SBruce Richardson 	cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
45999a2dd95SBruce Richardson 	_M_I_(mask_storeu_epi32)(result, msk, cr);
46099a2dd95SBruce Richardson }
46199a2dd95SBruce Richardson 
46299a2dd95SBruce Richardson /*
46399a2dd95SBruce Richardson  * Resolve matches for single category
46499a2dd95SBruce Richardson  */
46599a2dd95SBruce Richardson static inline void
_F_(resolve_single_cat)46699a2dd95SBruce Richardson _F_(resolve_single_cat)(uint32_t result[],
46799a2dd95SBruce Richardson 	const struct rte_acl_match_results pr[], const uint32_t match[],
46899a2dd95SBruce Richardson 	uint32_t nb_pkt, uint32_t nb_trie)
46999a2dd95SBruce Richardson {
47099a2dd95SBruce Richardson 	uint32_t j, k, n;
47199a2dd95SBruce Richardson 	const int32_t *res, *pri;
47299a2dd95SBruce Richardson 	_T_simd cr[2];
47399a2dd95SBruce Richardson 
47499a2dd95SBruce Richardson 	res = (const int32_t *)pr->results;
47599a2dd95SBruce Richardson 	pri = pr->priority;
47699a2dd95SBruce Richardson 
47799a2dd95SBruce Richardson 	for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
47899a2dd95SBruce Richardson 
47999a2dd95SBruce Richardson 		j = k + _SIMD_MASK_BIT_;
48099a2dd95SBruce Richardson 
48199a2dd95SBruce Richardson 		cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
48299a2dd95SBruce Richardson 				nb_trie, nb_pkt);
48399a2dd95SBruce Richardson 		cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
48499a2dd95SBruce Richardson 				nb_trie, nb_pkt);
48599a2dd95SBruce Richardson 
48699a2dd95SBruce Richardson 		_M_SI_(storeu)((void *)(result + k), cr[0]);
48799a2dd95SBruce Richardson 		_M_SI_(storeu)((void *)(result + j), cr[1]);
48899a2dd95SBruce Richardson 	}
48999a2dd95SBruce Richardson 
49099a2dd95SBruce Richardson 	n = nb_pkt - k;
49199a2dd95SBruce Richardson 	if (n != 0) {
49299a2dd95SBruce Richardson 		if (n > _SIMD_MASK_BIT_) {
49399a2dd95SBruce Richardson 			_F_(resolve_sc)(result + k, res, pri, match + k,
49499a2dd95SBruce Richardson 				_SIMD_MASK_BIT_, nb_trie, nb_pkt);
49599a2dd95SBruce Richardson 			k += _SIMD_MASK_BIT_;
49699a2dd95SBruce Richardson 			n -= _SIMD_MASK_BIT_;
49799a2dd95SBruce Richardson 		}
49899a2dd95SBruce Richardson 		_F_(resolve_sc)(result + k, res, pri, match + k, n,
49999a2dd95SBruce Richardson 				nb_trie, nb_pkt);
50099a2dd95SBruce Richardson 	}
50199a2dd95SBruce Richardson }
502