xref: /dpdk/lib/acl/acl_run_avx512_common.h (revision 3d4e27fd7ff050d565c7450930c92fb945706518)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

/*
 * WARNING: It is not recommended to include this file directly.
 * Please include "acl_run_avx512x*.h" instead.
 * To make this file generate proper code, an includer has to
 * define several macros; refer to "acl_run_avx512x*.h" for more details.
 */

/*
 * Calculate the address of the next transition for
 * all types of nodes. Note that only DFA nodes and range
 * nodes actually transition to another node. Match
 * nodes are not supposed to be encountered here.
 * For quad range nodes:
 * Calculate the number of range boundaries that are less than the
 * input value. Range boundaries for each node are in signed 8 bit,
 * ordered from -128 to 127.
 * This is effectively a popcnt of the boundary bytes that are less
 * than the input byte.
 * Single nodes are processed in the same way as quad range nodes.
 */
static __rte_always_inline _T_simd
_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
	_T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
{
	__mmask64 qm;
	_T_mask dfa_msk;
	_T_simd addr, in, node_type, r, t;
	_T_simd dfa_ofs, quad_ofs;

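	/* zero t; rearrange input bytes per lane as defined by shuffle_input */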
	t = _M_SI_(xor)(index_mask, index_mask);
	in = _M_I_(shuffle_epi8)(next_input, shuffle_input);

	/* Calc node type and node addr */
	node_type = _M_SI_(andnot)(index_mask, tr_lo);
	addr = _M_SI_(and)(index_mask, tr_lo);

	/* mask for DFA type(0) nodes */
	dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);

	/* DFA calculations. */
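	/*
	 * For a DFA node tr_hi holds four base bytes, one per 64-wide
	 * quarter of the input range; (input >> 6) selects the quarter,
	 * and its base is subtracted from the input byte.
	 */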
	r = _M_I_(srli_epi32)(in, 30);
	r = _M_I_(add_epi8)(r, range_base);
	t = _M_I_(srli_epi32)(in, 24);
	r = _M_I_(shuffle_epi8)(tr_hi, r);

	dfa_ofs = _M_I_(sub_epi32)(t, r);

	/* QUAD/SINGLE calculations. */
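	/*
	 * Boundary bytes within a lane are ordered, so the bytes the input
	 * compares greater than form a contiguous run at the low end of the
	 * lane; count them as (4 - number of leading zero bytes).
	 */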
	qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
	t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
	t = _M_I_(lzcnt_epi32)(t);
	t = _M_I_(srli_epi32)(t, 3);
	quad_ofs = _M_I_(sub_epi32)(four_32, t);

	/* blend DFA and QUAD/SINGLE. */
	t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);

	/* calculate address for next transitions. */
	addr = _M_I_(add_epi32)(addr, t);
	return addr;
}

/*
 * Process _N_ transitions in parallel.
 * tr_lo contains low 32 bits for _N_ transitions.
 * tr_hi contains high 32 bits for _N_ transitions.
 * next_input contains up to 4 input bytes for _N_ flows.
 */
static __rte_always_inline _T_simd
_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
	_T_simd *tr_hi)
{
	const int32_t *tr;
	_T_simd addr;

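	/* view 64-bit transitions as lo/hi 32-bit pairs for the gathers below */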
	tr = (const int32_t *)(uintptr_t)trans;

	/* Calculate the address (array index) for all _N_ transitions. */
	addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
		_SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);

	/* load lower 32 bits of _N_ transitions at once. */
	*tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));

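	/* shift the just-consumed byte out, exposing the next input byte */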
	next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);

	/* load high 32 bits of _N_ transitions at once. */
	*tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));

	return next_input;
}

/*
 * Execute first transition for up to _N_ flows in parallel.
 * next_input should contain one input byte for up to _N_ flows.
 * msk - mask of active flows.
 * tr_lo contains low 32 bits for up to _N_ transitions.
 * tr_hi contains high 32 bits for up to _N_ transitions.
 */
static __rte_always_inline void
_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
	_T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
{
	const int32_t *tr;
	_T_simd addr, root;

	tr = (const int32_t *)(uintptr_t)flow->trans;

	addr = _M_I_(set1_epi32)(UINT8_MAX);
	root = _M_I_(set1_epi32)(flow->root_index);

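	/* first transition index is simply root_index plus the input byte */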
	addr = _M_SI_(and)(next_input, addr);
	addr = _M_I_(add_epi32)(root, addr);

	/* load lower 32 bits of _N_ transitions at once. */
	*tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
		sizeof(flow->trans[0]));

	/* load high 32 bits of _N_ transitions at once. */
	*tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
		sizeof(flow->trans[0]));
}

/*
 * Load and return next 4 input bytes for up to _N_ flows in parallel.
 * pdata - two vectors with pointers to flow input data
 * msk - mask of active flows.
 * di - data indexes for these _N_ flows.
 */
static inline _T_simd
_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
	uint32_t msk, _T_simd *di, uint32_t bnum)
{
	const int32_t *div;
	uint32_t m[2];
	_T_simd one, zero, t, p[2];

	div = (const int32_t *)flow->data_index;

	one = _M_I_(set1_epi32)(1);
	zero = _M_SI_(xor)(one, one);

	/* load data offsets for given indexes */
	t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));

	/* increment data indexes */
	*di = _M_I_(mask_add_epi32)(*di, msk, *di, one);

	/*
	 * unsigned expand 32-bit indexes to 64-bit
	 * (for later pointer arithmetic), i.e:
	 * for (i = 0; i != _N_; i++)
	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
	 */
	p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
			t);
	p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
			t);

	p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
	p[1] = _M_I_(add_epi64)(p[1], pdata[1]);

	/* load input byte(s), either one or four */

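	/* split the flow mask into two halves, one per pdata[] vector */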
	m[0] = msk & _SIMD_PTR_MSK_;
	m[1] = msk >> _SIMD_PTR_NUM_;

	return _F_(gather_bytes)(zero, p, m, bnum);
}

/*
 * Start up to _N_ new flows.
 * num - number of flows to start
 * msk - mask of new flows.
 * pdata - pointers to flow input data
 * idx - match indexes for given flows
 * di - data indexes for these flows.
 */
static inline void
_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
	_T_simd pdata[2], _T_simd *idx, _T_simd *di)
{
	uint32_t n, m[2], nm[2];
	_T_simd ni, nd[2];

	/* split mask into two - one for each pdata[] */
	m[0] = msk & _SIMD_PTR_MSK_;
	m[1] = msk >> _SIMD_PTR_NUM_;

	/* calculate masks for new flows */
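	/* nm[0]/nm[1] select how many new pointers to load for each pdata[] half */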
	n = rte_popcount32(m[0]);
	nm[0] = (1 << n) - 1;
	nm[1] = (1 << (num - n)) - 1;

	/* load input data pointers for new flows */
	nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
			flow->idata + flow->num_packets);
	nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
			flow->idata + flow->num_packets + n);

	/* calculate match indexes of new flows */
	ni = _M_I_(set1_epi32)(flow->num_packets);
	ni = _M_I_(add_epi32)(ni, _SV_(idx_add));

	/* merge new and existing flows data */
	pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
	pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);

	/* update match and data indexes */
	*idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
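	/* zero data indexes for the newly started flows, keep the others */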
	*di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);

	flow->num_packets += num;
}

/*
 * Process found matches for up to _N_ flows.
 * fmsk - mask of active flows
 * rmsk - mask of found matches
 * pdata - pointers to flow input data
 * di - data indexes for these flows
 * idx - match indexes for given flows
 * tr_lo contains low 32 bits for up to _N_ transitions.
 * tr_hi contains high 32 bits for up to _N_ transitions.
 */
static inline uint32_t
_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
	uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
	_T_simd *tr_lo, _T_simd *tr_hi)
{
	uint32_t n;
	_T_simd res;

	if (rmsk[0] == 0)
		return 0;

	/* extract match indexes */
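	/* for a match node, the index part of tr_lo is the match-results index */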
	res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));

	/* mask matched transitions to nop */
	tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
	tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));

	/* save found match indexes */
	_M_I_(mask_i32scatter_epi32)((void *)flow->matches, rmsk[0], idx[0],
			res, sizeof(flow->matches[0]));

	/* update masks and start new flows for matches */
	n = update_flow_mask(flow, fmsk, rmsk);
	_F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);

	return n;
}

/*
 * Test for matches for up to (2 * _N_) flows at once;
 * if matches exist, process them and start new flows.
 */
static inline void
_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
	_T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
	_T_simd tr_lo[2], _T_simd tr_hi[2])
{
	uint32_t n[2];
	uint32_t rm[2];

	/* check for matches */
	rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
	rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));

	/* till unprocessed matches exist */
	while ((rm[0] | rm[1]) != 0) {

		/* process matches and start new flows */
		n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
		n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);

		/* execute first transition for new flows, if any */

		if (n[0] != 0) {
			inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
					rm[0], &di[0], flow->first_load_sz);
			_F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
					&tr_hi[0]);
			rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
					_SV_(match_mask));
		}

		if (n[1] != 0) {
			inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
					rm[1], &di[1], flow->first_load_sz);
			_F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
					&tr_hi[1]);
			rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
					_SV_(match_mask));
		}
	}
}

static inline void
_F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4],
	_T_simd tr_lo[2], _T_simd tr_hi[2])
{
	di[0] = _M_SI_(setzero)();
	di[1] = _M_SI_(setzero)();

	idx[0] = _M_SI_(setzero)();
	idx[1] = _M_SI_(setzero)();

	pdata[0] = _M_SI_(setzero)();
	pdata[1] = _M_SI_(setzero)();
	pdata[2] = _M_SI_(setzero)();
	pdata[3] = _M_SI_(setzero)();

	tr_lo[0] = _M_SI_(setzero)();
	tr_lo[1] = _M_SI_(setzero)();

	tr_hi[0] = _M_SI_(setzero)();
	tr_hi[1] = _M_SI_(setzero)();
}

/*
 * Perform search for up to (2 * _N_) flows in parallel.
 * Use two sets of metadata, each serves _N_ flows max.
 */
static inline void
_F_(search_trie)(struct acl_flow_avx512 *flow)
{
	uint32_t fm[2];
	_T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];

	_F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi);

	/* first 1B load */
	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
			&pdata[0], &idx[0], &di[0]);
	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
			&pdata[2], &idx[1], &di[1]);

	in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
			flow->first_load_sz);
	in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
			flow->first_load_sz);

	_F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
	_F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);

	fm[0] = _SIMD_MASK_MAX_;
	fm[1] = _SIMD_MASK_MAX_;

	/* match check */
	_F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);

	while ((fm[0] | fm[1]) != 0) {

		/* load next 4B */

		in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
				&di[0], sizeof(uint32_t));
		in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
				&di[1], sizeof(uint32_t));

		/* main 4B loop */
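		/* each trans() consumes one byte: 4 calls process the 4B just loaded */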

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		/* check for matches */
		_F_(match_check_process)(flow, fm, pdata, di, idx, in,
			tr_lo, tr_hi);
	}
}

/*
 * Resolve match index to actual result/priority offset.
 */
static inline _T_simd
_F_(resolve_match_idx)(_T_simd mi)
{
	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
		1 << (ACL_MATCH_LOG + 2));
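	/*
	 * Callers gather with a scale of sizeof(int32_t), so shifting by
	 * ACL_MATCH_LOG yields a byte offset of
	 * mi * sizeof(struct rte_acl_match_results).
	 */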
	return _M_I_(slli_epi32)(mi, ACL_MATCH_LOG);
}

/*
 * Resolve multiple matches for the same flow based on priority.
 */
static inline _T_simd
_F_(resolve_pri)(const int32_t res[], const int32_t pri[],
	const uint32_t match[], _T_mask msk, uint32_t nb_trie,
	uint32_t nb_skip)
{
	uint32_t i;
	const uint32_t *pm;
	_T_mask m;
	_T_simd cp, cr, np, nr, mch;

	const _T_simd zero = _M_I_(set1_epi32)(0);

	/* get match indexes */
	mch = _M_I_(maskz_loadu_epi32)(msk, match);
	mch = _F_(resolve_match_idx)(mch);

	/* read result and priority values for first trie */
	cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
	cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));

	/*
	 * read result and priority values for next tries and select one
	 * with highest priority.
	 */
	for (i = 1, pm = match + nb_skip; i != nb_trie;
			i++, pm += nb_skip) {

		mch = _M_I_(maskz_loadu_epi32)(msk, pm);
		mch = _F_(resolve_match_idx)(mch);

		nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
				sizeof(res[0]));
		np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
				sizeof(pri[0]));

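		/* keep the current result only where its priority is higher */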
		m = _M_I_(cmpgt_epi32_mask)(cp, np);
		cr = _M_I_(mask_mov_epi32)(nr, m, cr);
		cp = _M_I_(mask_mov_epi32)(np, m, cp);
	}

	return cr;
}

/*
 * Resolve num (<= _N_) matches for single category
 */
static inline void
_F_(resolve_sc)(uint32_t result[], const int32_t res[],
	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
	uint32_t nb_trie, uint32_t nb_skip)
{
	_T_mask msk;
	_T_simd cr;

	msk = (1 << nb_pkt) - 1;
	cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
	_M_I_(mask_storeu_epi32)(result, msk, cr);
}

/*
 * Resolve matches for single category
 */
static inline void
_F_(resolve_single_cat)(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_trie)
{
	uint32_t j, k, n;
	const int32_t *res, *pri;
	_T_simd cr[2];

	res = (const int32_t *)pr->results;
	pri = pr->priority;

	for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {

		j = k + _SIMD_MASK_BIT_;

		cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
				nb_trie, nb_pkt);
		cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
				nb_trie, nb_pkt);

		_M_SI_(storeu)((void *)(result + k), cr[0]);
		_M_SI_(storeu)((void *)(result + j), cr[1]);
	}

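	/* handle the remaining nb_pkt % _SIMD_FLOW_NUM_ packets */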
	n = nb_pkt - k;
	if (n != 0) {
		if (n > _SIMD_MASK_BIT_) {
			_F_(resolve_sc)(result + k, res, pri, match + k,
				_SIMD_MASK_BIT_, nb_trie, nb_pkt);
			k += _SIMD_MASK_BIT_;
			n -= _SIMD_MASK_BIT_;
		}
		_F_(resolve_sc)(result + k, res, pri, match + k, n,
				nb_trie, nb_pkt);
	}
}