/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 512-bit registers (zmm) and intrinsics.
 * So our main SIMD type is 512 bits wide and each such variable can
 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
 */
#define _T_simd		__m512i
#define _T_mask		__mmask16

/* Naming convention for static const variables. */
#define _SC_(x)		zmm_##x
#define _SV_(x)		(zmm_##x.z)
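/*
 * E.g. _SC_(match_mask) expands to zmm_match_mask, while
 * _SV_(match_mask) expands to (zmm_match_mask.z), i.e. its __m512i view.
 */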

/* Naming convention for internal functions. */
#define _F_(x)		x##_avx512x16
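/* E.g. _F_(search_trie) expands to search_trie_avx512x16. */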

/*
 * The same intrinsics have different syntaxes (depending on the bit-width),
 * so a few macros need to be defined to overcome that.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x)	_mm512_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x)	_mm512_##x##_si512

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x)	_mm512_##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale)	_mm512_##name(idx, base, scale)
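/*
 * E.g. _M_I_(set1_epi32)(x) expands to _mm512_set1_epi32(x),
 * _M_SI_(xor)(a, b) expands to _mm512_xor_si512(a, b), and
 * _M_GI_(i64gather_epi32, idx, base, scale) expands to
 * _mm512_i64gather_epi32(idx, base, scale).
 */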

/* number/mask of transitions per SIMD register */
#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)

/* number/mask of pointers per SIMD register */
#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
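/*
 * With _T_simd == __m512i this yields 16 transitions per register
 * (_SIMD_MASK_BIT_ == 64 / 4 == 16, _SIMD_MASK_MAX_ == 0xffff) and
 * 32 flows per pair of registers (_SIMD_FLOW_NUM_ == 32,
 * _SIMD_FLOW_MSK_ == 0x1f). On a 64-bit build _SIMD_PTR_NUM_ == 8
 * and _SIMD_PTR_MSK_ == 0xff.
 */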

static const __rte_x86_zmm_t _SC_(match_mask) = {
	.u32 = {
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
	},
};

static const __rte_x86_zmm_t _SC_(index_mask) = {
	.u32 = {
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
	},
};

static const __rte_x86_zmm_t _SC_(trlo_idle) = {
	.u32 = {
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
	},
};

static const __rte_x86_zmm_t _SC_(trhi_idle) = {
	.u32 = {
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
	},
};

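/*
 * Shuffle control: within each 128-bit lane, dword i holds four copies
 * of the byte value 4 * i (0x00, 0x04, 0x08, 0x0c) - one lane pattern
 * repeated four times.
 */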
static const __rte_x86_zmm_t _SC_(shuffle_input) = {
	.u32 = {
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

static const __rte_x86_zmm_t _SC_(four_32) = {
	.u32 = {
		4, 4, 4, 4,
		4, 4, 4, 4,
		4, 4, 4, 4,
		4, 4, 4, 4,
	},
};

static const __rte_x86_zmm_t _SC_(idx_add) = {
	.u32 = {
		0, 1, 2, 3,
		4, 5, 6, 7,
		8, 9, 10, 11,
		12, 13, 14, 15,
	},
};

static const __rte_x86_zmm_t _SC_(range_base) = {
	.u32 = {
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

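/*
 * permutex2var control used by gather_bytes() below: indexes 0x00-0x07
 * select dwords 0-7 of the first source operand, 0x10-0x17 select
 * dwords 0-7 of the second one (bit 4 of the index picks the operand).
 */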
static const __rte_x86_zmm_t _SC_(pminp) = {
	.u32 = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	},
};

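/*
 * pmidx[] holds the dword indexes 0-15 spread over the even 32-bit lanes
 * (odd lanes are zero); pmidx_msk (0x5555) selects exactly those even lanes.
 */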
static const _T_mask _SC_(pmidx_msk) = 0x5555;

static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
	[0] = {
		.u32 = {
			0, 0, 1, 0, 2, 0, 3, 0,
			4, 0, 5, 0, 6, 0, 7, 0,
		},
	},
	[1] = {
		.u32 = {
			8, 0, 9, 0, 10, 0, 11, 0,
			12, 0, 13, 0, 14, 0, 15, 0,
		},
	},
};

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities. So we have to mimic it in SW,
 * by doing 8x1B scalar loads.
 */
static inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
	rte_ymm_t v;
	__rte_x86_zmm_t p;

	static const uint32_t zero;

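	/*
	 * For lanes not selected by the mask, replace the pointer with the
	 * address of a dummy zero value, so the unconditional scalar loads
	 * below stay valid and simply read 0.
	 */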
	p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
		(uintptr_t)&zero);

	v.u32[0] = *(uint8_t *)p.u64[0];
	v.u32[1] = *(uint8_t *)p.u64[1];
	v.u32[2] = *(uint8_t *)p.u64[2];
	v.u32[3] = *(uint8_t *)p.u64[3];
	v.u32[4] = *(uint8_t *)p.u64[4];
	v.u32[5] = *(uint8_t *)p.u64[5];
	v.u32[6] = *(uint8_t *)p.u64[6];
	v.u32[7] = *(uint8_t *)p.u64[7];

	return v.y;
}

/*
 * Gather 4 or 1 input bytes for up to 16 (2 * 8) locations in parallel.
 */
static __rte_always_inline __m512i
_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
	uint32_t bnum)
{
	__m256i inp[2];

	if (bnum == sizeof(uint8_t)) {
		inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
		inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
	} else {
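		/*
		 * 4-byte case: a real gather; p[] already holds absolute
		 * addresses, hence the NULL base and a scale of 1.
		 */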
		inp[0] = _mm512_mask_i64gather_epi32(
				_mm512_castsi512_si256(zero),
				m[0], p[0], NULL, sizeof(uint8_t));
		inp[1] = _mm512_mask_i64gather_epi32(
				_mm512_castsi512_si256(zero),
				m[1], p[1], NULL, sizeof(uint8_t));
	}

	/* squeeze input into one 512-bit register */
	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
			_SV_(pminp), _mm512_castsi256_si512(inp[1]));
}

/*
 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
 */
static inline void
resolve_mcgt8_avx512x1(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
{
	const int32_t *pri;
	const uint32_t *pm, *res;
	uint32_t i, k, mi;
	__mmask16 cm, sm;
	__m512i cp, cr, np, nr;

	res = pr->results;
	pri = pr->priority;

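	/* load/store mask covering the first nb_cat 32-bit lanes */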
	cm = (1 << nb_cat) - 1;

	for (k = 0; k != nb_pkt; k++, result += nb_cat) {

		mi = match[k] << ACL_MATCH_LOG;

		cr = _mm512_maskz_loadu_epi32(cm, res + mi);
		cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

		for (i = 1, pm = match + nb_pkt; i != nb_trie;
				i++, pm += nb_pkt) {

			mi = pm[k] << ACL_MATCH_LOG;

			nr = _mm512_maskz_loadu_epi32(cm, res + mi);
			np = _mm512_maskz_loadu_epi32(cm, pri + mi);

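			/*
			 * Per lane: keep the current result while its
			 * priority is higher, otherwise take the result
			 * from this trie.
			 */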
			sm = _mm512_cmpgt_epi32_mask(cp, np);
			cr = _mm512_mask_mov_epi32(nr, sm, cr);
			cp = _mm512_mask_mov_epi32(np, sm, cp);
		}

		_mm512_mask_storeu_epi32(result, cm, cr);
	}
}

#include "acl_run_avx512_common.h"

/*
 * Perform a search for up to (2 * 16) flows in parallel.
 * Use two sets of metadata, each serving up to 16 flows.
 */
static inline int
search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t i, *pm;
	const struct rte_acl_match_results *pr;
	struct acl_flow_avx512 flow;
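	/* per-trie match indexes: one entry per packet for each trie */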
	uint32_t match[ctx->num_tries * total_packets];

	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

		/* setup for next trie */
		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

		/* process the trie */
		_F_(search_trie)(&flow);
	}

	/* resolve matches */
	pr = (const struct rte_acl_match_results *)
		(ctx->trans_table + ctx->match_index);

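	/*
	 * Pick a resolver: single category, up to half of
	 * RTE_ACL_MAX_CATEGORIES, or more than that.
	 */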
	if (categories == 1)
		_F_(resolve_single_cat)(results, pr, match, total_packets,
			ctx->num_tries);
	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
		resolve_mcle8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);
	else
		resolve_mcgt8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);

	return 0;
}

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd