/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 256-bit (ymm) registers and intrinsics.
 * So our main SIMD type is 256 bits wide and each such variable can
 * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
 */
#define _T_simd		__m256i
#define _T_mask		__mmask8

/* Naming convention for static const variables. */
#define _SC_(x)		ymm_##x
#define _SV_(x)		(ymm_##x.y)

/* Naming convention for internal functions. */
#define _F_(x)		x##_avx512x8

/*
 * The same intrinsics have different syntaxes (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi(packed integers) type intrinsics. */
#define _M_I_(x)	_mm256_##x

/* Naming convention for si(whole simd integer) type intrinsics. */
#define _M_SI_(x)	_mm256_##x##_si256

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x)	_mm256_m##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale)	_mm256_##name(base, idx, scale)
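/*
 * For example (purely mechanical expansions, listed for illustration only):
 *   _M_I_(set1_epi32)             -> _mm256_set1_epi32
 *   _M_SI_(and)                   -> _mm256_and_si256
 *   _M_MGI_(mask_i32gather_epi32) -> _mm256_mmask_i32gather_epi32
 *   _M_GI_(i64gather_epi32, idx, base, scale)
 *                                 -> _mm256_i64gather_epi32(base, idx, scale)
 * The 512-bit variant of this file maps the same names onto the
 * corresponding _mm512_* intrinsics.
 */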

/* num/mask of transitions per SIMD regs */
#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD regs */
#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
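/*
 * For this 256-bit flavour (assuming a 64-bit build) the values above
 * evaluate to:
 *   _SIMD_MASK_BIT_ == 8,  _SIMD_MASK_MAX_ == 0xff
 *   _SIMD_FLOW_NUM_ == 16, _SIMD_FLOW_MSK_ == 0xf
 *   _SIMD_PTR_NUM_  == 4,  _SIMD_PTR_MSK_  == 0xf
 */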

static const rte_ymm_t _SC_(match_mask) = {
	.u32 = {
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
	},
};

static const rte_ymm_t _SC_(index_mask) = {
	.u32 = {
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
	},
};

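/*
 * Idle transition, kept as separate low/high 32-bit halves: trlo_idle
 * holds RTE_ACL_IDLE_NODE in every lane, trhi_idle is all zeroes.
 * Presumably used by the common code to park lanes with no active flow.
 */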
static const rte_ymm_t _SC_(trlo_idle) = {
	.u32 = {
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
	},
};

static const rte_ymm_t _SC_(trhi_idle) = {
	.u32 = {
		0, 0, 0, 0,
		0, 0, 0, 0,
	},
};

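/*
 * shuffle_epi8 control that replicates the low byte of every 32-bit lane
 * across all four bytes of that lane (byte 0 into dword 0, byte 4 into
 * dword 1, and so on within each 128-bit half).
 */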
static const rte_ymm_t _SC_(shuffle_input) = {
	.u32 = {
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

static const rte_ymm_t _SC_(four_32) = {
	.u32 = {
		4, 4, 4, 4,
		4, 4, 4, 4,
	},
};

static const rte_ymm_t _SC_(idx_add) = {
	.u32 = {
		0, 1, 2, 3,
		4, 5, 6, 7,
	},
};

static const rte_ymm_t _SC_(range_base) = {
	.u32 = {
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

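/*
 * permutex2var_epi32 control used by gather_bytes() below: indexes 0-3
 * pick dwords from the first operand, indexes 8-11 pick dwords 0-3 of
 * the second operand, merging the two 128-bit gather results into the
 * low and high halves of a single 256-bit value.
 */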
static const rte_ymm_t _SC_(pminp) = {
	.u32 = {
		0x00, 0x01, 0x02, 0x03,
		0x08, 0x09, 0x0a, 0x0b,
	},
};

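/*
 * pmidx[] spreads eight 32-bit indexes over two vectors of 64-bit lanes
 * (index in the even dword, zero in the odd dword of each lane), while
 * pmidx_msk (0x55) selects those even dwords. Presumably used by the
 * common code when widening 32-bit offsets for 64-bit pointer arithmetic.
 */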
static const __mmask16 _SC_(pmidx_msk) = 0x55;

static const rte_ymm_t _SC_(pmidx[2]) = {
	[0] = {
		.u32 = {
			0, 0, 1, 0, 2, 0, 3, 0,
		},
	},
	[1] = {
		.u32 = {
			4, 0, 5, 0, 6, 0, 7, 0,
		},
	},
};

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities. So we have to mimic it in SW,
 * by doing 4x1B scalar loads.
 */
static inline __m128i
_m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
{
	rte_xmm_t v;
	rte_ymm_t p;

	static const uint32_t zero;

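	/*
	 * Replace pointers of inactive lanes (mask bit cleared) with the
	 * address of a local zero, so that the unconditional scalar loads
	 * below always dereference valid memory.
	 */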
	p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
		(uintptr_t)&zero);

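	/* 4x1B scalar loads, each zero-extended into its own 32-bit lane */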
	v.u32[0] = *(uint8_t *)p.u64[0];
	v.u32[1] = *(uint8_t *)p.u64[1];
	v.u32[2] = *(uint8_t *)p.u64[2];
	v.u32[3] = *(uint8_t *)p.u64[3];

	return v.x;
}

/*
 * Gather 4/1 input bytes for up to 8 (2*4) locations in parallel.
 */
static __rte_always_inline __m256i
_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
	uint32_t bnum)
{
	__m128i inp[2];

	if (bnum == sizeof(uint8_t)) {
		inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
		inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
	} else {
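		/*
		 * Gather a full 32-bit word (4 input bytes) per location;
		 * p[] holds absolute addresses, hence the NULL base pointer
		 * and single-byte scale.
		 */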
		inp[0] = _mm256_mmask_i64gather_epi32(
				_mm256_castsi256_si128(zero),
				m[0], p[0], NULL, sizeof(uint8_t));
		inp[1] = _mm256_mmask_i64gather_epi32(
				_mm256_castsi256_si128(zero),
				m[1], p[1], NULL, sizeof(uint8_t));
	}

	/* squeeze input into one 256-bit register */
	return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
			_SV_(pminp), _mm256_castsi128_si256(inp[1]));
}

#include "acl_run_avx512_common.h"
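
/*
 * The include above instantiates (through the _F_()/_M_*() macros defined
 * in this file) the 8-lane helpers used below, such as _F_(search_trie)
 * and _F_(resolve_single_cat).
 */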

/*
 * Perform search for up to (2 * 8) flows in parallel.
 * Use two sets of metadata, each serving up to 8 flows.
 */
static inline int
search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t i, *pm;
	const struct rte_acl_match_results *pr;
	struct acl_flow_avx512 flow;
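	/* one row of total_packets match indexes per trie */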
	uint32_t match[ctx->num_tries * total_packets];

	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

		/* setup for next trie */
		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

		/* process the trie */
		_F_(search_trie)(&flow);
	}

	/* resolve matches */
	pr = (const struct rte_acl_match_results *)
		(ctx->trans_table + ctx->match_index);

	if (categories == 1)
		_F_(resolve_single_cat)(results, pr, match, total_packets,
			ctx->num_tries);
	else
		resolve_mcle8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);

	return 0;
}

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd