xref: /dpdk/lib/acl/acl_run_avx512.c (revision 3d4e27fd7ff050d565c7450930c92fb945706518)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4 
5 #include "acl_run_sse.h"
6 
/*
 * Left shift applied to a match index to turn it into a uint32_t element
 * offset into the match results/priority arrays. The value is chosen so that:
 * sizeof(uint32_t) << ACL_MATCH_LOG == sizeof(struct rte_acl_match_results)
 */
#define ACL_MATCH_LOG	5
9 
/*
 * Search state shared by the AVX512 classify code paths.
 * All members are initialised by acl_set_flow_avx512().
 */
struct acl_flow_avx512 {
	/* number of packets processed */
	uint32_t num_packets;
	/* max number of packets to process */
	uint32_t total_packets;
	/* current root index */
	uint32_t root_index;
	/* first load size for new packet */
	uint32_t first_load_sz;
	/* transition table */
	const uint64_t *trans;
	/* input data indexes */
	const uint32_t *data_index;
	/* input data */
	const uint8_t **idata;
	/* match indexes */
	uint32_t *matches;
};
20 
21 static inline void
acl_set_flow_avx512(struct acl_flow_avx512 * flow,const struct rte_acl_ctx * ctx,uint32_t trie,const uint8_t * data[],uint32_t * matches,uint32_t total_packets)22 acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx,
23 	uint32_t trie, const uint8_t *data[], uint32_t *matches,
24 	uint32_t total_packets)
25 {
26 	flow->num_packets = 0;
27 	flow->total_packets = total_packets;
28 	flow->first_load_sz = ctx->first_load_sz;
29 	flow->root_index = ctx->trie[trie].root_index;
30 	flow->trans = ctx->trans_table;
31 	flow->data_index = ctx->trie[trie].data_index;
32 	flow->idata = data;
33 	flow->matches = matches;
34 }
35 
36 /*
37  * Update flow and result masks based on the number of unprocessed flows.
38  */
39 static inline uint32_t
update_flow_mask(const struct acl_flow_avx512 * flow,uint32_t * fmsk,uint32_t * rmsk)40 update_flow_mask(const struct acl_flow_avx512 *flow, uint32_t *fmsk,
41 	uint32_t *rmsk)
42 {
43 	uint32_t i, j, k, m, n;
44 
45 	fmsk[0] ^= rmsk[0];
46 	m = rmsk[0];
47 
48 	k = rte_popcount32(m);
49 	n = flow->total_packets - flow->num_packets;
50 
51 	if (n < k) {
52 		/* reduce mask */
53 		for (i = k - n; i != 0; i--) {
54 			j = sizeof(m) * CHAR_BIT - 1 - rte_clz32(m);
55 			m ^= 1 << j;
56 		}
57 	} else
58 		n = k;
59 
60 	rmsk[0] = m;
61 	fmsk[0] |= rmsk[0];
62 
63 	return n;
64 }
65 
66 /*
67  * Resolve matches for multiple categories (LE 8, use 128b instructions/regs)
68  */
69 static inline void
resolve_mcle8_avx512x1(uint32_t result[],const struct rte_acl_match_results pr[],const uint32_t match[],uint32_t nb_pkt,uint32_t nb_cat,uint32_t nb_trie)70 resolve_mcle8_avx512x1(uint32_t result[],
71 	const struct rte_acl_match_results pr[], const uint32_t match[],
72 	uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
73 {
74 	const int32_t *pri;
75 	const uint32_t *pm, *res;
76 	uint32_t i, j, k, mi, mn;
77 	__mmask8 msk;
78 	xmm_t cp, cr, np, nr;
79 
80 	res = pr->results;
81 	pri = pr->priority;
82 
83 	for (k = 0; k != nb_pkt; k++, result += nb_cat) {
84 
85 		mi = match[k] << ACL_MATCH_LOG;
86 
87 		for (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) {
88 
89 			cr = _mm_loadu_si128((const xmm_t *)(res + mi + j));
90 			cp = _mm_loadu_si128((const xmm_t *)(pri + mi + j));
91 
92 			for (i = 1, pm = match + nb_pkt; i != nb_trie;
93 				i++, pm += nb_pkt) {
94 
95 				mn = j + (pm[k] << ACL_MATCH_LOG);
96 
97 				nr = _mm_loadu_si128((const xmm_t *)(res + mn));
98 				np = _mm_loadu_si128((const xmm_t *)(pri + mn));
99 
100 				msk = _mm_cmpgt_epi32_mask(cp, np);
101 				cr = _mm_mask_mov_epi32(nr, msk, cr);
102 				cp = _mm_mask_mov_epi32(np, msk, cp);
103 			}
104 
105 			_mm_storeu_si128((xmm_t *)(result + j), cr);
106 		}
107 	}
108 }
109 
110 #include "acl_run_avx512x8.h"
111 
112 int
rte_acl_classify_avx512x16(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t num,uint32_t categories)113 rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
114 	uint32_t *results, uint32_t num, uint32_t categories)
115 {
116 	const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;
117 
118 	/* split huge lookup (gt 256) into series of fixed size ones */
119 	while (num > max_iter) {
120 		search_avx512x8x2(ctx, data, results, max_iter, categories);
121 		data += max_iter;
122 		results += max_iter * categories;
123 		num -= max_iter;
124 	}
125 
126 	/* select classify method based on number of remaining requests */
127 	if (num >= MAX_SEARCHES_AVX16)
128 		return search_avx512x8x2(ctx, data, results, num, categories);
129 	if (num >= MAX_SEARCHES_SSE8)
130 		return search_sse_8(ctx, data, results, num, categories);
131 	if (num >= MAX_SEARCHES_SSE4)
132 		return search_sse_4(ctx, data, results, num, categories);
133 
134 	return rte_acl_classify_scalar(ctx, data, results, num, categories);
135 }
136 
137 #include "acl_run_avx512x16.h"
138 
139 int
rte_acl_classify_avx512x32(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t num,uint32_t categories)140 rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data,
141 	uint32_t *results, uint32_t num, uint32_t categories)
142 {
143 	const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;
144 
145 	/* split huge lookup (gt 256) into series of fixed size ones */
146 	while (num > max_iter) {
147 		search_avx512x16x2(ctx, data, results, max_iter, categories);
148 		data += max_iter;
149 		results += max_iter * categories;
150 		num -= max_iter;
151 	}
152 
153 	/* select classify method based on number of remaining requests */
154 	if (num >= 2 * MAX_SEARCHES_AVX16)
155 		return search_avx512x16x2(ctx, data, results, num, categories);
156 	if (num >= MAX_SEARCHES_AVX16)
157 		return search_avx512x8x2(ctx, data, results, num, categories);
158 	if (num >= MAX_SEARCHES_SSE8)
159 		return search_sse_8(ctx, data, results, num, categories);
160 	if (num >= MAX_SEARCHES_SSE4)
161 		return search_sse_4(ctx, data, results, num, categories);
162 
163 	return rte_acl_classify_scalar(ctx, data, results, num, categories);
164 }
165