/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */
499a2dd95SBruce Richardson
599a2dd95SBruce Richardson #include "acl_run_sse.h"
699a2dd95SBruce Richardson
/*
 * Shift that converts a match index into an offset counted in 32-bit words:
 * sizeof(uint32_t) << ACL_MATCH_LOG == sizeof(struct rte_acl_match_results)
 */
#define ACL_MATCH_LOG	5
999a2dd95SBruce Richardson
/*
 * Per-lookup state shared by the AVX512 search routines.
 * Filled in by acl_set_flow_avx512().
 */
struct acl_flow_avx512 {
	uint32_t num_packets;       /* number of packets processed */
	uint32_t total_packets;     /* max number of packets to process */
	uint32_t root_index;        /* current root index */
	uint32_t first_load_sz;     /* first load size for new packet */
	const uint64_t *trans;      /* transition table */
	const uint32_t *data_index; /* input data indexes */
	const uint8_t **idata;      /* input data */
	uint32_t *matches;          /* match indexes */
};
2099a2dd95SBruce Richardson
2199a2dd95SBruce Richardson static inline void
acl_set_flow_avx512(struct acl_flow_avx512 * flow,const struct rte_acl_ctx * ctx,uint32_t trie,const uint8_t * data[],uint32_t * matches,uint32_t total_packets)2299a2dd95SBruce Richardson acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx,
2399a2dd95SBruce Richardson uint32_t trie, const uint8_t *data[], uint32_t *matches,
2499a2dd95SBruce Richardson uint32_t total_packets)
2599a2dd95SBruce Richardson {
2699a2dd95SBruce Richardson flow->num_packets = 0;
2799a2dd95SBruce Richardson flow->total_packets = total_packets;
2899a2dd95SBruce Richardson flow->first_load_sz = ctx->first_load_sz;
2999a2dd95SBruce Richardson flow->root_index = ctx->trie[trie].root_index;
3099a2dd95SBruce Richardson flow->trans = ctx->trans_table;
3199a2dd95SBruce Richardson flow->data_index = ctx->trie[trie].data_index;
3299a2dd95SBruce Richardson flow->idata = data;
3399a2dd95SBruce Richardson flow->matches = matches;
3499a2dd95SBruce Richardson }
3599a2dd95SBruce Richardson
3699a2dd95SBruce Richardson /*
3799a2dd95SBruce Richardson * Update flow and result masks based on the number of unprocessed flows.
3899a2dd95SBruce Richardson */
3999a2dd95SBruce Richardson static inline uint32_t
update_flow_mask(const struct acl_flow_avx512 * flow,uint32_t * fmsk,uint32_t * rmsk)4099a2dd95SBruce Richardson update_flow_mask(const struct acl_flow_avx512 *flow, uint32_t *fmsk,
4199a2dd95SBruce Richardson uint32_t *rmsk)
4299a2dd95SBruce Richardson {
4399a2dd95SBruce Richardson uint32_t i, j, k, m, n;
4499a2dd95SBruce Richardson
4599a2dd95SBruce Richardson fmsk[0] ^= rmsk[0];
4699a2dd95SBruce Richardson m = rmsk[0];
4799a2dd95SBruce Richardson
48*3d4e27fdSDavid Marchand k = rte_popcount32(m);
4999a2dd95SBruce Richardson n = flow->total_packets - flow->num_packets;
5099a2dd95SBruce Richardson
5199a2dd95SBruce Richardson if (n < k) {
5299a2dd95SBruce Richardson /* reduce mask */
5399a2dd95SBruce Richardson for (i = k - n; i != 0; i--) {
54*3d4e27fdSDavid Marchand j = sizeof(m) * CHAR_BIT - 1 - rte_clz32(m);
5599a2dd95SBruce Richardson m ^= 1 << j;
5699a2dd95SBruce Richardson }
5799a2dd95SBruce Richardson } else
5899a2dd95SBruce Richardson n = k;
5999a2dd95SBruce Richardson
6099a2dd95SBruce Richardson rmsk[0] = m;
6199a2dd95SBruce Richardson fmsk[0] |= rmsk[0];
6299a2dd95SBruce Richardson
6399a2dd95SBruce Richardson return n;
6499a2dd95SBruce Richardson }
6599a2dd95SBruce Richardson
6699a2dd95SBruce Richardson /*
677be78d02SJosh Soref * Resolve matches for multiple categories (LE 8, use 128b instructions/regs)
6899a2dd95SBruce Richardson */
6999a2dd95SBruce Richardson static inline void
resolve_mcle8_avx512x1(uint32_t result[],const struct rte_acl_match_results pr[],const uint32_t match[],uint32_t nb_pkt,uint32_t nb_cat,uint32_t nb_trie)7099a2dd95SBruce Richardson resolve_mcle8_avx512x1(uint32_t result[],
7199a2dd95SBruce Richardson const struct rte_acl_match_results pr[], const uint32_t match[],
7299a2dd95SBruce Richardson uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
7399a2dd95SBruce Richardson {
7499a2dd95SBruce Richardson const int32_t *pri;
7599a2dd95SBruce Richardson const uint32_t *pm, *res;
7699a2dd95SBruce Richardson uint32_t i, j, k, mi, mn;
7799a2dd95SBruce Richardson __mmask8 msk;
7899a2dd95SBruce Richardson xmm_t cp, cr, np, nr;
7999a2dd95SBruce Richardson
8099a2dd95SBruce Richardson res = pr->results;
8199a2dd95SBruce Richardson pri = pr->priority;
8299a2dd95SBruce Richardson
8399a2dd95SBruce Richardson for (k = 0; k != nb_pkt; k++, result += nb_cat) {
8499a2dd95SBruce Richardson
85b3b36f0fSKonstantin Ananyev mi = match[k] << ACL_MATCH_LOG;
8699a2dd95SBruce Richardson
8799a2dd95SBruce Richardson for (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) {
8899a2dd95SBruce Richardson
8999a2dd95SBruce Richardson cr = _mm_loadu_si128((const xmm_t *)(res + mi + j));
9099a2dd95SBruce Richardson cp = _mm_loadu_si128((const xmm_t *)(pri + mi + j));
9199a2dd95SBruce Richardson
9299a2dd95SBruce Richardson for (i = 1, pm = match + nb_pkt; i != nb_trie;
9399a2dd95SBruce Richardson i++, pm += nb_pkt) {
9499a2dd95SBruce Richardson
95b3b36f0fSKonstantin Ananyev mn = j + (pm[k] << ACL_MATCH_LOG);
9699a2dd95SBruce Richardson
9799a2dd95SBruce Richardson nr = _mm_loadu_si128((const xmm_t *)(res + mn));
9899a2dd95SBruce Richardson np = _mm_loadu_si128((const xmm_t *)(pri + mn));
9999a2dd95SBruce Richardson
10099a2dd95SBruce Richardson msk = _mm_cmpgt_epi32_mask(cp, np);
10199a2dd95SBruce Richardson cr = _mm_mask_mov_epi32(nr, msk, cr);
10299a2dd95SBruce Richardson cp = _mm_mask_mov_epi32(np, msk, cp);
10399a2dd95SBruce Richardson }
10499a2dd95SBruce Richardson
10599a2dd95SBruce Richardson _mm_storeu_si128((xmm_t *)(result + j), cr);
10699a2dd95SBruce Richardson }
10799a2dd95SBruce Richardson }
10899a2dd95SBruce Richardson }
10999a2dd95SBruce Richardson
11099a2dd95SBruce Richardson #include "acl_run_avx512x8.h"
11199a2dd95SBruce Richardson
11299a2dd95SBruce Richardson int
rte_acl_classify_avx512x16(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t num,uint32_t categories)11399a2dd95SBruce Richardson rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
11499a2dd95SBruce Richardson uint32_t *results, uint32_t num, uint32_t categories)
11599a2dd95SBruce Richardson {
11699a2dd95SBruce Richardson const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;
11799a2dd95SBruce Richardson
11899a2dd95SBruce Richardson /* split huge lookup (gt 256) into series of fixed size ones */
11999a2dd95SBruce Richardson while (num > max_iter) {
12099a2dd95SBruce Richardson search_avx512x8x2(ctx, data, results, max_iter, categories);
12199a2dd95SBruce Richardson data += max_iter;
12299a2dd95SBruce Richardson results += max_iter * categories;
12399a2dd95SBruce Richardson num -= max_iter;
12499a2dd95SBruce Richardson }
12599a2dd95SBruce Richardson
12699a2dd95SBruce Richardson /* select classify method based on number of remaining requests */
12799a2dd95SBruce Richardson if (num >= MAX_SEARCHES_AVX16)
12899a2dd95SBruce Richardson return search_avx512x8x2(ctx, data, results, num, categories);
12999a2dd95SBruce Richardson if (num >= MAX_SEARCHES_SSE8)
13099a2dd95SBruce Richardson return search_sse_8(ctx, data, results, num, categories);
13199a2dd95SBruce Richardson if (num >= MAX_SEARCHES_SSE4)
13299a2dd95SBruce Richardson return search_sse_4(ctx, data, results, num, categories);
13399a2dd95SBruce Richardson
13499a2dd95SBruce Richardson return rte_acl_classify_scalar(ctx, data, results, num, categories);
13599a2dd95SBruce Richardson }
13699a2dd95SBruce Richardson
13799a2dd95SBruce Richardson #include "acl_run_avx512x16.h"
13899a2dd95SBruce Richardson
13999a2dd95SBruce Richardson int
rte_acl_classify_avx512x32(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t num,uint32_t categories)14099a2dd95SBruce Richardson rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data,
14199a2dd95SBruce Richardson uint32_t *results, uint32_t num, uint32_t categories)
14299a2dd95SBruce Richardson {
14399a2dd95SBruce Richardson const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;
14499a2dd95SBruce Richardson
14599a2dd95SBruce Richardson /* split huge lookup (gt 256) into series of fixed size ones */
14699a2dd95SBruce Richardson while (num > max_iter) {
14799a2dd95SBruce Richardson search_avx512x16x2(ctx, data, results, max_iter, categories);
14899a2dd95SBruce Richardson data += max_iter;
14999a2dd95SBruce Richardson results += max_iter * categories;
15099a2dd95SBruce Richardson num -= max_iter;
15199a2dd95SBruce Richardson }
15299a2dd95SBruce Richardson
15399a2dd95SBruce Richardson /* select classify method based on number of remaining requests */
15499a2dd95SBruce Richardson if (num >= 2 * MAX_SEARCHES_AVX16)
15599a2dd95SBruce Richardson return search_avx512x16x2(ctx, data, results, num, categories);
15699a2dd95SBruce Richardson if (num >= MAX_SEARCHES_AVX16)
15799a2dd95SBruce Richardson return search_avx512x8x2(ctx, data, results, num, categories);
15899a2dd95SBruce Richardson if (num >= MAX_SEARCHES_SSE8)
15999a2dd95SBruce Richardson return search_sse_8(ctx, data, results, num, categories);
16099a2dd95SBruce Richardson if (num >= MAX_SEARCHES_SSE4)
16199a2dd95SBruce Richardson return search_sse_4(ctx, data, results, num, categories);
16299a2dd95SBruce Richardson
16399a2dd95SBruce Richardson return rte_acl_classify_scalar(ctx, data, results, num, categories);
16499a2dd95SBruce Richardson }
165