199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson * Copyright(c) 2020 Intel Corporation
399a2dd95SBruce Richardson */
499a2dd95SBruce Richardson
/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */
1199a2dd95SBruce Richardson
/*
 * This flavour is built on 256-bit (ymm) registers and intrinsics.
 * The main SIMD type is therefore 256 bits wide, and a single variable
 * of that type handles sizeof(__m256i) / sizeof(uint32_t) == 8 entries
 * in parallel.
 */
#define _T_simd		__m256i
#define _T_mask		__mmask8

/*
 * Naming of static const variables:
 * _SC_(x) is the ymm_-prefixed object itself,
 * _SV_(x) is the '.y' member of that same object.
 */
#define _SC_(x)		ymm_##x
#define _SV_(x)		(ymm_##x.y)

/* Naming of internal functions: append the _avx512x8 suffix. */
#define _F_(x)		x##_avx512x8
/*
 * The same intrinsics are spelled differently depending on the
 * register bit-width, so a few macros are needed to hide that
 * difference.
 */

/* Generic epi (packed integers) type intrinsics. */
#define _M_I_(x)	_mm256_##x

/* si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x)	_mm256_##x##_si256

/* Masked gather type intrinsics. */
#define _M_MGI_(x)	_mm256_m##x

/*
 * Gather type intrinsics.
 * Note that the expansion passes (base, idx, scale), i.e. 'base' and
 * 'idx' are swapped relative to the macro's own parameter order.
 */
#define _M_GI_(name, idx, base, scale)	_mm256_##name(base, idx, scale)
/* Number of transitions held by one SIMD register, and the matching mask. */
#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

/* Flow count is twice the per-register transition count; mask is count - 1. */
#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)

/* Number of pointers held by one SIMD register, and the matching mask. */
#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
5499a2dd95SBruce Richardson
5599a2dd95SBruce Richardson static const rte_ymm_t _SC_(match_mask) = {
5699a2dd95SBruce Richardson .u32 = {
5799a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
5899a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
5999a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6099a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6199a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6299a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6399a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6499a2dd95SBruce Richardson RTE_ACL_NODE_MATCH,
6599a2dd95SBruce Richardson },
6699a2dd95SBruce Richardson };
6799a2dd95SBruce Richardson
6899a2dd95SBruce Richardson static const rte_ymm_t _SC_(index_mask) = {
6999a2dd95SBruce Richardson .u32 = {
7099a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7199a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7299a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7399a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7499a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7599a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7699a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7799a2dd95SBruce Richardson RTE_ACL_NODE_INDEX,
7899a2dd95SBruce Richardson },
7999a2dd95SBruce Richardson };
8099a2dd95SBruce Richardson
8199a2dd95SBruce Richardson static const rte_ymm_t _SC_(trlo_idle) = {
8299a2dd95SBruce Richardson .u32 = {
8399a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8499a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8599a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8699a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8799a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8899a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
8999a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
9099a2dd95SBruce Richardson RTE_ACL_IDLE_NODE,
9199a2dd95SBruce Richardson },
9299a2dd95SBruce Richardson };
9399a2dd95SBruce Richardson
9499a2dd95SBruce Richardson static const rte_ymm_t _SC_(trhi_idle) = {
9599a2dd95SBruce Richardson .u32 = {
9699a2dd95SBruce Richardson 0, 0, 0, 0,
9799a2dd95SBruce Richardson 0, 0, 0, 0,
9899a2dd95SBruce Richardson },
9999a2dd95SBruce Richardson };
10099a2dd95SBruce Richardson
10199a2dd95SBruce Richardson static const rte_ymm_t _SC_(shuffle_input) = {
10299a2dd95SBruce Richardson .u32 = {
10399a2dd95SBruce Richardson 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
10499a2dd95SBruce Richardson 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
10599a2dd95SBruce Richardson },
10699a2dd95SBruce Richardson };
10799a2dd95SBruce Richardson
10899a2dd95SBruce Richardson static const rte_ymm_t _SC_(four_32) = {
10999a2dd95SBruce Richardson .u32 = {
11099a2dd95SBruce Richardson 4, 4, 4, 4,
11199a2dd95SBruce Richardson 4, 4, 4, 4,
11299a2dd95SBruce Richardson },
11399a2dd95SBruce Richardson };
11499a2dd95SBruce Richardson
11599a2dd95SBruce Richardson static const rte_ymm_t _SC_(idx_add) = {
11699a2dd95SBruce Richardson .u32 = {
11799a2dd95SBruce Richardson 0, 1, 2, 3,
11899a2dd95SBruce Richardson 4, 5, 6, 7,
11999a2dd95SBruce Richardson },
12099a2dd95SBruce Richardson };
12199a2dd95SBruce Richardson
12299a2dd95SBruce Richardson static const rte_ymm_t _SC_(range_base) = {
12399a2dd95SBruce Richardson .u32 = {
12499a2dd95SBruce Richardson 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
12599a2dd95SBruce Richardson 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
12699a2dd95SBruce Richardson },
12799a2dd95SBruce Richardson };
12899a2dd95SBruce Richardson
12999a2dd95SBruce Richardson static const rte_ymm_t _SC_(pminp) = {
13099a2dd95SBruce Richardson .u32 = {
13199a2dd95SBruce Richardson 0x00, 0x01, 0x02, 0x03,
13299a2dd95SBruce Richardson 0x08, 0x09, 0x0a, 0x0b,
13399a2dd95SBruce Richardson },
13499a2dd95SBruce Richardson };
13599a2dd95SBruce Richardson
13699a2dd95SBruce Richardson static const __mmask16 _SC_(pmidx_msk) = 0x55;
13799a2dd95SBruce Richardson
13899a2dd95SBruce Richardson static const rte_ymm_t _SC_(pmidx[2]) = {
13999a2dd95SBruce Richardson [0] = {
14099a2dd95SBruce Richardson .u32 = {
14199a2dd95SBruce Richardson 0, 0, 1, 0, 2, 0, 3, 0,
14299a2dd95SBruce Richardson },
14399a2dd95SBruce Richardson },
14499a2dd95SBruce Richardson [1] = {
14599a2dd95SBruce Richardson .u32 = {
14699a2dd95SBruce Richardson 4, 0, 5, 0, 6, 0, 7, 0,
14799a2dd95SBruce Richardson },
14899a2dd95SBruce Richardson },
14999a2dd95SBruce Richardson };
15099a2dd95SBruce Richardson
15199a2dd95SBruce Richardson /*
15299a2dd95SBruce Richardson * unfortunately current AVX512 ISA doesn't provide ability for
15399a2dd95SBruce Richardson * gather load on a byte quantity. So we have to mimic it in SW,
15499a2dd95SBruce Richardson * by doing 4x1B scalar loads.
15599a2dd95SBruce Richardson */
15699a2dd95SBruce Richardson static inline __m128i
_m256_mask_gather_epi8x4(__m256i pdata,__mmask8 mask)15799a2dd95SBruce Richardson _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
15899a2dd95SBruce Richardson {
15999a2dd95SBruce Richardson rte_xmm_t v;
16099a2dd95SBruce Richardson rte_ymm_t p;
16199a2dd95SBruce Richardson
16299a2dd95SBruce Richardson static const uint32_t zero;
16399a2dd95SBruce Richardson
16499a2dd95SBruce Richardson p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
16599a2dd95SBruce Richardson (uintptr_t)&zero);
16699a2dd95SBruce Richardson
16799a2dd95SBruce Richardson v.u32[0] = *(uint8_t *)p.u64[0];
16899a2dd95SBruce Richardson v.u32[1] = *(uint8_t *)p.u64[1];
16999a2dd95SBruce Richardson v.u32[2] = *(uint8_t *)p.u64[2];
17099a2dd95SBruce Richardson v.u32[3] = *(uint8_t *)p.u64[3];
17199a2dd95SBruce Richardson
17299a2dd95SBruce Richardson return v.x;
17399a2dd95SBruce Richardson }
17499a2dd95SBruce Richardson
17599a2dd95SBruce Richardson /*
17699a2dd95SBruce Richardson * Gather 4/1 input bytes for up to 8 (2*8) locations in parallel.
17799a2dd95SBruce Richardson */
17899a2dd95SBruce Richardson static __rte_always_inline __m256i
_F_(gather_bytes)17999a2dd95SBruce Richardson _F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
18099a2dd95SBruce Richardson uint32_t bnum)
18199a2dd95SBruce Richardson {
18299a2dd95SBruce Richardson __m128i inp[2];
18399a2dd95SBruce Richardson
18499a2dd95SBruce Richardson if (bnum == sizeof(uint8_t)) {
18599a2dd95SBruce Richardson inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
18699a2dd95SBruce Richardson inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
18799a2dd95SBruce Richardson } else {
18899a2dd95SBruce Richardson inp[0] = _mm256_mmask_i64gather_epi32(
18999a2dd95SBruce Richardson _mm256_castsi256_si128(zero),
19099a2dd95SBruce Richardson m[0], p[0], NULL, sizeof(uint8_t));
19199a2dd95SBruce Richardson inp[1] = _mm256_mmask_i64gather_epi32(
19299a2dd95SBruce Richardson _mm256_castsi256_si128(zero),
19399a2dd95SBruce Richardson m[1], p[1], NULL, sizeof(uint8_t));
19499a2dd95SBruce Richardson }
19599a2dd95SBruce Richardson
19699a2dd95SBruce Richardson /* squeeze input into one 256-bit register */
19799a2dd95SBruce Richardson return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
19899a2dd95SBruce Richardson _SV_(pminp), _mm256_castsi128_si256(inp[1]));
19999a2dd95SBruce Richardson }
20099a2dd95SBruce Richardson
20199a2dd95SBruce Richardson #include "acl_run_avx512_common.h"
20299a2dd95SBruce Richardson
20399a2dd95SBruce Richardson /*
20499a2dd95SBruce Richardson * Perform search for up to (2 * 8) flows in parallel.
20599a2dd95SBruce Richardson * Use two sets of metadata, each serves 8 flows max.
20699a2dd95SBruce Richardson */
20799a2dd95SBruce Richardson static inline int
search_avx512x8x2(const struct rte_acl_ctx * ctx,const uint8_t ** data,uint32_t * results,uint32_t total_packets,uint32_t categories)20899a2dd95SBruce Richardson search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
20999a2dd95SBruce Richardson uint32_t *results, uint32_t total_packets, uint32_t categories)
21099a2dd95SBruce Richardson {
21199a2dd95SBruce Richardson uint32_t i, *pm;
21299a2dd95SBruce Richardson const struct rte_acl_match_results *pr;
21399a2dd95SBruce Richardson struct acl_flow_avx512 flow;
21499a2dd95SBruce Richardson uint32_t match[ctx->num_tries * total_packets];
21599a2dd95SBruce Richardson
21699a2dd95SBruce Richardson for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
21799a2dd95SBruce Richardson
21899a2dd95SBruce Richardson /* setup for next trie */
21999a2dd95SBruce Richardson acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
22099a2dd95SBruce Richardson
22199a2dd95SBruce Richardson /* process the trie */
22299a2dd95SBruce Richardson _F_(search_trie)(&flow);
22399a2dd95SBruce Richardson }
22499a2dd95SBruce Richardson
22599a2dd95SBruce Richardson /* resolve matches */
22699a2dd95SBruce Richardson pr = (const struct rte_acl_match_results *)
22799a2dd95SBruce Richardson (ctx->trans_table + ctx->match_index);
22899a2dd95SBruce Richardson
22999a2dd95SBruce Richardson if (categories == 1)
23099a2dd95SBruce Richardson _F_(resolve_single_cat)(results, pr, match, total_packets,
23199a2dd95SBruce Richardson ctx->num_tries);
23299a2dd95SBruce Richardson else
23399a2dd95SBruce Richardson resolve_mcle8_avx512x1(results, pr, match, total_packets,
23499a2dd95SBruce Richardson categories, ctx->num_tries);
23599a2dd95SBruce Richardson
23699a2dd95SBruce Richardson return 0;
23799a2dd95SBruce Richardson }
23899a2dd95SBruce Richardson
/*
 * Drop every helper macro defined by this file, so that
 * "acl_run_avx512_common.h" can be included again with another set.
 */
#undef _T_simd
#undef _T_mask
#undef _SC_
#undef _SV_
#undef _F_
#undef _M_I_
#undef _M_SI_
#undef _M_MGI_
#undef _M_GI_
#undef _SIMD_MASK_BIT_
#undef _SIMD_MASK_MAX_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_PTR_MSK_
254