1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2020 Intel Corporation 3 */ 4 5 #include "acl_run_sse.h" 6 7 /*sizeof(uint32_t) << ACL_MATCH_LOG == sizeof(struct rte_acl_match_results)*/ 8 #define ACL_MATCH_LOG 5 9 10 struct acl_flow_avx512 { 11 uint32_t num_packets; /* number of packets processed */ 12 uint32_t total_packets; /* max number of packets to process */ 13 uint32_t root_index; /* current root index */ 14 uint32_t first_load_sz; /* first load size for new packet */ 15 const uint64_t *trans; /* transition table */ 16 const uint32_t *data_index; /* input data indexes */ 17 const uint8_t **idata; /* input data */ 18 uint32_t *matches; /* match indexes */ 19 }; 20 21 static inline void 22 acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx, 23 uint32_t trie, const uint8_t *data[], uint32_t *matches, 24 uint32_t total_packets) 25 { 26 flow->num_packets = 0; 27 flow->total_packets = total_packets; 28 flow->first_load_sz = ctx->first_load_sz; 29 flow->root_index = ctx->trie[trie].root_index; 30 flow->trans = ctx->trans_table; 31 flow->data_index = ctx->trie[trie].data_index; 32 flow->idata = data; 33 flow->matches = matches; 34 } 35 36 /* 37 * Update flow and result masks based on the number of unprocessed flows. 38 */ 39 static inline uint32_t 40 update_flow_mask(const struct acl_flow_avx512 *flow, uint32_t *fmsk, 41 uint32_t *rmsk) 42 { 43 uint32_t i, j, k, m, n; 44 45 fmsk[0] ^= rmsk[0]; 46 m = rmsk[0]; 47 48 k = __builtin_popcount(m); 49 n = flow->total_packets - flow->num_packets; 50 51 if (n < k) { 52 /* reduce mask */ 53 for (i = k - n; i != 0; i--) { 54 j = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m); 55 m ^= 1 << j; 56 } 57 } else 58 n = k; 59 60 rmsk[0] = m; 61 fmsk[0] |= rmsk[0]; 62 63 return n; 64 } 65 66 /* 67 * Resolve matches for multiple categories (LE 8, use 128b instuctions/regs) 68 */ 69 static inline void 70 resolve_mcle8_avx512x1(uint32_t result[], 71 const struct rte_acl_match_results pr[], const uint32_t match[], 72 uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie) 73 { 74 const int32_t *pri; 75 const uint32_t *pm, *res; 76 uint32_t i, j, k, mi, mn; 77 __mmask8 msk; 78 xmm_t cp, cr, np, nr; 79 80 res = pr->results; 81 pri = pr->priority; 82 83 for (k = 0; k != nb_pkt; k++, result += nb_cat) { 84 85 mi = match[k] << ACL_MATCH_LOG; 86 87 for (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) { 88 89 cr = _mm_loadu_si128((const xmm_t *)(res + mi + j)); 90 cp = _mm_loadu_si128((const xmm_t *)(pri + mi + j)); 91 92 for (i = 1, pm = match + nb_pkt; i != nb_trie; 93 i++, pm += nb_pkt) { 94 95 mn = j + (pm[k] << ACL_MATCH_LOG); 96 97 nr = _mm_loadu_si128((const xmm_t *)(res + mn)); 98 np = _mm_loadu_si128((const xmm_t *)(pri + mn)); 99 100 msk = _mm_cmpgt_epi32_mask(cp, np); 101 cr = _mm_mask_mov_epi32(nr, msk, cr); 102 cp = _mm_mask_mov_epi32(np, msk, cp); 103 } 104 105 _mm_storeu_si128((xmm_t *)(result + j), cr); 106 } 107 } 108 } 109 110 #include "acl_run_avx512x8.h" 111 112 int 113 rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data, 114 uint32_t *results, uint32_t num, uint32_t categories) 115 { 116 const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16; 117 118 /* split huge lookup (gt 256) into series of fixed size ones */ 119 while (num > max_iter) { 120 search_avx512x8x2(ctx, data, results, max_iter, categories); 121 data += max_iter; 122 results += max_iter * categories; 123 num -= max_iter; 124 } 125 126 /* select classify method based on number of remaining requests */ 127 if (num >= MAX_SEARCHES_AVX16) 128 return search_avx512x8x2(ctx, data, results, num, categories); 129 if (num >= MAX_SEARCHES_SSE8) 130 return search_sse_8(ctx, data, results, num, categories); 131 if (num >= MAX_SEARCHES_SSE4) 132 return search_sse_4(ctx, data, results, num, categories); 133 134 return rte_acl_classify_scalar(ctx, data, results, num, categories); 135 } 136 137 #include "acl_run_avx512x16.h" 138 139 int 140 rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data, 141 uint32_t *results, uint32_t num, uint32_t categories) 142 { 143 const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16; 144 145 /* split huge lookup (gt 256) into series of fixed size ones */ 146 while (num > max_iter) { 147 search_avx512x16x2(ctx, data, results, max_iter, categories); 148 data += max_iter; 149 results += max_iter * categories; 150 num -= max_iter; 151 } 152 153 /* select classify method based on number of remaining requests */ 154 if (num >= 2 * MAX_SEARCHES_AVX16) 155 return search_avx512x16x2(ctx, data, results, num, categories); 156 if (num >= MAX_SEARCHES_AVX16) 157 return search_avx512x8x2(ctx, data, results, num, categories); 158 if (num >= MAX_SEARCHES_SSE8) 159 return search_sse_8(ctx, data, results, num, categories); 160 if (num >= MAX_SEARCHES_SSE4) 161 return search_sse_4(ctx, data, results, num, categories); 162 163 return rte_acl_classify_scalar(ctx, data, results, num, categories); 164 } 165