/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#include <stdalign.h>

#include "acl_run.h"
#include "acl_vect.h"

alignas(RTE_CACHE_LINE_SIZE) struct _neon_acl_const {
	rte_xmm_t xmm_shuffle_input;
	rte_xmm_t xmm_index_mask;
	rte_xmm_t range_base;
} neon_acl_const = {
	{
		.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}
	},
	{
		.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX}
	},
	{
		.u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c}
	},
};

/*
 * Resolve priority for multiple results (neon version).
 * This consists of comparing the priority of the current traversal with the
 * running set of results for the packet.
 * For each result, keep a running array of the result (rule number) and
 * its priority for each category.
 */
static inline void
resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
		      struct parms *parms,
		      const struct rte_acl_match_results *p,
		      uint32_t categories)
{
	uint32_t x;
	int32x4_t results, priority, results1, priority1;
	uint32x4_t selector;
	int32_t *saved_results, *saved_priority;

	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
		saved_results = (int32_t *)(&parms[n].cmplt->results[x]);
		saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]);

		/* get results and priorities for completed trie */
		results = vld1q_s32(
			(const int32_t *)&p[transition].results[x]);
		priority = vld1q_s32(
			(const int32_t *)&p[transition].priority[x]);

		/* if this is not the first completed trie */
		if (parms[n].cmplt->count != ctx->num_tries) {
			/* get running best results and their priorities */
			results1 = vld1q_s32(saved_results);
			priority1 = vld1q_s32(saved_priority);

			/* select results that are highest priority */
			selector = vcgtq_s32(priority1, priority);
			results = vbslq_s32(selector, results1, results);
			priority = vbslq_s32(selector, priority1, priority);
		}

		/* save running best results and their priorities */
		vst1q_s32(saved_results, results);
		vst1q_s32(saved_priority, priority);
	}
}

/*
 * Check for any match in 4 transitions
 */
static __rte_always_inline uint32_t
check_any_match_x4(uint64_t val[])
{
	return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH;
}

static __rte_always_inline void
acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
		   struct acl_flow_data *flows, uint64_t transitions[])
{
	while (check_any_match_x4(transitions)) {
		transitions[0] = acl_match_check(transitions[0], slot, ctx,
			parms, flows, resolve_priority_neon);
		transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
			parms, flows, resolve_priority_neon);
		transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
			parms, flows, resolve_priority_neon);
		transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
			parms, flows, resolve_priority_neon);
	}
}
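/*
 * Layout note (inferred from the use of vld2q_s32 and RTE_ACL_NODE_INDEX
 * below): each 64-bit transition keeps the next-node address and node-type
 * bits in its low 32 bits, and the per-node input matching data (DFA range
 * bases or QUAD/SINGLE byte values) in its high 32 bits.
 */
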
/*
 * Process 4 transitions (in 2 NEON Q registers) in parallel
 */
static __rte_always_inline int32x4_t
transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[])
{
	int32x4x2_t tr_hi_lo;
	int32x4_t t, in, r;
	uint32x4_t index_msk, node_type, addr;
	uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs;

	/* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */
	tr_hi_lo = vld2q_s32((const int32_t *)transitions);

	/* Calculate the address (array index) for all 4 transitions. */

	index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask);

	/* Calc node type and node addr */
	node_type = vbicq_s32(tr_hi_lo.val[0], index_msk);
	addr = vandq_s32(tr_hi_lo.val[0], index_msk);

	/* t = 0 */
	t = veorq_s32(node_type, node_type);

	/* mask for DFA type(0) nodes */
	dfa_msk = vceqq_u32(node_type, t);

	mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input);
	in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask);

	/* DFA calculations. */
	r = vshrq_n_u32(in, 30); /* div by 64 */
	mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base);
	r = vaddq_u8(r, mask);
	t = vshrq_n_u32(in, 24);
	r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r);
	dfa_ofs = vsubq_s32(t, r);

	/* QUAD/SINGLE calculations. */
	t = vcgtq_s8(in, tr_hi_lo.val[1]);
	t = vabsq_s8(t);
	t = vpaddlq_u8(t);
	quad_ofs = vpaddlq_u16(t);

	/* blend DFA and QUAD/SINGLE. */
	t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs);

	/* calculate address for next transitions */
	addr = vaddq_u32(addr, t);

	/* Fill next transitions */
	transitions[0] = trans[vgetq_lane_u32(addr, 0)];
	transitions[1] = trans[vgetq_lane_u32(addr, 1)];
	transitions[2] = trans[vgetq_lane_u32(addr, 2)];
	transitions[3] = trans[vgetq_lane_u32(addr, 3)];

	return vshrq_n_u32(next_input, CHAR_BIT);
}

/*
 * Execute trie traversal with 8 traversals in parallel
 */
static inline int
search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
	      uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[8];
	struct completion cmplt[8];
	struct parms parms[8];
	int32x4_t input0, input1;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		     total_packets, categories, ctx->trans_table);

	for (n = 0; n < 8; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
	acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);

	while (flows.started > 0) {
		/* Gather 4 bytes of input data for each stream. */
		input0 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
		input1 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 4));

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);

		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);

		/* Process the 4 bytes of input on each stream. */
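		/*
		 * Each transition4() call advances every stream by one byte:
		 * it looks up the next node for all four flows and returns
		 * the gathered input shifted right by CHAR_BIT, so four
		 * calls consume the 4 bytes loaded above.
		 */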
*/ 199 200 input0 = transition4(input0, flows.trans, &index_array[0]); 201 input1 = transition4(input1, flows.trans, &index_array[4]); 202 203 input0 = transition4(input0, flows.trans, &index_array[0]); 204 input1 = transition4(input1, flows.trans, &index_array[4]); 205 206 input0 = transition4(input0, flows.trans, &index_array[0]); 207 input1 = transition4(input1, flows.trans, &index_array[4]); 208 209 input0 = transition4(input0, flows.trans, &index_array[0]); 210 input1 = transition4(input1, flows.trans, &index_array[4]); 211 212 /* Check for any matches. */ 213 acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); 214 acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); 215 } 216 217 return 0; 218 } 219 220 /* 221 * Execute trie traversal with 4 traversals in parallel 222 */ 223 static inline int 224 search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data, 225 uint32_t *results, int total_packets, uint32_t categories) 226 { 227 int n; 228 struct acl_flow_data flows; 229 uint64_t index_array[4]; 230 struct completion cmplt[4]; 231 struct parms parms[4]; 232 int32x4_t input; 233 234 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 235 total_packets, categories, ctx->trans_table); 236 237 for (n = 0; n < 4; n++) { 238 cmplt[n].count = 0; 239 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 240 } 241 242 /* Check for any matches. */ 243 acl_match_check_x4(0, ctx, parms, &flows, index_array); 244 245 while (flows.started > 0) { 246 /* Gather 4 bytes of input data for each stream. */ 247 input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0)); 248 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1); 249 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2); 250 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3); 251 252 /* Process the 4 bytes of input on each stream. */ 253 input = transition4(input, flows.trans, index_array); 254 input = transition4(input, flows.trans, index_array); 255 input = transition4(input, flows.trans, index_array); 256 input = transition4(input, flows.trans, index_array); 257 258 /* Check for any matches. */ 259 acl_match_check_x4(0, ctx, parms, &flows, index_array); 260 } 261 262 return 0; 263 } 264