1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2015 Cavium, Inc 3 */ 4 5 #include "acl_run.h" 6 #include "acl_vect.h" 7 8 struct _neon_acl_const { 9 rte_xmm_t xmm_shuffle_input; 10 rte_xmm_t xmm_index_mask; 11 rte_xmm_t range_base; 12 } neon_acl_const __rte_cache_aligned = { 13 { 14 .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c} 15 }, 16 { 17 .u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, 18 RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX} 19 }, 20 { 21 .u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c} 22 }, 23 }; 24 25 /* 26 * Resolve priority for multiple results (neon version). 27 * This consists comparing the priority of the current traversal with the 28 * running set of results for the packet. 29 * For each result, keep a running array of the result (rule number) and 30 * its priority for each category. 31 */ 32 static inline void 33 resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx, 34 struct parms *parms, 35 const struct rte_acl_match_results *p, 36 uint32_t categories) 37 { 38 uint32_t x; 39 int32x4_t results, priority, results1, priority1; 40 uint32x4_t selector; 41 int32_t *saved_results, *saved_priority; 42 43 for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { 44 saved_results = (int32_t *)(&parms[n].cmplt->results[x]); 45 saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]); 46 47 /* get results and priorities for completed trie */ 48 results = vld1q_s32( 49 (const int32_t *)&p[transition].results[x]); 50 priority = vld1q_s32( 51 (const int32_t *)&p[transition].priority[x]); 52 53 /* if this is not the first completed trie */ 54 if (parms[n].cmplt->count != ctx->num_tries) { 55 /* get running best results and their priorities */ 56 results1 = vld1q_s32(saved_results); 57 priority1 = vld1q_s32(saved_priority); 58 59 /* select results that are highest priority */ 60 selector = vcgtq_s32(priority1, priority); 61 results = vbslq_s32(selector, results1, results); 62 priority = vbslq_s32(selector, priority1, priority); 63 } 64 65 /* save running best results and their priorities */ 66 vst1q_s32(saved_results, results); 67 vst1q_s32(saved_priority, priority); 68 } 69 } 70 71 /* 72 * Check for any match in 4 transitions 73 */ 74 static __rte_always_inline uint32_t 75 check_any_match_x4(uint64_t val[]) 76 { 77 return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; 78 } 79 80 static __rte_always_inline void 81 acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, 82 struct acl_flow_data *flows, uint64_t transitions[]) 83 { 84 while (check_any_match_x4(transitions)) { 85 transitions[0] = acl_match_check(transitions[0], slot, ctx, 86 parms, flows, resolve_priority_neon); 87 transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, 88 parms, flows, resolve_priority_neon); 89 transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, 90 parms, flows, resolve_priority_neon); 91 transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, 92 parms, flows, resolve_priority_neon); 93 } 94 } 95 96 /* 97 * Process 4 transitions (in 2 NEON Q registers) in parallel 98 */ 99 static __rte_always_inline int32x4_t 100 transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[]) 101 { 102 int32x4x2_t tr_hi_lo; 103 int32x4_t t, in, r; 104 uint32x4_t index_msk, node_type, addr; 105 uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs; 106 107 /* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */ 108 tr_hi_lo = vld2q_s32((const int32_t *)transitions); 109 110 /* Calculate the address (array index) for all 4 transitions. */ 111 112 index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask); 113 114 /* Calc node type and node addr */ 115 node_type = vbicq_s32(tr_hi_lo.val[0], index_msk); 116 addr = vandq_s32(tr_hi_lo.val[0], index_msk); 117 118 /* t = 0 */ 119 t = veorq_s32(node_type, node_type); 120 121 /* mask for DFA type(0) nodes */ 122 dfa_msk = vceqq_u32(node_type, t); 123 124 mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input); 125 in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask); 126 127 /* DFA calculations. */ 128 r = vshrq_n_u32(in, 30); /* div by 64 */ 129 mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base); 130 r = vaddq_u8(r, mask); 131 t = vshrq_n_u32(in, 24); 132 r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r); 133 dfa_ofs = vsubq_s32(t, r); 134 135 /* QUAD/SINGLE calculations. */ 136 t = vcgtq_s8(in, tr_hi_lo.val[1]); 137 t = vabsq_s8(t); 138 t = vpaddlq_u8(t); 139 quad_ofs = vpaddlq_u16(t); 140 141 /* blend DFA and QUAD/SINGLE. */ 142 t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs); 143 144 /* calculate address for next transitions */ 145 addr = vaddq_u32(addr, t); 146 147 /* Fill next transitions */ 148 transitions[0] = trans[vgetq_lane_u32(addr, 0)]; 149 transitions[1] = trans[vgetq_lane_u32(addr, 1)]; 150 transitions[2] = trans[vgetq_lane_u32(addr, 2)]; 151 transitions[3] = trans[vgetq_lane_u32(addr, 3)]; 152 153 return vshrq_n_u32(next_input, CHAR_BIT); 154 } 155 156 /* 157 * Execute trie traversal with 8 traversals in parallel 158 */ 159 static inline int 160 search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data, 161 uint32_t *results, uint32_t total_packets, uint32_t categories) 162 { 163 int n; 164 struct acl_flow_data flows; 165 uint64_t index_array[8]; 166 struct completion cmplt[8]; 167 struct parms parms[8]; 168 int32x4_t input0, input1; 169 170 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 171 total_packets, categories, ctx->trans_table); 172 173 for (n = 0; n < 8; n++) { 174 cmplt[n].count = 0; 175 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 176 } 177 178 /* Check for any matches. */ 179 acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); 180 acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); 181 182 while (flows.started > 0) { 183 /* Gather 4 bytes of input data for each stream. */ 184 input0 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0)); 185 input1 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 4)); 186 187 input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1); 188 input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1); 189 190 input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2); 191 input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2); 192 193 input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3); 194 input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3); 195 196 /* Process the 4 bytes of input on each stream. */ 197 198 input0 = transition4(input0, flows.trans, &index_array[0]); 199 input1 = transition4(input1, flows.trans, &index_array[4]); 200 201 input0 = transition4(input0, flows.trans, &index_array[0]); 202 input1 = transition4(input1, flows.trans, &index_array[4]); 203 204 input0 = transition4(input0, flows.trans, &index_array[0]); 205 input1 = transition4(input1, flows.trans, &index_array[4]); 206 207 input0 = transition4(input0, flows.trans, &index_array[0]); 208 input1 = transition4(input1, flows.trans, &index_array[4]); 209 210 /* Check for any matches. */ 211 acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); 212 acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); 213 } 214 215 return 0; 216 } 217 218 /* 219 * Execute trie traversal with 4 traversals in parallel 220 */ 221 static inline int 222 search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data, 223 uint32_t *results, int total_packets, uint32_t categories) 224 { 225 int n; 226 struct acl_flow_data flows; 227 uint64_t index_array[4]; 228 struct completion cmplt[4]; 229 struct parms parms[4]; 230 int32x4_t input; 231 232 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 233 total_packets, categories, ctx->trans_table); 234 235 for (n = 0; n < 4; n++) { 236 cmplt[n].count = 0; 237 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 238 } 239 240 /* Check for any matches. */ 241 acl_match_check_x4(0, ctx, parms, &flows, index_array); 242 243 while (flows.started > 0) { 244 /* Gather 4 bytes of input data for each stream. */ 245 input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0)); 246 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1); 247 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2); 248 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3); 249 250 /* Process the 4 bytes of input on each stream. */ 251 input = transition4(input, flows.trans, index_array); 252 input = transition4(input, flows.trans, index_array); 253 input = transition4(input, flows.trans, index_array); 254 input = transition4(input, flows.trans, index_array); 255 256 /* Check for any matches. */ 257 acl_match_check_x4(0, ctx, parms, &flows, index_array); 258 } 259 260 return 0; 261 } 262