1 /* 2 * SPDX-License-Identifier: BSD-3-Clause 3 * Copyright (C) IBM Corporation 2016. 4 */ 5 6 #include "acl_run.h" 7 #include "acl_vect.h" 8 9 struct _altivec_acl_const { 10 rte_xmm_t xmm_shuffle_input; 11 rte_xmm_t xmm_index_mask; 12 rte_xmm_t xmm_ones_16; 13 rte_xmm_t range_base; 14 } altivec_acl_const __rte_cache_aligned = { 15 { 16 .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c} 17 }, 18 { 19 .u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, 20 RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX} 21 }, 22 { 23 .u16 = {1, 1, 1, 1, 1, 1, 1, 1} 24 }, 25 { 26 .u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c} 27 }, 28 }; 29 30 /* 31 * Resolve priority for multiple results (altivec version). 32 * This consists comparing the priority of the current traversal with the 33 * running set of results for the packet. 34 * For each result, keep a running array of the result (rule number) and 35 * its priority for each category. 36 */ 37 static inline void 38 resolve_priority_altivec(uint64_t transition, int n, 39 const struct rte_acl_ctx *ctx, struct parms *parms, 40 const struct rte_acl_match_results *p, uint32_t categories) 41 { 42 uint32_t x; 43 xmm_t results, priority, results1, priority1; 44 vector bool int selector; 45 xmm_t *saved_results, *saved_priority; 46 47 for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { 48 49 saved_results = (xmm_t *)(&parms[n].cmplt->results[x]); 50 saved_priority = 51 (xmm_t *)(&parms[n].cmplt->priority[x]); 52 53 /* get results and priorities for completed trie */ 54 results = *(const xmm_t *)&p[transition].results[x]; 55 priority = *(const xmm_t *)&p[transition].priority[x]; 56 57 /* if this is not the first completed trie */ 58 if (parms[n].cmplt->count != ctx->num_tries) { 59 60 /* get running best results and their priorities */ 61 results1 = *saved_results; 62 priority1 = *saved_priority; 63 64 /* select results that are highest priority */ 65 selector = vec_cmpgt(priority1, priority); 66 results = vec_sel(results, results1, selector); 67 priority = vec_sel(priority, priority1, 68 selector); 69 } 70 71 /* save running best results and their priorities */ 72 *saved_results = results; 73 *saved_priority = priority; 74 } 75 } 76 77 /* 78 * Check for any match in 4 transitions 79 */ 80 static __rte_always_inline uint32_t 81 check_any_match_x4(uint64_t val[]) 82 { 83 return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; 84 } 85 86 static __rte_always_inline void 87 acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, 88 struct acl_flow_data *flows, uint64_t transitions[]) 89 { 90 while (check_any_match_x4(transitions)) { 91 transitions[0] = acl_match_check(transitions[0], slot, ctx, 92 parms, flows, resolve_priority_altivec); 93 transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, 94 parms, flows, resolve_priority_altivec); 95 transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, 96 parms, flows, resolve_priority_altivec); 97 transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, 98 parms, flows, resolve_priority_altivec); 99 } 100 } 101 102 /* 103 * Process 4 transitions (in 2 XMM registers) in parallel 104 */ 105 static inline __attribute__((optimize("O2"))) xmm_t 106 transition4(xmm_t next_input, const uint64_t *trans, 107 xmm_t *indices1, xmm_t *indices2) 108 { 109 xmm_t addr, tr_lo, tr_hi; 110 xmm_t in, node_type, r, t; 111 xmm_t dfa_ofs, quad_ofs; 112 xmm_t *index_mask, *tp; 113 vector bool int dfa_msk; 114 vector signed char zeroes = {}; 115 union { 116 uint64_t d64[2]; 117 uint32_t d32[4]; 118 } v; 119 120 /* Move low 32 into tr_lo and high 32 into tr_hi */ 121 tr_lo = (xmm_t){(*indices1)[0], (*indices1)[2], 122 (*indices2)[0], (*indices2)[2]}; 123 tr_hi = (xmm_t){(*indices1)[1], (*indices1)[3], 124 (*indices2)[1], (*indices2)[3]}; 125 126 /* Calculate the address (array index) for all 4 transitions. */ 127 index_mask = (xmm_t *)&altivec_acl_const.xmm_index_mask.u32; 128 t = vec_xor(*index_mask, *index_mask); 129 in = vec_perm(next_input, (xmm_t){}, 130 *(vector unsigned char *)&altivec_acl_const.xmm_shuffle_input); 131 132 /* Calc node type and node addr */ 133 node_type = vec_and(vec_nor(*index_mask, *index_mask), tr_lo); 134 addr = vec_and(tr_lo, *index_mask); 135 136 /* mask for DFA type(0) nodes */ 137 dfa_msk = vec_cmpeq(node_type, t); 138 139 /* DFA calculations. */ 140 r = vec_sr(in, (vector unsigned int){30, 30, 30, 30}); 141 tp = (xmm_t *)&altivec_acl_const.range_base.u32; 142 r = vec_add(r, *tp); 143 t = vec_sr(in, (vector unsigned int){24, 24, 24, 24}); 144 r = vec_perm(tr_hi, (xmm_t){(uint16_t)0 << 16}, 145 (vector unsigned char)r); 146 147 dfa_ofs = vec_sub(t, r); 148 149 /* QUAD/SINGLE caluclations. */ 150 t = (xmm_t)vec_cmpgt((vector signed char)in, (vector signed char)tr_hi); 151 t = (xmm_t)vec_sel( 152 vec_sel( 153 (vector signed char)vec_sub( 154 zeroes, (vector signed char)t), 155 (vector signed char)t, 156 vec_cmpgt((vector signed char)t, zeroes)), 157 zeroes, 158 vec_cmpeq((vector signed char)t, zeroes)); 159 160 t = (xmm_t)vec_msum((vector signed char)t, 161 (vector unsigned char)t, (xmm_t){}); 162 quad_ofs = (xmm_t)vec_msum((vector signed short)t, 163 *(vector signed short *)&altivec_acl_const.xmm_ones_16.u16, 164 (xmm_t){}); 165 166 /* blend DFA and QUAD/SINGLE. */ 167 t = vec_sel(quad_ofs, dfa_ofs, dfa_msk); 168 169 /* calculate address for next transitions. */ 170 addr = vec_add(addr, t); 171 172 v.d64[0] = (uint64_t)trans[addr[0]]; 173 v.d64[1] = (uint64_t)trans[addr[1]]; 174 *indices1 = (xmm_t){v.d32[0], v.d32[1], v.d32[2], v.d32[3]}; 175 v.d64[0] = (uint64_t)trans[addr[2]]; 176 v.d64[1] = (uint64_t)trans[addr[3]]; 177 *indices2 = (xmm_t){v.d32[0], v.d32[1], v.d32[2], v.d32[3]}; 178 179 return vec_sr(next_input, 180 (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT}); 181 } 182 183 /* 184 * Execute trie traversal with 8 traversals in parallel 185 */ 186 static inline int 187 search_altivec_8(const struct rte_acl_ctx *ctx, const uint8_t **data, 188 uint32_t *results, uint32_t total_packets, uint32_t categories) 189 { 190 int n; 191 struct acl_flow_data flows; 192 uint64_t index_array[MAX_SEARCHES_ALTIVEC8]; 193 struct completion cmplt[MAX_SEARCHES_ALTIVEC8]; 194 struct parms parms[MAX_SEARCHES_ALTIVEC8]; 195 xmm_t input0, input1; 196 197 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 198 total_packets, categories, ctx->trans_table); 199 200 for (n = 0; n < MAX_SEARCHES_ALTIVEC8; n++) { 201 cmplt[n].count = 0; 202 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 203 } 204 205 /* Check for any matches. */ 206 acl_match_check_x4(0, ctx, parms, &flows, (uint64_t *)&index_array[0]); 207 acl_match_check_x4(4, ctx, parms, &flows, (uint64_t *)&index_array[4]); 208 209 while (flows.started > 0) { 210 211 /* Gather 4 bytes of input data for each stream. */ 212 input0 = (xmm_t){GET_NEXT_4BYTES(parms, 0), 213 GET_NEXT_4BYTES(parms, 1), 214 GET_NEXT_4BYTES(parms, 2), 215 GET_NEXT_4BYTES(parms, 3)}; 216 217 input1 = (xmm_t){GET_NEXT_4BYTES(parms, 4), 218 GET_NEXT_4BYTES(parms, 5), 219 GET_NEXT_4BYTES(parms, 6), 220 GET_NEXT_4BYTES(parms, 7)}; 221 222 /* Process the 4 bytes of input on each stream. */ 223 224 input0 = transition4(input0, flows.trans, 225 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 226 input1 = transition4(input1, flows.trans, 227 (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); 228 229 input0 = transition4(input0, flows.trans, 230 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 231 input1 = transition4(input1, flows.trans, 232 (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); 233 234 input0 = transition4(input0, flows.trans, 235 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 236 input1 = transition4(input1, flows.trans, 237 (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); 238 239 input0 = transition4(input0, flows.trans, 240 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 241 input1 = transition4(input1, flows.trans, 242 (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); 243 244 /* Check for any matches. */ 245 acl_match_check_x4(0, ctx, parms, &flows, 246 (uint64_t *)&index_array[0]); 247 acl_match_check_x4(4, ctx, parms, &flows, 248 (uint64_t *)&index_array[4]); 249 } 250 251 return 0; 252 } 253 254 /* 255 * Execute trie traversal with 4 traversals in parallel 256 */ 257 static inline int 258 search_altivec_4(const struct rte_acl_ctx *ctx, const uint8_t **data, 259 uint32_t *results, int total_packets, uint32_t categories) 260 { 261 int n; 262 struct acl_flow_data flows; 263 uint64_t index_array[MAX_SEARCHES_ALTIVEC4]; 264 struct completion cmplt[MAX_SEARCHES_ALTIVEC4]; 265 struct parms parms[MAX_SEARCHES_ALTIVEC4]; 266 xmm_t input; 267 268 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 269 total_packets, categories, ctx->trans_table); 270 271 for (n = 0; n < MAX_SEARCHES_ALTIVEC4; n++) { 272 cmplt[n].count = 0; 273 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 274 } 275 276 /* Check for any matches. */ 277 acl_match_check_x4(0, ctx, parms, &flows, index_array); 278 279 while (flows.started > 0) { 280 281 /* Gather 4 bytes of input data for each stream. */ 282 input = (xmm_t){GET_NEXT_4BYTES(parms, 0), 283 GET_NEXT_4BYTES(parms, 1), 284 GET_NEXT_4BYTES(parms, 2), 285 GET_NEXT_4BYTES(parms, 3)}; 286 287 /* Process the 4 bytes of input on each stream. */ 288 input = transition4(input, flows.trans, 289 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 290 input = transition4(input, flows.trans, 291 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 292 input = transition4(input, flows.trans, 293 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 294 input = transition4(input, flows.trans, 295 (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); 296 297 /* Check for any matches. */ 298 acl_match_check_x4(0, ctx, parms, &flows, index_array); 299 } 300 301 return 0; 302 } 303