/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright(c) 2019-2021 Broadcom All rights reserved. */

#include <inttypes.h>
#include <stdbool.h>

#include <rte_bitmap.h>
#include <rte_byteorder.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_vect.h>

#include "bnxt.h"
#include "bnxt_cpr.h"
#include "bnxt_ring.h"

#include "bnxt_txq.h"
#include "bnxt_txr.h"
#include "bnxt_rxtx_vec_common.h"

/*
 * RX Ring handling
 */

#define GET_OL_FLAGS(rss_flags, ol_idx, errors, pi, ol_flags) \
{ \
	uint32_t tmp, of; \
 \
	of = vgetq_lane_u32((rss_flags), (pi)) | \
		rxr->ol_flags_table[vgetq_lane_u32((ol_idx), (pi))]; \
 \
	tmp = vgetq_lane_u32((errors), (pi)); \
	if (tmp) \
		of |= rxr->ol_flags_err_table[tmp]; \
	(ol_flags) = of; \
}

#define GET_DESC_FIELDS(rxcmp, rxcmp1, shuf_msk, ptype_idx, pkt_idx, ret) \
{ \
	uint32_t ptype; \
	uint16_t vlan_tci; \
	uint32x4_t r; \
 \
	/* Set mbuf pkt_len, data_len, and rss_hash fields. */ \
	r = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(rxcmp), \
					    (shuf_msk))); \
 \
	/* Set packet type. */ \
	ptype = bnxt_ptype_table[vgetq_lane_u32((ptype_idx), (pkt_idx))]; \
	r = vsetq_lane_u32(ptype, r, 0); \
 \
	/* Set vlan_tci. */ \
	vlan_tci = vgetq_lane_u32((rxcmp1), 1); \
	r = vreinterpretq_u32_u16(vsetq_lane_u16(vlan_tci, \
				vreinterpretq_u16_u32(r), 5)); \
	(ret) = r; \
}

static void
descs_to_mbufs(uint32x4_t mm_rxcmp[4], uint32x4_t mm_rxcmp1[4],
	       uint64x2_t mb_init, struct rte_mbuf **mbuf,
	       struct bnxt_rx_ring_info *rxr)
{
	const uint8x16_t shuf_msk = {
		0xFF, 0xFF, 0xFF, 0xFF,    /* pkt_type (zeroes) */
		2, 3, 0xFF, 0xFF,          /* pkt_len */
		2, 3,                      /* data_len */
		0xFF, 0xFF,                /* vlan_tci (zeroes) */
		12, 13, 14, 15             /* rss hash */
	};
	const uint32x4_t flags_type_mask =
		vdupq_n_u32(RX_PKT_CMPL_FLAGS_ITYPE_MASK);
	const uint32x4_t flags2_mask1 =
		vdupq_n_u32(RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |
			    RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC);
	const uint32x4_t flags2_mask2 =
		vdupq_n_u32(RX_PKT_CMPL_FLAGS2_IP_TYPE);
	const uint32x4_t rss_mask =
		vdupq_n_u32(RX_PKT_CMPL_FLAGS_RSS_VALID);
	const uint32x4_t flags2_index_mask = vdupq_n_u32(0x1F);
	const uint32x4_t flags2_error_mask = vdupq_n_u32(0x0F);
	uint32x4_t flags_type, flags2, index, errors, rss_flags;
	uint32x4_t tmp, ptype_idx, is_tunnel;
	uint64x2_t t0, t1;
	uint32_t ol_flags;

	/* Compute packet type table indexes for four packets. */
	t0 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp[0], mm_rxcmp[1]));
	t1 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp[2], mm_rxcmp[3]));

	flags_type = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),
							vget_low_u64(t1)));
	ptype_idx =
		vshrq_n_u32(vandq_u32(flags_type, flags_type_mask), 9);

	t0 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp1[0], mm_rxcmp1[1]));
	t1 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp1[2], mm_rxcmp1[3]));

	flags2 = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),
						    vget_low_u64(t1)));

	ptype_idx = vorrq_u32(ptype_idx,
			vshrq_n_u32(vandq_u32(flags2, flags2_mask1), 2));
	ptype_idx = vorrq_u32(ptype_idx,
			vshrq_n_u32(vandq_u32(flags2, flags2_mask2), 7));

	/* Extract RSS valid flags for four packets. */
	rss_flags = vshrq_n_u32(vandq_u32(flags_type, rss_mask), 9);

	flags2 = vandq_u32(flags2, flags2_index_mask);

	/* Extract errors_v2 fields for four packets. */
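	/*
	 * Note: vzip2q_u32() interleaves the upper halves of each descriptor
	 * pair, so the low 64 bits of t0/t1 below gather the third 32-bit
	 * word of all four high completions, which is where the errors_v2
	 * bits used further down are taken from.
	 */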
	t0 = vreinterpretq_u64_u32(vzip2q_u32(mm_rxcmp1[0], mm_rxcmp1[1]));
	t1 = vreinterpretq_u64_u32(vzip2q_u32(mm_rxcmp1[2], mm_rxcmp1[3]));

	errors = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),
						    vget_low_u64(t1)));

	/* Compute ol_flags and checksum error indexes for four packets. */
	is_tunnel = vandq_u32(flags2, vdupq_n_u32(4));
	is_tunnel = vshlq_n_u32(is_tunnel, 3);
	errors = vandq_u32(vshrq_n_u32(errors, 4), flags2_error_mask);
	errors = vandq_u32(errors, flags2);

	index = vbicq_u32(flags2, errors);
	errors = vorrq_u32(errors, vshrq_n_u32(is_tunnel, 1));
	index = vorrq_u32(index, is_tunnel);

	/* Update mbuf rearm_data for four packets. */
	GET_OL_FLAGS(rss_flags, index, errors, 0, ol_flags);
	vst1q_u32((uint32_t *)&mbuf[0]->rearm_data,
		  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));
	GET_OL_FLAGS(rss_flags, index, errors, 1, ol_flags);
	vst1q_u32((uint32_t *)&mbuf[1]->rearm_data,
		  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));
	GET_OL_FLAGS(rss_flags, index, errors, 2, ol_flags);
	vst1q_u32((uint32_t *)&mbuf[2]->rearm_data,
		  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));
	GET_OL_FLAGS(rss_flags, index, errors, 3, ol_flags);
	vst1q_u32((uint32_t *)&mbuf[3]->rearm_data,
		  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));

	/* Update mbuf rx_descriptor_fields1 for four packets. */
	GET_DESC_FIELDS(mm_rxcmp[0], mm_rxcmp1[0], shuf_msk, ptype_idx, 0, tmp);
	vst1q_u32((uint32_t *)&mbuf[0]->rx_descriptor_fields1, tmp);
	GET_DESC_FIELDS(mm_rxcmp[1], mm_rxcmp1[1], shuf_msk, ptype_idx, 1, tmp);
	vst1q_u32((uint32_t *)&mbuf[1]->rx_descriptor_fields1, tmp);
	GET_DESC_FIELDS(mm_rxcmp[2], mm_rxcmp1[2], shuf_msk, ptype_idx, 2, tmp);
	vst1q_u32((uint32_t *)&mbuf[2]->rx_descriptor_fields1, tmp);
	GET_DESC_FIELDS(mm_rxcmp[3], mm_rxcmp1[3], shuf_msk, ptype_idx, 3, tmp);
	vst1q_u32((uint32_t *)&mbuf[3]->rx_descriptor_fields1, tmp);
}

uint16_t
bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts)
{
	struct bnxt_rx_queue *rxq = rx_queue;
	struct bnxt_cp_ring_info *cpr = rxq->cp_ring;
	struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
	uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;
	uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;
	struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
	uint64_t valid, desc_valid_mask = ~0UL;
	const uint32x4_t info3_v_mask = vdupq_n_u32(CMPL_BASE_V);
	uint32_t raw_cons = cpr->cp_raw_cons;
	uint32_t cons, mbcons;
	int nb_rx_pkts = 0;
	const uint64x2_t mb_init = {rxq->mbuf_initializer, 0};
	const uint32x4_t valid_target =
		vdupq_n_u32(!!(raw_cons & cp_ring_size));
	int i;

	/* If Rx Q was stopped return */
	if (unlikely(!rxq->rx_started))
		return 0;

	if (rxq->rxrearm_nb >= rxq->rx_free_thresh)
		bnxt_rxq_rearm(rxq, rxr);

	/* Return no more than RTE_BNXT_MAX_RX_BURST per call. */
	nb_pkts = RTE_MIN(nb_pkts, RTE_BNXT_MAX_RX_BURST);

	cons = raw_cons & (cp_ring_size - 1);
	mbcons = (raw_cons / 2) & (rx_ring_size - 1);
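	/*
	 * Each received packet consumes a pair of completion entries
	 * (rxcmp and rxcmp1), so the completion ring consumer index advances
	 * twice as fast as the mbuf ring index.
	 */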
	/* Prefetch first four descriptor pairs. */
	rte_prefetch0(&cp_desc_ring[cons]);
	rte_prefetch0(&cp_desc_ring[cons + 4]);

	/* Ensure that we do not go past the ends of the rings. */
	nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,
					   (cp_ring_size - cons) / 2));
	/*
	 * If we are at the end of the ring, ensure that descriptors after the
	 * last valid entry are not treated as valid. Otherwise, force the
	 * maximum number of packets to receive to be a multiple of the per-
	 * loop count.
	 */
	if (nb_pkts < RTE_BNXT_DESCS_PER_LOOP)
		desc_valid_mask >>= 16 * (RTE_BNXT_DESCS_PER_LOOP - nb_pkts);
	else
		nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);

	/* Handle RX burst request. */
	for (i = 0; i < nb_pkts; i += RTE_BNXT_DESCS_PER_LOOP,
			cons += RTE_BNXT_DESCS_PER_LOOP * 2,
			mbcons += RTE_BNXT_DESCS_PER_LOOP) {
		uint32x4_t rxcmp1[RTE_BNXT_DESCS_PER_LOOP];
		uint32x4_t rxcmp[RTE_BNXT_DESCS_PER_LOOP];
		uint32x4_t info3_v;
		uint64x2_t t0, t1;
		uint32_t num_valid;

		/* Copy four mbuf pointers to output array. */
		t0 = vld1q_u64((void *)&rxr->rx_buf_ring[mbcons]);
#ifdef RTE_ARCH_ARM64
		t1 = vld1q_u64((void *)&rxr->rx_buf_ring[mbcons + 2]);
#endif
		vst1q_u64((void *)&rx_pkts[i], t0);
#ifdef RTE_ARCH_ARM64
		vst1q_u64((void *)&rx_pkts[i + 2], t1);
#endif

		/* Prefetch four descriptor pairs for next iteration. */
		if (i + RTE_BNXT_DESCS_PER_LOOP < nb_pkts) {
			rte_prefetch0(&cp_desc_ring[cons + 8]);
			rte_prefetch0(&cp_desc_ring[cons + 12]);
		}

		/*
		 * Load the four current descriptors into NEON registers in
		 * reverse order to ensure consistent state.
		 */
		rxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);
		rte_io_rmb();
		rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);

		rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
		rte_io_rmb();
		rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);

		t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));

		rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
		rte_io_rmb();
		rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);

		rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
		rte_io_rmb();
		rxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);

		t0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));

		/* Isolate descriptor status flags. */
		info3_v = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),
							     vget_low_u64(t1)));
		info3_v = vandq_u32(info3_v, info3_v_mask);
		info3_v = veorq_u32(info3_v, valid_target);

		/*
		 * Pack the 128-bit array of valid descriptor flags into 64
		 * bits and count the number of set bits in order to determine
		 * the number of valid descriptors.
		 */
		valid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),
				      0);

		/*
		 * At this point, 'valid' is a 64-bit value containing four
		 * 16-bit fields, each of which is either 0x0001 or 0x0000.
		 * Compute number of valid descriptors from the index of
		 * the highest non-zero field.
		 */
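		/*
		 * Example: if only the first two descriptors are valid,
		 * valid == 0x0000000000010001, __builtin_clzl() returns 47,
		 * and num_valid == 4 - 47 / 16 == 2.
		 */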
		num_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -
				(__builtin_clzl(valid & desc_valid_mask) / 16);

		if (num_valid == 0)
			break;

		descs_to_mbufs(rxcmp, rxcmp1, mb_init, &rx_pkts[nb_rx_pkts],
			       rxr);
		nb_rx_pkts += num_valid;

		if (num_valid < RTE_BNXT_DESCS_PER_LOOP)
			break;
	}

	if (nb_rx_pkts) {
		rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts);

		rxq->rxrearm_nb += nb_rx_pkts;
		cpr->cp_raw_cons += 2 * nb_rx_pkts;
		cpr->valid =
			!!(cpr->cp_raw_cons & cpr->cp_ring_struct->ring_size);
		bnxt_db_cq(cpr);
	}

	return nb_rx_pkts;
}

static void
bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq)
{
	struct bnxt_cp_ring_info *cpr = txq->cp_ring;
	uint32_t raw_cons = cpr->cp_raw_cons;
	uint32_t cons;
	uint32_t nb_tx_pkts = 0;
	struct tx_cmpl *txcmp;
	struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
	struct bnxt_ring *cp_ring_struct = cpr->cp_ring_struct;
	uint32_t ring_mask = cp_ring_struct->ring_mask;

	do {
		cons = RING_CMPL(ring_mask, raw_cons);
		txcmp = (struct tx_cmpl *)&cp_desc_ring[cons];

		if (!CMP_VALID(txcmp, raw_cons, cp_ring_struct))
			break;

		if (likely(CMP_TYPE(txcmp) == TX_CMPL_TYPE_TX_L2))
			nb_tx_pkts += txcmp->opaque;
		else
			RTE_LOG_DP(ERR, PMD,
				   "Unhandled CMP type %02x\n",
				   CMP_TYPE(txcmp));
		raw_cons = NEXT_RAW_CMP(raw_cons);
	} while (nb_tx_pkts < ring_mask);

	cpr->valid = !!(raw_cons & cp_ring_struct->ring_size);
	if (nb_tx_pkts) {
		if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
			bnxt_tx_cmp_vec_fast(txq, nb_tx_pkts);
		else
			bnxt_tx_cmp_vec(txq, nb_tx_pkts);
		cpr->cp_raw_cons = raw_cons;
		bnxt_db_cq(cpr);
	}
}

static uint16_t
bnxt_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
			  uint16_t nb_pkts)
{
	struct bnxt_tx_queue *txq = tx_queue;
	struct bnxt_tx_ring_info *txr = txq->tx_ring;
	uint16_t tx_prod, tx_raw_prod = txr->tx_raw_prod;
	struct rte_mbuf *tx_mbuf;
	struct tx_bd_long *txbd = NULL;
	struct rte_mbuf **tx_buf;
	uint16_t to_send;

	nb_pkts = RTE_MIN(nb_pkts, bnxt_tx_avail(txq));

	if (unlikely(nb_pkts == 0))
		return 0;

	/* Handle TX burst request. */
	to_send = nb_pkts;
	while (to_send) {
		tx_mbuf = *tx_pkts++;
		rte_prefetch0(tx_mbuf);

		tx_prod = RING_IDX(txr->tx_ring_struct, tx_raw_prod);
		tx_buf = &txr->tx_buf_ring[tx_prod];
		*tx_buf = tx_mbuf;

		txbd = &txr->tx_desc_ring[tx_prod];
		txbd->address = tx_mbuf->buf_iova + tx_mbuf->data_off;
		txbd->len = tx_mbuf->data_len;
		txbd->flags_type = bnxt_xmit_flags_len(tx_mbuf->data_len,
						       TX_BD_FLAGS_NOCMPL);
		tx_raw_prod = RING_NEXT(tx_raw_prod);
		to_send--;
	}
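	/*
	 * Completions are coalesced: every BD above was posted with the
	 * no-completion flag set, and only the last BD of the burst (below)
	 * requests a completion, carrying the burst size in its opaque field
	 * so that the completion handler can account for the whole burst at
	 * once.
	 */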
	/* Request a completion for the last packet in the burst. */
	if (txbd) {
		txbd->opaque = nb_pkts;
		txbd->flags_type &= ~TX_BD_LONG_FLAGS_NO_CMPL;
	}

	rte_compiler_barrier();
	bnxt_db_write(&txr->tx_db, tx_raw_prod);

	txr->tx_raw_prod = tx_raw_prod;

	return nb_pkts;
}

uint16_t
bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		   uint16_t nb_pkts)
{
	int nb_sent = 0;
	struct bnxt_tx_queue *txq = tx_queue;

	/* Tx queue was stopped; wait for it to be restarted. */
	if (unlikely(!txq->tx_started)) {
		PMD_DRV_LOG(DEBUG, "Tx queue stopped; returning\n");
		return 0;
	}

	/* Handle TX completions. */
	if (bnxt_tx_bds_in_hw(txq) >= txq->tx_free_thresh)
		bnxt_handle_tx_cp_vec(txq);

	/* Handle TX burst request. */
	while (nb_pkts) {
		uint16_t ret, num;

		num = RTE_MIN(nb_pkts, RTE_BNXT_MAX_TX_BURST);
		ret = bnxt_xmit_fixed_burst_vec(tx_queue,
						&tx_pkts[nb_sent],
						num);
		nb_sent += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_sent;
}

int __rte_cold
bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq)
{
	return bnxt_rxq_vec_setup_common(rxq);
}