/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Corigine, Inc.
 * All rights reserved.
 */

#include <stdbool.h>

#include <bus_pci_driver.h>
#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>

#include "nfp_logs.h"
#include "nfp_net_common.h"
#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"

bool
nfp_net_get_avx2_supported(void)
{
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
		return true;

	return false;
}

static inline void
nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	/*
	 * Write the buffer DMA address (plus headroom) into the freelist
	 * descriptor, with the high 32 bits placed before the low 32 bits.
	 */
	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	_mm_storel_epi64((void *)rxds, vaddr0);

	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i vaddr1;
	__m128i vaddr2;
	__m128i vaddr3;
	__m128i vaddr0_1;
	__m128i vaddr2_3;
	__m256i vaddr0_3;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	/* Swizzle each buffer DMA address (plus headroom) into descriptor order. */
	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);

	/* Combine the 4 descriptors into one 256-bit register and store them. */
	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);

	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
			vaddr2_3, 1);

	_mm256_store_si256((void *)rxds, vaddr0_3);

	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rx_pkt)
{
	struct nfp_net_hw *hw = rxq->hw;
	struct nfp_net_meta_parsed meta;

	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
	/* Size of the whole packet. Only one segment is supported. */
	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

	/* Fill the received mbuf with packet info. */
	if (hw->rx_offset)
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
	else
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);

	rx_pkt->port = rxq->port_id;
	rx_pkt->nb_segs = 1;
	rx_pkt->next = NULL;

	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);

	nfp_net_parse_ptype(rxq, rxds, rx_pkt);

	/* Check the checksum flags. */
	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}

static inline int
nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf *rx_pkt)
{
	/* Allocate a new mbuf into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		nfp_net_mbuf_alloc_failed(rxq);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);

	nfp_vec_avx2_recv_set_des1(rxq, rxds, *rxb);

	return 0;
}

static inline int
nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf **rx_pkts)
{
	/* Allocate 4 new mbufs into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf bulk alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);

	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);

	return 0;
}

static inline bool
nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
{
	__m256i data = _mm256_loadu_si256((void *)rxds);

	/* All 4 descriptors must have the DD (descriptor done) bit set. */
	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
		return false;

	return true;
}

uint16_t
nfp_net_vec_avx2_recv_pkts(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t avail;
	uint16_t nb_hold;
	bool burst_receive;
	struct rte_mbuf **rxb;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rxq *rxq = rx_queue;

	if (unlikely(rxq == NULL)) {
		PMD_RX_LOG(ERR, "RX Bad queue.");
		return 0;
	}

	avail = 0;
	nb_hold = 0;
	burst_receive = true;
	while (avail < nb_pkts) {
		rxds = &rxq->rxds[rxq->rd_p];
		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;

		/* Stop when the next descriptor has not been written back yet. */
		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
				& PCIE_DESC_RX_DD) == 0)
			goto recv_end;

		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);

		/* Prefetch the upcoming descriptors and mbufs. */
		if ((rxq->rd_p & 0x3) == 0) {
			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
		}

		if ((rxq->rd_p & 0x7) == 0) {
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
		}

		/*
		 * Fall back to receiving a single packet when a burst of 4
		 * cannot be received:
		 * 1. The read pointer is about to wrap around the end of the ring.
		 * 2. The caller asked for fewer than 4 more packets.
		 * 3. The descriptor pointer is not aligned on a 32-byte boundary.
		 * 4. The ring does not hold 4 ready packets, or allocating
		 *    4 mbufs failed on the previous iteration.
		 */
		if ((rxq->rx_count - rxq->rd_p) < 4 ||
				(nb_pkts - avail) < 4 ||
				((uintptr_t)rxds & 0x1F) != 0 ||
				!burst_receive) {
			/* Hand the mbuf currently in the software ring to the caller. */
			_mm_storel_epi64((void *)&rx_pkts[avail],
					_mm_loadu_si128((void *)rxb));

			if (nfp_vec_avx2_recv1(rxq, rxds, rxb, rx_pkts[avail]) != 0)
				goto recv_end;

			avail++;
			nb_hold++;
			continue;
		}

		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
		if (!burst_receive)
			continue;

		/* Hand the 4 mbufs currently in the software ring to the caller. */
		_mm256_storeu_si256((void *)&rx_pkts[avail],
				_mm256_loadu_si256((void *)rxb));

		if (nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]) != 0) {
			burst_receive = false;
			continue;
		}

		avail += 4;
		nb_hold += 4;
	}

recv_end:
	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received.",
			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "The port=%hu queue=%hu nb_hold=%hu avail=%hu.",
				rxq->port_id, rxq->qidx, nb_hold, avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}
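
/*
 * Illustrative sketch, not part of this file in the driver source: one way
 * the AVX2 burst function above could be wired up when the Rx datapath is
 * selected. The helper name nfp_net_rx_burst_select and the scalar fallback
 * nfp_net_recv_pkts are assumptions made for this example only.
 */
static inline void
nfp_net_rx_burst_select(struct rte_eth_dev *eth_dev)
{
	/* Use the vector path only when the CPU and EAL allow 256-bit SIMD. */
	if (nfp_net_get_avx2_supported())
		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
	else
		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
}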