/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Corigine, Inc.
 * All rights reserved.
 */

#include <stdbool.h>

#include <bus_pci_driver.h>
#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>

#include "nfp_logs.h"
#include "nfp_net_common.h"
#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"

bool
nfp_net_get_avx2_supported(void)
{
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
		return true;

	return false;
}

static inline void
nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	_mm_storel_epi64((void *)rxds, vaddr0);

	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i vaddr1;
	__m128i vaddr2;
	__m128i vaddr3;
	__m128i vaddr0_1;
	__m128i vaddr2_3;
	__m256i vaddr0_3;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);

	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
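	/*
	 * Combine the four 64-bit descriptor words into one 256-bit value
	 * so all four freelist descriptors can be written back with a
	 * single aligned 32-byte store below.
	 */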
	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
			vaddr2_3, 1);

	_mm256_store_si256((void *)rxds, vaddr0_3);

	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rx_pkt)
{
	struct nfp_net_hw *hw = rxq->hw;
	struct nfp_net_meta_parsed meta;

	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
	/* Size of the whole packet. Only one segment is supported. */
	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

	/* Fill the received mbuf with packet info. */
	if (hw->rx_offset)
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
	else
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);

	rx_pkt->port = rxq->port_id;
	rx_pkt->nb_segs = 1;
	rx_pkt->next = NULL;

	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);

	nfp_net_parse_ptype(rxq, rxds, rx_pkt);

	/* Check the checksum flag. */
	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}

static inline int
nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf *rx_pkt)
{
	/* Allocate a new mbuf into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		nfp_net_mbuf_alloc_failed(rxq);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);

	nfp_vec_avx2_recv_set_des1(rxq, rxds, *rxb);

	return 0;
}
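/*
 * Receive four packets at once. The descriptors must start on a 32-byte
 * boundary because nfp_vec_avx2_recv_set_des4() writes them back with an
 * aligned 256-bit store; the caller checks the alignment before taking
 * the burst path. On mbuf allocation failure the caller falls back to
 * receiving one packet at a time.
 */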
static inline int
nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf **rx_pkts)
{
	/* Allocate 4 new mbufs into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf bulk alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);

	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);

	return 0;
}

static inline bool
nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
{
	__m256i data = _mm256_loadu_si256((void *)rxds);

	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
		return false;

	return true;
}

uint16_t
nfp_net_vec_avx2_recv_pkts(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t avail;
	uint16_t nb_hold;
	bool burst_receive;
	struct rte_mbuf **rxb;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rxq *rxq = rx_queue;

	if (unlikely(rxq == NULL)) {
		PMD_RX_LOG(ERR, "RX Bad queue.");
		return 0;
	}

	avail = 0;
	nb_hold = 0;
	burst_receive = true;
	while (avail < nb_pkts) {
		rxds = &rxq->rxds[rxq->rd_p];
		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
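		/* Stop when the hardware has not written back this descriptor yet (DD bit clear). */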
		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
				& PCIE_DESC_RX_DD) == 0)
			goto recv_end;

		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);

		if ((rxq->rd_p & 0x3) == 0) {
			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
		}

		if ((rxq->rd_p & 0x7) == 0) {
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
		}

		/*
		 * If a burst of 4 packets can not be received, receive one:
		 * 1. The Rx ring is coming to the tail (fewer than 4
		 *    descriptors are left before wrap-around).
		 * 2. Fewer than 4 packets are needed.
		 * 3. The descriptor address is not aligned on a 32-byte boundary.
		 * 4. The Rx ring does not have 4 packets, or allocating 4 mbufs failed.
		 */
		if ((rxq->rx_count - rxq->rd_p) < 4 ||
				(nb_pkts - avail) < 4 ||
				((uintptr_t)rxds & 0x1F) != 0 ||
				!burst_receive) {
			_mm_storel_epi64((void *)&rx_pkts[avail],
					_mm_loadu_si128((void *)rxb));

			if (nfp_vec_avx2_recv1(rxq, rxds, rxb, rx_pkts[avail]) != 0)
				goto recv_end;

			avail++;
			nb_hold++;
			continue;
		}

		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
		if (!burst_receive)
			continue;

		_mm256_storeu_si256((void *)&rx_pkts[avail],
				_mm256_loadu_si256((void *)rxb));

		if (nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]) != 0) {
			burst_receive = false;
			continue;
		}

		avail += 4;
		nb_hold += 4;
	}

recv_end:
	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received.",
			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "The port=%hu queue=%hu nb_hold=%hu avail=%hu.",
				rxq->port_id, rxq->qidx, nb_hold, avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}