/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"

#include <rte_vect.h>

static inline __m128i
ice_flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m128i pkt_fdir_bit = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
	__m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
			fdir_mis_mask);
	/* this XOR op inverts the fdir_mask */
	fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
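	/*
	 * Worked example for one 32-bit lane: a flow_id of 0xFFFFFFFF
	 * compares equal to the magic value, so fdir_mask is all-ones and
	 * the XOR clears it, leaving no flags set; any real flow_id gives a
	 * zero compare result, the XOR turns it into all-ones, and both
	 * RTE_MBUF_F_RX_FDIR and RTE_MBUF_F_RX_FDIR_ID are set below.
	 */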
	const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
					  RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxep,
				 ICE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			ICE_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

#if RTE_IOVA_IN_MBUF
		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				 offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

#if RTE_IOVA_IN_MBUF
		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
#else
		/* convert va to dma_addr hdr/data */
		dma_addr0 = _mm_unpacklo_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpacklo_epi64(vaddr1, vaddr1);
#endif

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
	}

	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}

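/*
 * Convert the status bits of four flex Rx descriptors to mbuf ol_flags and
 * write each mbuf's rearm_data/ol_flags area with a single 16-byte store.
 */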
static inline void
ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
			 struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i tmp_desc, flags, rss_vlan;

	/* mask everything except checksum, RSS and VLAN flags.
	 * bits 7:4 for checksum.
	 * bit 12 for RSS indication.
	 * bit 13 for VLAN indication.
	 */
	const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
						0x30f0, 0x30f0);
	const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

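	/*
	 * The descriptor's four checksum error bits (IP, L4, outer-IP,
	 * outer-L4) end up in the low nibble of each 32-bit lane after the
	 * 4-bit shift below, and that nibble indexes the 16-entry table via
	 * _mm_shuffle_epi8: index 0 means every checksum is good, and,
	 * roughly, each set index bit turns one status bad.
	 */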
	/* map the checksum, rss and vlan fields to the checksum, rss
	 * and vlan flag
	 */
	const __m128i cksum_flags =
		_mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);

	const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	/* merge 4 descriptors */
	flags = _mm_unpackhi_epi32(descs[0], descs[1]);
	tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
	tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
	tmp_desc = _mm_and_si128(tmp_desc, desc_mask);

	/* checksum flags */
	tmp_desc = _mm_srli_epi32(tmp_desc, 4);
	flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
	/* then shift left 1 bit to undo the >> 1 in the table entries */
	flags = _mm_slli_epi32(flags, 1);

	__m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
	__m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
	l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);

	__m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
	__m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
	flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
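
	/*
	 * After the 1-bit left shift every flag except the outer-L4 pair is
	 * already at its final ol_flags position; the outer-L4 status,
	 * parked in bits 2:1 by the >> 20 in the table entries, is restored
	 * to bits 22:21 by the 20-bit shift above.
	 */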
	/* we need to mask out the redundant bits introduced by RSS or
	 * VLAN fields.
	 */
	flags = _mm_and_si128(flags, cksum_mask);

	/* RSS, VLAN flag */
	tmp_desc = _mm_srli_epi32(tmp_desc, 8);
	rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);

	/* merge the flags */
	flags = _mm_or_si128(flags, rss_vlan);

	if (rxq->fdir_enabled) {
		const __m128i fdir_id0_1 =
			_mm_unpackhi_epi32(descs[0], descs[1]);

		const __m128i fdir_id2_3 =
			_mm_unpackhi_epi32(descs[2], descs[3]);

		const __m128i fdir_id0_3 =
			_mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);

		const __m128i fdir_flags =
			ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);

		/* merge with fdir_flags */
		flags = _mm_or_si128(flags, fdir_flags);

		/* write fdir_id to mbuf */
		rx_pkts[0]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 0);

		rx_pkts[1]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 1);

		rx_pkts[2]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 2);

		rx_pkts[3]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 3);
	} /* if() on fdir_enabled */

	/**
	 * At this point, we have the 4 sets of flags, one per 32-bit lane
	 * of flags.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
	 */
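	/*
	 * The 0x30 blend mask below selects 16-bit lanes 4 and 5, i.e.
	 * bytes 8-11 of each destination: the low 32 bits of ol_flags,
	 * which sit 8 bytes into rearm_data (verified at build time below).
	 * Every other lane is taken from mbuf_init.
	 */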
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			 offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

static inline void
ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		       uint32_t *ptype_tbl)
{
	const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0);
	__m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
	__m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
	__m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);

	ptype_all = _mm_and_si128(ptype_all, ptype_mask);

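	/*
	 * Each 32-bit lane now holds the first dword of one descriptor,
	 * with the 10-bit ptype in its upper 16 bits, so packet i's table
	 * index is 16-bit word 2 * i + 1.
	 */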
	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 1)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 3)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 5)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 7)];
}

/**
 * vPMD raw receive routine that only accepts nb_pkts >= ICE_DESCS_PER_LOOP
 *
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts is floor-aligned to a multiple of ICE_DESCS_PER_LOOP
 */
static inline uint16_t
_ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	__m128i crc_adjust = _mm_set_epi16
		(0, 0, 0,       /* ignore non-length fields */
		 -rxq->crc_len, /* sub crc on data_len */
		 0,             /* ignore high-16bits of pkt_len */
		 -rxq->crc_len, /* sub crc on pkt_len */
		 0, 0           /* ignore pkt_type field */
		);
	const __m128i zero = _mm_setzero_si128();
	/* mask to shuffle from desc. to mbuf */
	const __m128i shuf_msk = _mm_set_epi8
		(0xFF, 0xFF,
		 0xFF, 0xFF,  /* rss hash parsed separately */
		 11, 10,      /* octet 10~11, 16 bits vlan_macip */
		 5, 4,        /* octet 4~5, 16 bits data_len */
		 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		 5, 4,        /* octet 4~5, low 16 bits pkt_len */
		 0xFF, 0xFF,  /* pkt_type set as unknown */
		 0xFF, 0xFF   /* pkt_type set as unknown */
		);
	const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0x04, 0x0C,
						   0x00, 0x08);
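
	/*
	 * Note on eop_shuf_mask: the unpacks used to build staterr below
	 * leave the per-packet status lanes in the order pkt1, pkt3, pkt0,
	 * pkt2; this mask gathers byte 0 of each lane back into packet
	 * order while compressing the four 32-bit values to one byte each.
	 */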

	/**
	 * Compile-time check that the crc_adjust layout above is correct.
	 * NOTE: the first field (lowest address) is given last in the
	 * set_epi16 call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);

	/* 4 packets DD mask */
	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
						0x0000000100000001LL);
	/* 4 packets EOP mask */
	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
						 0x0000000200000002LL);
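
	/*
	 * DD is status bit 0 and EOP status bit 1 of each 32-bit staterr
	 * lane, hence the 0x1/0x2 patterns repeated per 32-bit half above.
	 */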

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
	      rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/**
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
	     pos += ICE_DESCS_PER_LOOP,
	     rxdp += ICE_DESCS_PER_LOOP) {
		__m128i descs[ICE_DESCS_PER_LOOP];
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		/* A.1 load desc[2-0] */
		descs[2] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp));
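
		/*
		 * Reading backwards means that if desc[N]'s DD bit is seen
		 * set, descs 0..N-1 were written back no later, so the DD
		 * bits counted below always form a contiguous prefix of the
		 * group.
		 */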
#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
		pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);

#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
		/**
		 * need to load the 2nd 16B of each desc for RSS hash parsing;
		 * getting into this context costs some performance.
		 */
		if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
		    RTE_ETH_RX_OFFLOAD_RSS_HASH) {
			/* load bottom half of every 32B desc */
			const __m128i raw_desc_bh3 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh2 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh1 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh0 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

			/**
			 * shift the 32b RSS hash value to the highest 32b of
			 * each 128b lane before masking
			 */
			__m128i rss_hash3 =
				_mm_slli_epi64(raw_desc_bh3, 32);
			__m128i rss_hash2 =
				_mm_slli_epi64(raw_desc_bh2, 32);
			__m128i rss_hash1 =
				_mm_slli_epi64(raw_desc_bh1, 32);
			__m128i rss_hash0 =
				_mm_slli_epi64(raw_desc_bh0, 32);

			__m128i rss_hash_msk =
				_mm_set_epi32(0xFFFFFFFF, 0, 0, 0);
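
			/*
			 * Masking everything but the top dword leaves only
			 * the RSS hash, now positioned to land on
			 * mbuf->hash.rss (offset 12 of rx_descriptor_fields1,
			 * as checked at build time above) when OR'd below.
			 */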
			rss_hash3 = _mm_and_si128
					(rss_hash3, rss_hash_msk);
			rss_hash2 = _mm_and_si128
					(rss_hash2, rss_hash_msk);
			rss_hash1 = _mm_and_si128
					(rss_hash1, rss_hash_msk);
			rss_hash0 = _mm_and_si128
					(rss_hash0, rss_hash_msk);

			pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
			pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
			pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
			pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
		} /* if() on RSS hash parsing */
#endif

		/* C.2 get 4 pkts staterr value */
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb3);
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb2);

		/* C* extract and record EOP bit */
		if (split_packet) {
			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in packet order; the
			 * count of DD bits doesn't care, but end-of-packet
			 * tracking does, so shuffle them back into order.
			 * This also compresses the 32-bit values to 8-bit.
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += ICE_DESCS_PER_LOOP;
		}
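
		/*
		 * Packing the 32-bit DD lanes down to 16-bit words lets a
		 * single popcount of the low 64 bits count the completed
		 * descriptors in this group of four.
		 */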
		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb1);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb0);
		ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = rte_popcount64(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != ICE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/**
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
 *   DD bits
 */
uint16_t
ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		  uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
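	/*
	 * Each byte of split_flags records the inverted EOP bit of one
	 * descriptor: a non-zero byte marks a buffer whose packet continues
	 * in the next descriptor and therefore needs reassembly.
	 */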

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
						  split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst;

		burst = ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}

	return retval + ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     nb_pkts);
}

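/*
 * Build one 16-byte data descriptor: the low quadword carries the buffer
 * IOVA, the high quadword packs the descriptor type, command flags and
 * buffer length via the ICE_TXD_QW1_* shifts.
 */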
static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
	 uint64_t flags)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		ice_vtx1(txdp, *pkt, flags);
}

static uint16_t
ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
	int i;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	nb_commit = nb_pkts;
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = &txq->sw_ring_vec[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			ice_vtx1(txdp, *tx_pkts, flags);

		ice_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->ice_tx_ring[tx_id];
		txep = &txq->sw_ring_vec[tx_id];
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
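
	/*
	 * Set the RS (report status) bit once every tx_rs_thresh
	 * descriptors; only descriptors carrying RS report completion,
	 * which bounds the status writeback the hardware must do.
	 */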
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		  uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

int __rte_cold
ice_rxq_vec_setup(struct ice_rx_queue *rxq)
{
	if (!rxq)
		return -1;

	rxq->rx_rel_mbufs = _ice_rx_queue_release_mbufs_vec;
	rxq->mbuf_initializer = ci_rxq_mbuf_initializer(rxq->port_id);
	return 0;
}

int __rte_cold
ice_txq_vec_setup(struct ci_tx_queue *txq __rte_unused)
{
	return 0;
}

int __rte_cold
ice_rx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_rx_vec_dev_check_default(dev);
}

int __rte_cold
ice_tx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_tx_vec_dev_check_default(dev);
}