/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#include <stdint.h>
#include <ethdev_driver.h>
#include <rte_malloc.h>

#include "iavf.h"
#include "iavf_rxtx.h"
#include "iavf_rxtx_vec_common.h"

#include <rte_vect.h>

static inline void
iavf_rxq_rearm(struct iavf_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;

	volatile union iavf_rx_desc *rxdp;
	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
					  RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp, (void *)rxp,
				 rxq->rx_free_thresh) < 0) {
		if (rxq->rxrearm_nb + rxq->rx_free_thresh >= rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
				rxp[i] = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			rxq->rx_free_thresh;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < rxq->rx_free_thresh; i += 2, rxp += 2) {
		__m128i vaddr0, vaddr1;

		mb0 = rxp[0];
		mb1 = rxp[1];

		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				offsetof(struct rte_mbuf, buf_addr) + 8);
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
	}

	rxq->rxrearm_start += rxq->rx_free_thresh;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= rxq->rx_free_thresh;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			(rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
		   "rearm_start=%u rearm_nb=%u",
		   rxq->port_id, rxq->queue_id,
		   rx_id, rxq->rxrearm_start, rxq->rxrearm_nb);

	/* Update the tail pointer on the NIC */
	IAVF_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}

static inline void
desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4],
		  struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i vlan0, vlan1, rss, l3_l4e;

	/* mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication.
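	 * bit24:22 carry the L3/L4 error status that feeds the l3_l4e
	 * shuffle below, which is why they are kept in the mask as well.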
	 */
	const __m128i rss_vlan_msk = _mm_set_epi32(
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804);

	const __m128i cksum_mask = _mm_set_epi32(
			RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
			RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
			RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
			RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
			RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
			RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
			RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
			RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

	/* map rss and vlan type to rss hash and vlan flag */
	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			0, 0, 0, 0);

	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 0, 0,
			0, 0, RTE_MBUF_F_RX_FDIR, 0);

	const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it not exceed 255 */
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
			 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1);

	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);

	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

	rss = _mm_srli_epi32(vlan1, 11);
	rss = _mm_shuffle_epi8(rss_flags, rss);

	l3_l4e = _mm_srli_epi32(vlan1, 22);
	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
	/* then we shift left 1 bit */
	l3_l4e = _mm_slli_epi32(l3_l4e, 1);
	/* we need to mask out the redundant bits */
	l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);

	vlan0 = _mm_or_si128(vlan0, rss);
	vlan0 = _mm_or_si128(vlan0, l3_l4e);

	/* At this point, we have the 4 sets of flags in the low 16-bits
	 * of each 32-bit value in vlan0.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
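	 * The 0x10 blend mask picks 16-bit word 4 of the shifted flag
	 * vector, i.e. byte offset 8 of the 16-byte store, which is exactly
	 * where ol_flags follows rearm_data (see the build-time checks
	 * below).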
	 */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10);
	rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

static inline __m128i
flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m128i pkt_fdir_bit = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
	__m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
			fdir_mis_mask);
	/* this XOR op results to bit-reverse the fdir_mask */
	fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
	const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
static inline void
flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], __m128i descs_bh[4],
		       struct rte_mbuf **rx_pkts)
#else
static inline void
flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4],
		       struct rte_mbuf **rx_pkts)
#endif
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i tmp_desc, flags, rss_vlan;

	/* mask everything except checksum, RSS and VLAN flags.
	 * bit6:4 for checksum.
	 * bit12 for RSS indication.
	 * bit13 for VLAN indication.
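	 * bit7 is kept as well: it selects between the outer-L4-good and
	 * outer-L4-bad halves of the 16-entry cksum_flags table below,
	 * which is why the mask is 0x30f0 rather than 0x3070.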
	 */
	const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
						0x30f0, 0x30f0);

	const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

	/* map the checksum, rss and vlan fields to the checksum, rss
	 * and vlan flag
	 */
	const __m128i cksum_flags =
		_mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);

	const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	/* merge 4 descriptors */
	flags = _mm_unpackhi_epi32(descs[0], descs[1]);
	tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
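	/* the two unpackhi ops above and the unpacklo below gather the
	 * 32-bit word holding status_error0 (plus l2tag1) from each of the
	 * four descriptors into a single register
	 */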
	tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
	tmp_desc = _mm_and_si128(tmp_desc, desc_mask);

	/* checksum flags */
	tmp_desc = _mm_srli_epi32(tmp_desc, 4);
	flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
	/* then we shift left 1 bit */
	flags = _mm_slli_epi32(flags, 1);
	__m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
	__m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
	l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);

	__m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
	__m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
	flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
	/* we need to mask out the redundant bits introduced by RSS or
	 * VLAN fields.
	 */
	flags = _mm_and_si128(flags, cksum_mask);

	/* RSS, VLAN flag */
	tmp_desc = _mm_srli_epi32(tmp_desc, 8);
	rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);

	/* merge the flags */
	flags = _mm_or_si128(flags, rss_vlan);

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
	if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
		const __m128i l2tag2_mask =
			_mm_set1_epi32(1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);

		const __m128i vlan_tci0_1 =
			_mm_unpacklo_epi32(descs_bh[0], descs_bh[1]);
		const __m128i vlan_tci2_3 =
			_mm_unpacklo_epi32(descs_bh[2], descs_bh[3]);
		const __m128i vlan_tci0_3 =
			_mm_unpacklo_epi64(vlan_tci0_1, vlan_tci2_3);

		__m128i vlan_bits = _mm_and_si128(vlan_tci0_3, l2tag2_mask);

		vlan_bits = _mm_srli_epi32(vlan_bits,
					   IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);

		const __m128i vlan_flags_shuf =
			_mm_set_epi8(0, 0, 0, 0,
				     0, 0, 0, 0,
				     0, 0, 0, 0,
				     0, 0,
				     RTE_MBUF_F_RX_VLAN |
				     RTE_MBUF_F_RX_VLAN_STRIPPED,
				     0);

		const __m128i vlan_flags = _mm_shuffle_epi8(vlan_flags_shuf, vlan_bits);

		/* merge with vlan_flags */
		flags = _mm_or_si128(flags, vlan_flags);
	}
#endif

	if (rxq->fdir_enabled) {
		const __m128i fdir_id0_1 =
			_mm_unpackhi_epi32(descs[0], descs[1]);

		const __m128i fdir_id2_3 =
			_mm_unpackhi_epi32(descs[2], descs[3]);

		const __m128i fdir_id0_3 =
			_mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);

		const __m128i fdir_flags =
			flex_rxd_to_fdir_flags_vec(fdir_id0_3);

		/* merge with fdir_flags */
		flags = _mm_or_si128(flags, fdir_flags);

		/* write fdir_id to mbuf */
		rx_pkts[0]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 0);

		rx_pkts[1]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 1);

		rx_pkts[2]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 2);

		rx_pkts[3]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 3);
	} /* if() on fdir_enabled */

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP)
		flags = _mm_or_si128(flags, _mm_set1_epi32(iavf_timestamp_dynflag));
#endif

	/**
	 * At this point, we have the 4 sets of flags, one in each 32-bit
	 * lane of flags.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
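	 * The blend mask here is 0x30 (two 16-bit words, i.e. the full low
	 * 32 bits at byte offset 8) rather than 0x10, because the outer L4
	 * checksum bits placed at bit 21/22 above do not fit in 16 bits.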
	 */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

#define PKTLEN_SHIFT 10

static inline void
desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		const uint32_t *type_table)
{
	__m128i ptype0 = _mm_unpackhi_epi64(descs[0], descs[1]);
	__m128i ptype1 = _mm_unpackhi_epi64(descs[2], descs[3]);

	ptype0 = _mm_srli_epi64(ptype0, 30);
	ptype1 = _mm_srli_epi64(ptype1, 30);

	rx_pkts[0]->packet_type = type_table[_mm_extract_epi8(ptype0, 0)];
	rx_pkts[1]->packet_type = type_table[_mm_extract_epi8(ptype0, 8)];
	rx_pkts[2]->packet_type = type_table[_mm_extract_epi8(ptype1, 0)];
	rx_pkts[3]->packet_type = type_table[_mm_extract_epi8(ptype1, 8)];
}

static inline void
flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		     const uint32_t *type_table)
{
	const __m128i ptype_mask =
		_mm_set_epi16(IAVF_RX_FLEX_DESC_PTYPE_M, 0x0,
			      IAVF_RX_FLEX_DESC_PTYPE_M, 0x0,
			      IAVF_RX_FLEX_DESC_PTYPE_M, 0x0,
			      IAVF_RX_FLEX_DESC_PTYPE_M, 0x0);

	__m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
	__m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
	__m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);

	ptype_all = _mm_and_si128(ptype_all, ptype_mask);

	rx_pkts[0]->packet_type = type_table[_mm_extract_epi16(ptype_all, 1)];
	rx_pkts[1]->packet_type = type_table[_mm_extract_epi16(ptype_all, 3)];
	rx_pkts[2]->packet_type = type_table[_mm_extract_epi16(ptype_all, 5)];
	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
}

/**
 * vPMD raw receive routine; only handles nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP
 *
 * Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 * - nb_pkts is floor-aligned to a multiple of IAVF_VPMD_DESCS_PER_LOOP
 */
static inline uint16_t
_recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union iavf_rx_desc *rxdp;
	struct rte_mbuf **sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,       /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,             /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0           /* ignore pkt_type field */
			);
	/* compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	__m128i dd_check, eop_check;

	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
		iavf_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,            /* octet 4~7, 32bits rss */
		3, 2,                  /* octet 2~3, low 16 bits vlan_macip */
		15, 14,                /* octet 15~14, 16 bits data_len */
		0xFF, 0xFF,            /* skip high 16 bits pkt_len, zero out */
		15, 14,                /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF, 0xFF, 0xFF /* pkt_type set as unknown */
		);
	/* Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
	     pos += IAVF_VPMD_DESCS_PER_LOOP,
	     rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
		__m128i descs[IAVF_VPMD_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf points */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		/* A.1 load desc[2-0] */
		descs[2] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
		const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
		const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* C.1 4=>2 status err info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
		const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
		descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.2 get 4 pkts status err value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128(
			(void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			pkt_mb4);
		_mm_storeu_si128(
			(void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			pkt_mb3);

		/* D.2 pkt 1,2 remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128(
			(void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = rte_popcount64(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != IAVF_VPMD_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/**
 * vPMD raw receive routine for flex RxD;
 * only handles nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP
 *
 * Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 * - nb_pkts is floor-aligned to a multiple of IAVF_VPMD_DESCS_PER_LOOP
 */
static inline uint16_t
_recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
			    struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union iavf_rx_flex_desc *rxdp;
	struct rte_mbuf **sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	struct iavf_adapter *adapter = rxq->vsi->adapter;
#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
#endif
	const uint32_t *ptype_tbl = adapter->ptype_tbl;
	__m128i crc_adjust = _mm_set_epi16
			(0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0           /* ignore pkt_type field */
			);
	const __m128i zero = _mm_setzero_si128();
	/* mask to shuffle from desc. to mbuf */
	const __m128i shuf_msk = _mm_set_epi8
			(0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
	const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0x04, 0x0C,
						   0x00, 0x08);

	/**
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);

	/* 4 packets DD mask */
	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
						0x0000000100000001LL);
	/* 4 packets EOP mask */
	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
						 0x0000000200000002LL);

	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = (volatile union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
		iavf_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
	uint8_t inflection_point = 0;
	bool is_tsinit = false;
	__m128i hw_low_last = _mm_set_epi32(0, 0, 0, (uint32_t)rxq->phc_time);

	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);

		if (unlikely(sw_cur_time - rxq->hw_time_update > 4)) {
			hw_low_last = _mm_setzero_si128();
			is_tsinit = 1;
		} else {
			hw_low_last = _mm_set_epi32(0, 0, 0, (uint32_t)rxq->phc_time);
		}
	}

#endif

	/**
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
	     pos += IAVF_VPMD_DESCS_PER_LOOP,
	     rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
		__m128i descs[IAVF_VPMD_DESCS_PER_LOOP];
#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
		__m128i descs_bh[IAVF_VPMD_DESCS_PER_LOOP] = {_mm_setzero_si128()};
#endif
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf points */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		/* A.1 load desc[2-0] */
		descs[2] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
		pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
		/**
		 * needs to load 2nd 16B of each desc,
		 * will cause performance drop to get into this context.
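		 * (this path is only taken when RSS hash, L2TAG2 VLAN
		 * stripping or Rx timestamp offload is enabled, see the
		 * condition below)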
		 */
		if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH ||
		    offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP ||
		    rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
			/* load bottom half of every 32B desc */
			descs_bh[3] = _mm_load_si128
				(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
			rte_compiler_barrier();
			descs_bh[2] = _mm_load_si128
				(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
			rte_compiler_barrier();
			descs_bh[1] = _mm_load_si128
				(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
			rte_compiler_barrier();
			descs_bh[0] = _mm_load_si128
				(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));
		}

		if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) {
			/**
			 * to shift the 32b RSS hash value to the
			 * highest 32b of each 128b before mask
			 */
			__m128i rss_hash3 =
				_mm_slli_epi64(descs_bh[3], 32);
			__m128i rss_hash2 =
				_mm_slli_epi64(descs_bh[2], 32);
			__m128i rss_hash1 =
				_mm_slli_epi64(descs_bh[1], 32);
			__m128i rss_hash0 =
				_mm_slli_epi64(descs_bh[0], 32);

			__m128i rss_hash_msk =
				_mm_set_epi32(0xFFFFFFFF, 0, 0, 0);

			rss_hash3 = _mm_and_si128
					(rss_hash3, rss_hash_msk);
			rss_hash2 = _mm_and_si128
					(rss_hash2, rss_hash_msk);
			rss_hash1 = _mm_and_si128
					(rss_hash1, rss_hash_msk);
			rss_hash0 = _mm_and_si128
					(rss_hash0, rss_hash_msk);

			pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
			pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
			pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
			pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
		} /* if() on RSS hash parsing */

		if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
			/* L2TAG2_2 */
			__m128i vlan_tci3 = _mm_slli_si128(descs_bh[3], 4);
			__m128i vlan_tci2 = _mm_slli_si128(descs_bh[2], 4);
			__m128i vlan_tci1 = _mm_slli_si128(descs_bh[1], 4);
			__m128i vlan_tci0 = _mm_slli_si128(descs_bh[0], 4);

			const __m128i vlan_tci_msk = _mm_set_epi32(0, 0xFFFF0000, 0, 0);

			vlan_tci3 = _mm_and_si128(vlan_tci3, vlan_tci_msk);
			vlan_tci2 = _mm_and_si128(vlan_tci2, vlan_tci_msk);
			vlan_tci1 = _mm_and_si128(vlan_tci1, vlan_tci_msk);
			vlan_tci0 = _mm_and_si128(vlan_tci0, vlan_tci_msk);

			pkt_mb3 = _mm_or_si128(pkt_mb3, vlan_tci3);
			pkt_mb2 = _mm_or_si128(pkt_mb2, vlan_tci2);
			pkt_mb1 = _mm_or_si128(pkt_mb1, vlan_tci1);
			pkt_mb0 = _mm_or_si128(pkt_mb0, vlan_tci0);
		} /* if() on VLAN parsing */

		if (offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
			uint32_t mask = 0xFFFFFFFF;
			__m128i ts;
			__m128i ts_low = _mm_setzero_si128();
			__m128i ts_low1;
			__m128i max_ret;
			__m128i cmp_ret;
			uint8_t ret = 0;
			uint8_t shift = 4;
			__m128i ts_desp_mask = _mm_set_epi32(mask, 0, 0, 0);
			__m128i cmp_mask = _mm_set1_epi32(mask);

			ts = _mm_and_si128(descs_bh[0], ts_desp_mask);
			ts_low = _mm_or_si128(ts_low, _mm_srli_si128(ts, 3 * 4));
			ts = _mm_and_si128(descs_bh[1], ts_desp_mask);
			ts_low = _mm_or_si128(ts_low, _mm_srli_si128(ts, 2 * 4));
			ts = _mm_and_si128(descs_bh[2], ts_desp_mask);
			ts_low = _mm_or_si128(ts_low, _mm_srli_si128(ts, 1 * 4));
			ts = _mm_and_si128(descs_bh[3], ts_desp_mask);
			ts_low = _mm_or_si128(ts_low, ts);

			ts_low1 = _mm_slli_si128(ts_low, 4);
			ts_low1 = _mm_and_si128(ts_low1, _mm_set_epi32(mask, mask, mask, 0));
			ts_low1 = _mm_or_si128(ts_low1, hw_low_last);
			hw_low_last = _mm_and_si128(ts_low, _mm_set_epi32(0, 0, 0, mask));

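			/* ts_low now holds the low 32 timestamp bits of
			 * packets 0-3 (lane 0 = packet 0) and ts_low1 pairs
			 * each lane with the preceding packet's value, so a
			 * decrease between neighbours marks a 32-bit rollover
			 * that the inflection_point logic below compensates
			 * for.
			 */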
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 0],
				iavf_timestamp_dynfield_offset, uint32_t *) = _mm_extract_epi32(ts_low, 0);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 1],
				iavf_timestamp_dynfield_offset, uint32_t *) = _mm_extract_epi32(ts_low, 1);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 2],
				iavf_timestamp_dynfield_offset, uint32_t *) = _mm_extract_epi32(ts_low, 2);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 3],
				iavf_timestamp_dynfield_offset, uint32_t *) = _mm_extract_epi32(ts_low, 3);

			if (unlikely(is_tsinit)) {
				uint32_t in_timestamp;

				if (iavf_get_phc_time(rxq))
					PMD_DRV_LOG(ERR, "get physical time failed");
				in_timestamp = *RTE_MBUF_DYNFIELD(rx_pkts[pos + 0],
						iavf_timestamp_dynfield_offset, uint32_t *);
				rxq->phc_time = iavf_tstamp_convert_32b_64b(rxq->phc_time, in_timestamp);
			}

			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 0],
				iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 1],
				iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 2],
				iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
			*RTE_MBUF_DYNFIELD(rx_pkts[pos + 3],
				iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);

			max_ret = _mm_max_epu32(ts_low, ts_low1);
			cmp_ret = _mm_andnot_si128(_mm_cmpeq_epi32(max_ret, ts_low), cmp_mask);

			if (_mm_testz_si128(cmp_ret, cmp_mask)) {
				inflection_point = 0;
			} else {
				inflection_point = 1;
				while (shift > 1) {
					shift = shift >> 1;
					__m128i mask_low = _mm_setzero_si128();
					__m128i mask_high = _mm_setzero_si128();
					switch (shift) {
					case 2:
						mask_low = _mm_set_epi32(0, 0, mask, mask);
						mask_high = _mm_set_epi32(mask, mask, 0, 0);
						break;
					case 1:
						mask_low = _mm_srli_si128(cmp_mask, 4);
						mask_high = _mm_slli_si128(cmp_mask, 4);
						break;
					}
					ret = _mm_testz_si128(cmp_ret, mask_low);
					if (ret) {
						ret = _mm_testz_si128(cmp_ret, mask_high);
						inflection_point += ret ? 0 : shift;
						cmp_mask = mask_high;
					} else {
						cmp_mask = mask_low;
					}
				}
			}
		} /* if() on Timestamp parsing */

		flex_desc_to_olflags_v(rxq, descs, descs_bh, &rx_pkts[pos]);
#else
		flex_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
#endif

		/* C.2 get 4 pkts staterr value */
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb3);
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb2);

		/* C* extract and record EOP bit */
		if (split_packet) {
			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb1);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb0);
		flex_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = rte_popcount64(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
		if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
			inflection_point = (inflection_point <= var) ? inflection_point : 0;
			switch (inflection_point) {
			case 1:
				*RTE_MBUF_DYNFIELD(rx_pkts[pos + 0],
					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
				/* fallthrough */
			case 2:
				*RTE_MBUF_DYNFIELD(rx_pkts[pos + 1],
					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
				/* fallthrough */
			case 3:
				*RTE_MBUF_DYNFIELD(rx_pkts[pos + 2],
					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
				/* fallthrough */
			case 4:
				*RTE_MBUF_DYNFIELD(rx_pkts[pos + 3],
					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
				rxq->phc_time += (uint64_t)1 << 32;
				/* fallthrough */
			case 0:
				break;
			default:
				PMD_DRV_LOG(ERR, "invalid inflection point for rx timestamp");
				break;
			}

			rxq->hw_time_update = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
		}
#endif

		if (likely(var != IAVF_VPMD_DESCS_PER_LOOP))
			break;
	}

#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
#ifdef IAVF_RX_TS_OFFLOAD
	if (nb_pkts_recd > 0 && (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP))
		rxq->phc_time = *RTE_MBUF_DYNFIELD(rx_pkts[nb_pkts_recd - 1],
						   iavf_timestamp_dynfield_offset, uint32_t *);
#endif
#endif

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/* Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 * - nb_pkts > IAVF_VPMD_RX_MAX_BURST: only IAVF_VPMD_RX_MAX_BURST
 *   descriptors' DD bits are scanned
 */
uint16_t
iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/* Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 * - nb_pkts > IAVF_VPMD_RX_MAX_BURST: only IAVF_VPMD_RX_MAX_BURST
 *   descriptors' DD bits are scanned
 */
uint16_t
iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 */
static uint16_t
iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts)
{
	struct iavf_rx_queue *rxq = rx_queue;
	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
	unsigned int i = 0;

	/* get some new buffers */
	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
					      split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
					    &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
		uint16_t burst;

		burst = iavf_recv_scattered_burst_vec(rx_queue,
						      rx_pkts + retval,
						      IAVF_VPMD_RX_MAX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < IAVF_VPMD_RX_MAX_BURST)
			return retval;
	}

	return retval + iavf_recv_scattered_burst_vec(rx_queue,
						      rx_pkts + retval,
						      nb_pkts);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * for flex RxD
 *
 * Notice:
 * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP: no packets are returned
 */
static uint16_t
iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
				       struct rte_mbuf **rx_pkts,
				       uint16_t nb_pkts)
{
	struct iavf_rx_queue *rxq = rx_queue;
	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
	unsigned int i = 0;

	/* get some new buffers */
	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
						       split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
					    &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets for flex RxD
 */
uint16_t
iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
				      struct rte_mbuf **rx_pkts,
				      uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
		uint16_t burst;

		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
							       rx_pkts + retval,
							       IAVF_VPMD_RX_MAX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < IAVF_VPMD_RX_MAX_BURST)
			return retval;
	}

	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
							       rx_pkts + retval,
							       nb_pkts);
}

static inline void
vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
			(IAVF_TX_DESC_DTYPE_DATA |
			 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) |
			 ((uint64_t)pkt->data_len <<
			  IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));

	__m128i descriptor = _mm_set_epi64x(high_qw,
					    pkt->buf_iova + pkt->data_off);
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static inline void
iavf_vtx(volatile struct iavf_tx_desc *txdp, struct rte_mbuf **pkt,
	 uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		vtx1(txdp, *pkt, flags);
}

uint16_t
iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
			  uint16_t nb_pkts)
{
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
	volatile struct iavf_tx_desc *txdp;
	struct ci_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = IAVF_TX_DESC_CMD_EOP | 0x04;  /* bit 2 must be set */
	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
	int i;

	if (txq->nb_free < txq->free_thresh)
		iavf_tx_free_bufs(txq);

	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;
	nb_commit = nb_pkts;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			vtx1(txdp, *tx_pkts, flags);

		vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}

	ci_tx_backlog_entry(txep, tx_pkts, nb_commit);

	iavf_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->next_rs) {
		txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
					 IAVF_TXD_QW1_CMD_SHIFT);
		txq->next_rs =
			(uint16_t)(txq->next_rs + txq->rs_thresh);
	}

	txq->tx_tail = tx_id;

	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_pkts=%u",
		   txq->port_id, txq->queue_id, tx_id, nb_pkts);

	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		   uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		/* crossing the rs_thresh boundary is not allowed */
		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

void __rte_cold
iavf_rx_queue_release_mbufs_sse(struct iavf_rx_queue *rxq)
{
	_iavf_rx_queue_release_mbufs_vec(rxq);
}

void __rte_cold
iavf_tx_queue_release_mbufs_sse(struct iavf_tx_queue *txq)
{
	_iavf_tx_queue_release_mbufs_vec(txq);
}

int __rte_cold
iavf_txq_vec_setup(struct iavf_tx_queue *txq)
{
	txq->rel_mbufs_type = IAVF_REL_MBUFS_SSE_VEC;
	return 0;
}

int __rte_cold
iavf_rxq_vec_setup(struct iavf_rx_queue *rxq)
{
	rxq->rel_mbufs_type = IAVF_REL_MBUFS_SSE_VEC;
	return iavf_rxq_vec_setup_default(rxq);
}

int __rte_cold
iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
{
	return iavf_rx_vec_dev_check_default(dev);
}

int __rte_cold
iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
{
	return iavf_tx_vec_dev_check_default(dev);
}