/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"

#include <rte_vect.h>

static inline __m128i
ice_flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m128i pkt_fdir_bit = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
						    RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
	__m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
					    fdir_mis_mask);
	/* this XOR with all-ones inverts fdir_mask (bitwise NOT) */
	fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
	const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
					  RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxep,
				 ICE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			ICE_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

#if RTE_IOVA_IN_MBUF
		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				 offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

#if RTE_IOVA_IN_MBUF
		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
#else
		/* convert va to dma_addr hdr/data */
		dma_addr0 = _mm_unpacklo_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpacklo_epi64(vaddr1, vaddr1);
#endif

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
	}

	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

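	/* The tail register is written with the index of the last descriptor
	 * just rearmed, i.e. one entry behind rxrearm_start (wrapping to
	 * nb_rx_desc - 1 when rxrearm_start is 0).
	 */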
	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}

static inline void
ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
			 struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i tmp_desc, flags, rss_vlan;

	/* mask everything except checksum, RSS and VLAN flags.
	 * bit7:4 for checksum.
	 * bit12 for RSS indication.
	 * bit13 for VLAN indication.
	 */
	const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
						0x30f0, 0x30f0);
	const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

	/* map the checksum, rss and vlan fields to the checksum, rss
	 * and vlan flag
	 */
	const __m128i cksum_flags =
		_mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);

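	/* Roughly how the table above is consumed (see the shuffle below):
	 * after the descriptor status is masked and shifted right by 4, the
	 * low byte of each 32-bit lane holds the checksum error bits and
	 * indexes this 16-entry byte table. Each entry is pre-shifted right
	 * by 1 so it fits in a byte, with the outer-L4 status packed into
	 * the low bits via the ">> 20"; both shifts are undone after the
	 * lookup. For example, index 0 (no error bits set) yields the
	 * all-GOOD combination.
	 */
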
	const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	/* merge 4 descriptors */
	flags = _mm_unpackhi_epi32(descs[0], descs[1]);
	tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
	tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
	tmp_desc = _mm_and_si128(tmp_desc, desc_mask);

	/* checksum flags */
	tmp_desc = _mm_srli_epi32(tmp_desc, 4);
	flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
	/* then we shift left 1 bit */
	flags = _mm_slli_epi32(flags, 1);

	__m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
	__m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
	l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);

	__m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
	__m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
	flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
	/* we need to mask out the redundant bits introduced by RSS or
	 * VLAN fields.
	 */
	flags = _mm_and_si128(flags, cksum_mask);

	/* RSS, VLAN flag */
	tmp_desc = _mm_srli_epi32(tmp_desc, 8);
	rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);

	/* merge the flags */
	flags = _mm_or_si128(flags, rss_vlan);

	if (rxq->fdir_enabled) {
		const __m128i fdir_id0_1 =
			_mm_unpackhi_epi32(descs[0], descs[1]);

		const __m128i fdir_id2_3 =
			_mm_unpackhi_epi32(descs[2], descs[3]);

		const __m128i fdir_id0_3 =
			_mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);

		const __m128i fdir_flags =
			ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);

		/* merge with fdir_flags */
		flags = _mm_or_si128(flags, fdir_flags);

		/* write fdir_id to mbuf */
		rx_pkts[0]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 0);

		rx_pkts[1]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 1);

		rx_pkts[2]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 2);

		rx_pkts[3]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 3);
	} /* if() on fdir_enabled */

	/**
	 * At this point, we have the 4 sets of flags in the 4 32-bit lanes
	 * of flags.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
	 */
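	/* Note: blend mask 0x30 selects 16-bit words 4-5 (bytes 8-11) of the
	 * destination, which is where the low 32 bits of ol_flags sit within
	 * the 16-byte rearm_data/ol_flags block; the byte shifts below line
	 * each packet's 32-bit flag lane up with that position.
	 */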
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			 offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

static inline void
ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		       uint32_t *ptype_tbl)
{
	const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0);
	__m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
	__m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
	__m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);

	ptype_all = _mm_and_si128(ptype_all, ptype_mask);

	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 1)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 3)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 5)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 7)];
}

/**
 * vPMD raw receive routine, only accepts bursts of nb_pkts >= ICE_DESCS_PER_LOOP
 *
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 * - nb_pkts is floor-aligned to a multiple of ICE_DESCS_PER_LOOP
 */
static inline uint16_t
_ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	__m128i crc_adjust = _mm_set_epi16
				(0, 0, 0,       /* ignore non-length fields */
				 -rxq->crc_len, /* sub crc on data_len */
				 0,             /* ignore high-16bits of pkt_len */
				 -rxq->crc_len, /* sub crc on pkt_len */
				 0, 0           /* ignore pkt_type field */
				);
	const __m128i zero = _mm_setzero_si128();
	/* mask to shuffle from desc. to mbuf */
	const __m128i shuf_msk = _mm_set_epi8
			(0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
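	/* For reference, shuf_msk above arranges each descriptor into
	 * rx_descriptor_fields1 order: packet_type is zeroed (filled in
	 * later from ptype_tbl), pkt_len and data_len both come from
	 * descriptor bytes 4-5, vlan_tci from bytes 10-11, and the hash
	 * word is left zero for the optional RSS parsing further down.
	 */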
	const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0x04, 0x0C,
						   0x00, 0x08);

	/**
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);

	/* 4 packets DD mask */
	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
						0x0000000100000001LL);
	/* 4 packets EOP mask */
	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
						 0x0000000200000002LL);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
	      rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/**
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

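	/* The idea behind the reversed loads below: descriptors are read
	 * newest-first (descs[3] down to descs[0]) with compiler barriers
	 * in between, so if a later descriptor's DD bit is observed as set,
	 * the earlier descriptors (which hardware completed before it) are
	 * also seen as done, keeping the counted packets a contiguous prefix.
	 */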
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
	     pos += ICE_DESCS_PER_LOOP,
	     rxdp += ICE_DESCS_PER_LOOP) {
		__m128i descs[ICE_DESCS_PER_LOOP];
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		/* A.1 load desc[2-0] */
		descs[2] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
		pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);

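		/* The shuffle above only consumed the first 16 bytes of each
		 * 32-byte descriptor; with the flex descriptor profile used
		 * here the RSS hash is delivered in the second half, so it
		 * takes the extra loads below when the RSS_HASH offload is
		 * enabled.
		 */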
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
		/**
		 * need to load the 2nd 16B of each desc for RSS hash parsing;
		 * taking this path causes a performance drop.
		 */
		if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
		    RTE_ETH_RX_OFFLOAD_RSS_HASH) {
			/* load bottom half of every 32B desc */
			const __m128i raw_desc_bh3 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh2 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh1 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh0 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

			/**
			 * to shift the 32b RSS hash value to the
			 * highest 32b of each 128b before mask
			 */
			__m128i rss_hash3 =
				_mm_slli_epi64(raw_desc_bh3, 32);
			__m128i rss_hash2 =
				_mm_slli_epi64(raw_desc_bh2, 32);
			__m128i rss_hash1 =
				_mm_slli_epi64(raw_desc_bh1, 32);
			__m128i rss_hash0 =
				_mm_slli_epi64(raw_desc_bh0, 32);

			__m128i rss_hash_msk =
				_mm_set_epi32(0xFFFFFFFF, 0, 0, 0);

			rss_hash3 = _mm_and_si128
					(rss_hash3, rss_hash_msk);
			rss_hash2 = _mm_and_si128
					(rss_hash2, rss_hash_msk);
			rss_hash1 = _mm_and_si128
					(rss_hash1, rss_hash_msk);
			rss_hash0 = _mm_and_si128
					(rss_hash0, rss_hash_msk);

			pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
			pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
			pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
			pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
		} /* if() on RSS hash parsing */
#endif

		/* C.2 get 4 pkts staterr value */
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb3);
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb2);

		/* C* extract and record EOP bit */
		if (split_packet) {
			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in packet order; that
			 * does not matter when counting DD bits, but it does
			 * for end-of-packet tracking, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
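			/* (~staterr & eop_check) leaves a bit set only for
			 * packets whose EOP bit is clear, i.e. split packets;
			 * eop_shuf_mask then gathers one status byte per
			 * packet, in packet order, into the low 32 bits so
			 * all four split flags can be stored with one write.
			 */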
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += ICE_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb1);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb0);
		ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = rte_popcount64(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != ICE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 * - if nb_pkts > ICE_VPMD_RX_BURST, only the DD bits of ICE_VPMD_RX_BURST
 *   descriptors are scanned
 */
uint16_t
ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		  uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
static uint16_t
ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
						  split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst;

		burst = ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}

	return retval + ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     nb_pkts);
}

static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
	 uint64_t flags)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		ice_vtx1(txdp, *pkt, flags);
}

static uint16_t
ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
	int i;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	nb_commit = nb_pkts;
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = &txq->sw_ring_vec[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			ice_vtx1(txdp, *tx_pkts, flags);

		ice_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->ice_tx_ring[tx_id];
		txep = &txq->sw_ring_vec[tx_id];
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

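/*
 * Note: ice_xmit_fixed_burst_vec() above handles at most tx_rs_thresh
 * packets per call, so the wrapper below simply feeds it tx_rs_thresh-sized
 * slices until the whole request is sent or the ring runs out of free
 * descriptors.
 */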
uint16_t
ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		  uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

int __rte_cold
ice_rxq_vec_setup(struct ice_rx_queue *rxq)
{
	if (!rxq)
		return -1;

	rxq->rx_rel_mbufs = _ice_rx_queue_release_mbufs_vec;
	rxq->mbuf_initializer = ci_rxq_mbuf_initializer(rxq->port_id);
	return 0;
}

int __rte_cold
ice_txq_vec_setup(struct ci_tx_queue *txq __rte_unused)
{
	return 0;
}

int __rte_cold
ice_rx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_rx_vec_dev_check_default(dev);
}

int __rte_cold
ice_tx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_tx_vec_dev_check_default(dev);
}