/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
 * Copyright 2007 Nuova Systems, Inc. All rights reserved.
 */

#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>

#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>

static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
	bool tnl;

	*(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
	mb->data_len = cqd->bytes_written_flags &
		CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
	mb->pkt_len = mb->data_len;
	tnl = enic->overlay_offload && (cqd->completed_index_flags &
		CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
	mb->packet_type =
		enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
	enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
	/* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
	if (tnl) {
		mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
				     RTE_PTYPE_L4_MASK);
	}
	return mb;
}

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct rte_mbuf **rx, **rxmb;
	uint16_t cq_idx, nb_rx, max_rx;
	struct cq_enet_rq_desc *cqd;
	struct rq_enet_desc *rqd;
	struct vnic_cq *cq;
	struct vnic_rq *rq;
	struct enic *enic;
	uint8_t color;

	rq = rx_queue;
	enic = vnic_dev_priv(rq->vdev);
	cq = &enic->cq[enic_cq_rq(enic, rq->index)];
	cq_idx = cq->to_clean;

	/*
	 * Fill up the reserve of free mbufs. Below, we restock the receive
	 * ring with these mbufs to avoid allocation failures.
	 */
	if (rq->num_free_mbufs == 0) {
		if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
					 ENIC_RX_BURST_MAX))
			return 0;
		rq->num_free_mbufs = ENIC_RX_BURST_MAX;
	}
	/* Receive until the end of the ring, at most. */
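	/*
	 * Capping the burst at rq->num_free_mbufs also guarantees that the
	 * RQ restock in Step 4 below never runs out of replacement mbufs.
	 */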
	max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
	max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);

	rxmb = rq->mbuf_ring + cq_idx;
	color = cq->last_color;
	cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
	rx = rx_pkts;
	if (max_rx == 0 ||
	    (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
		return 0;

	/* Step 1: Process one packet to do aligned 256-bit load below */
	if (cq_idx & 0x1) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}
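
	/*
	 * Each 256-bit register loaded in the main loop holds two 16B CQ
	 * descriptors, one per 128-bit lane, and _mm256_shuffle_epi8
	 * shuffles within each lane independently, so the constant tables
	 * below are replicated in both lanes.
	 */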
	const __m256i mask =
		_mm256_set_epi8(/* Second descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0, /* completed_index_flags */
			/* First descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0 /* completed_index_flags */
			);
	const __m256i shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			7, 6, 5, 4, /* rss = rss_hash */
			11, 10, /* vlan_tci = vlan */
			9, 8, /* data_len = bytes_written */
			0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
			/* First descriptor */
			7, 6, 5, 4, /* rss = rss_hash */
			11, 10, /* vlan_tci = vlan */
			9, 8, /* data_len = bytes_written */
			0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */
			);
	/* Used to collect 8 flags from 8 desc into one register */
	const __m256i flags_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/* First descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/*
			 * Byte 3: upper byte of completed_index_flags
			 *         bit 5 = fcoe (tunnel)
			 * Byte 2: upper byte of q_number_rss_type_flags
			 *         bits 2,3,4,5 = rss type
			 *         bit 6 = csum_not_calc
			 * Byte 1: upper byte of bytes_written_flags
			 *         bit 6 = truncated
			 *         bit 7 = vlan stripped
			 * Byte 0: flags
			 */
			1, 3, 9, 14
			);
	/* Used to collect 8 VLAN IDs from 8 desc into one register */
	const __m256i vlan_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			/* First descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10);
	/* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
	const __m256i rss_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0, /* rss_types = 0 */
			/* first 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0 /* rss_types = 0 */);
	/*
	 * VLAN offload flags.
	 * shuffle index:
	 * vlan_stripped => bit 0
	 * vlan_id == 0 => bit 1
	 */
	const __m256i vlan_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
	/* Use the same shuffle index as vlan_shuffle */
	const __m256i vlan_ptype_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER_VLAN);
	/*
	 * CKSUM flags. Shift right so they fit into 8-bit integers.
	 * shuffle index:
	 * ipv4_csum_ok => bit 3
	 * ip4 => bit 2
	 * tcp_or_udp => bit 1
	 * tcp_udp_csum_ok => bit 0
	 */
	const __m256i csum_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			/* 1111 ip4+ip4_ok+l4+l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 1110 ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1), /* 1010 l4+!l4_ok */
			0, /* 1001 */
			0, /* 1000 */
			/* 0111 !ip4_ok+ip4+l4+l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 0110 !ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
			(PKT_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1), /* 0010 l4+!l4_ok */
			0, /* 0001 */
			0, /* 0000 */
			/* first 128 bits */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0,
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0);
	/*
	 * Non-fragment PTYPEs.
	 * Shuffle 4-bit index:
	 * ip6 => bit 0
	 * ip4 => bit 1
	 * udp => bit 2
	 * tcp => bit 3
	 *   bit
	 * 3 2 1 0
	 * -------
	 * 0 0 0 0 unknown
	 * 0 0 0 1 ip6 | nonfrag
	 * 0 0 1 0 ip4 | nonfrag
	 * 0 0 1 1 unknown
	 * 0 1 0 0 unknown
	 * 0 1 0 1 ip6 | udp
	 * 0 1 1 0 ip4 | udp
	 * 0 1 1 1 unknown
	 * 1 0 0 0 unknown
	 * 1 0 0 1 ip6 | tcp
	 * 1 0 1 0 ip4 | tcp
	 * 1 0 1 1 unknown
	 * 1 1 0 0 unknown
	 * 1 1 0 1 unknown
	 * 1 1 1 0 unknown
	 * 1 1 1 1 unknown
	 *
	 * PTYPEs do not fit in 8 bits, so shift right 4.
	 */
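	/*
	 * For example, an IPv4 TCP packet sets ip4 (bit 1) and tcp (bit 3),
	 * giving index 1010b, which selects
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP below.
	 */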
	const __m256i nonfrag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
	/* Fragment PTYPEs. Use the same shuffle index as above. */
	const __m256i frag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
	/*
	 * Tunnel PTYPEs. Use the same shuffle index as above.
	 * L4 types are not part of this table. They come from non-tunnel
	 * types above.
	 */
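	/*
	 * The RTE_PTYPE_INNER_L3_* values are stored right-shifted by 16 so
	 * they fit the byte-wide shuffle table; they are shifted back up in
	 * the loop below.
	 */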
351 */ 352 const __m256i tnl_l3_ptype_shuffle = 353 _mm256_set_epi8(/* second 128 bits */ 354 RTE_PTYPE_UNKNOWN, 355 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 356 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 357 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 358 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 359 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 360 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 361 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 362 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 363 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 364 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 365 RTE_PTYPE_UNKNOWN, 366 /* first 128 bits */ 367 RTE_PTYPE_UNKNOWN, 368 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 369 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 370 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 371 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 372 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 373 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 374 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 375 RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, 376 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16, 377 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16, 378 RTE_PTYPE_UNKNOWN); 379 380 const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer, 381 0, enic->mbuf_initializer); 382 383 /* 384 * --- cq desc fields --- offset 385 * completed_index_flags - 0 use: fcoe 386 * q_number_rss_type_flags - 2 use: rss types, csum_not_calc 387 * rss_hash - 4 ==> mbuf.hash.rss 388 * bytes_written_flags - 8 ==> mbuf.pkt_len,data_len 389 * use: truncated, vlan_stripped 390 * vlan - 10 ==> mbuf.vlan_tci 391 * checksum_fcoe - 12 (unused) 392 * flags - 14 use: all bits 393 * type_color - 15 (unused) 394 * 395 * --- mbuf fields --- offset 396 * rearm_data ---- 16 397 * data_off - 0 (mbuf_init) -+ 398 * refcnt - 2 (mbuf_init) | 399 * nb_segs - 4 (mbuf_init) | 16B 128b 400 * port - 6 (mbuf_init) | 401 * ol_flag - 8 (from cqd) -+ 402 * rx_descriptor_fields1 ---- 32 403 * packet_type - 0 (from cqd) -+ 404 * pkt_len - 4 (from cqd) | 405 * data_len - 8 (from cqd) | 16B 128b 406 * vlan_tci - 10 (from cqd) | 407 * rss - 12 (from cqd) -+ 408 */ 409 410 __m256i overlay_enabled = 411 _mm256_set1_epi32((uint32_t)enic->overlay_offload); 412 413 /* Step 2: Process 8 packets per loop using SIMD */ 414 while (max_rx > 7 && (((cqd + 7)->type_color & 415 CQ_DESC_COLOR_MASK_NOSHIFT) != color)) { 416 /* Load 8 16B CQ descriptors */ 417 __m256i cqd01 = _mm256_load_si256((void *)cqd); 418 __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2)); 419 __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4)); 420 __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6)); 421 /* Copy 8 mbuf pointers to rx_pkts */ 422 _mm256_storeu_si256((void *)rx, 423 _mm256_loadu_si256((void *)rxmb)); 424 _mm256_storeu_si256((void *)(rx + 4), 425 _mm256_loadu_si256((void *)(rxmb + 4))); 426 427 /* 428 * Collect 8 flags (each 32 bits) into one register. 429 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc 430 */ 431 __m256i flags01 = 432 _mm256_shuffle_epi8(cqd01, flags_shuffle_mask); 433 /* 434 * Shuffle above produces 8 x 32-bit flags for 8 descriptors 435 * in this order: 0, 0, 0, 0, 1, 1, 1, 1 436 * The duplicates in each 128-bit lane simplifies blending 437 * below. 
438 */ 439 __m256i flags23 = 440 _mm256_shuffle_epi8(cqd23, flags_shuffle_mask); 441 __m256i flags45 = 442 _mm256_shuffle_epi8(cqd45, flags_shuffle_mask); 443 __m256i flags67 = 444 _mm256_shuffle_epi8(cqd67, flags_shuffle_mask); 445 /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */ 446 __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22); 447 /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */ 448 __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88); 449 /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */ 450 __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc); 451 /* 452 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6 453 * This order simplifies blend operations way below that 454 * produce 'rearm' data for each mbuf. 455 */ 456 flags0_7 = _mm256_permute4x64_epi64(flags0_7, 457 (1 << 6) + (0 << 4) + (3 << 2) + 2); 458 459 /* 460 * Check truncated bits and bail out early on. 461 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc 462 */ 463 __m256i trunc = 464 _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31); 465 trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc, 466 (1 << 6) + (0 << 4) + (3 << 2) + 2)); 467 /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */ 468 if (_mm256_extract_epi64(trunc, 0) || 469 _mm256_extract_epi64(trunc, 1)) 470 break; 471 472 /* 473 * Compute PKT_RX_RSS_HASH. 474 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc 475 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28 476 * Everything else is zero. 477 */ 478 __m256i rss_types = 479 _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28); 480 /* 481 * RSS flags (PKT_RX_RSS_HASH) are in 482 * byte 0, 4, 8, 12, 16, 20, 24, 28 483 * Everything else is zero. 484 */ 485 __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types); 486 487 /* 488 * Compute CKSUM flags. First build the index and then 489 * use it to shuffle csum_shuffle. 490 * 20 instructions including const loads: 2.5 inst/desc 491 */ 492 /* 493 * csum_not_calc (bit 22) 494 * csum_not_calc (0) => 0xffffffff 495 * csum_not_calc (1) => 0x0 496 */ 497 const __m256i zero4 = _mm256_setzero_si256(); 498 const __m256i mask22 = _mm256_set1_epi32(0x400000); 499 __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4, 500 _mm256_and_si256(flags0_7, mask22)); 501 /* 502 * (tcp|udp) && !fragment => bit 1 503 * tcp = bit 2, udp = bit 1, frag = bit 6 504 */ 505 const __m256i mask1 = _mm256_set1_epi32(0x2); 506 __m256i tcp_udp = 507 _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5), 508 _mm256_or_si256(flags0_7, 509 _mm256_srli_epi32(flags0_7, 1))); 510 tcp_udp = _mm256_and_si256(tcp_udp, mask1); 511 /* ipv4 (bit 5) => bit 2 */ 512 const __m256i mask2 = _mm256_set1_epi32(0x4); 513 __m256i ipv4 = _mm256_and_si256(mask2, 514 _mm256_srli_epi32(flags0_7, 3)); 515 /* 516 * ipv4_csum_ok (bit 3) => bit 3 517 * tcp_udp_csum_ok (bit 0) => bit 0 518 * 0x9 519 */ 520 const __m256i mask0_3 = _mm256_set1_epi32(0x9); 521 __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3); 522 csum_idx = _mm256_and_si256(csum_not_calc, 523 _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4), 524 tcp_udp)); 525 __m256i csum_flags = 526 _mm256_shuffle_epi8(csum_shuffle, csum_idx); 527 /* Shift left to restore CKSUM flags. See csum_shuffle. */ 528 csum_flags = _mm256_slli_epi32(csum_flags, 1); 529 /* Combine csum flags and offload flags: 0.125 inst/desc */ 530 rss_flags = _mm256_or_si256(rss_flags, csum_flags); 531 532 /* 533 * Collect 8 VLAN IDs and compute vlan_id != 0 on each. 

		/*
		 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
		 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
		 * 1.25 inst/desc
		 */
		__m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
		__m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
		__m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
		__m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
		__m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
		__m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
		/* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
		/* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
		vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/*
		 * Compare 0 == vlan_id produces 0xffffffff (-1) if
		 * vlan 0 and 0 if vlan non-0. Then subtracting the
		 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
		 * 0 - 0 = 0 for vlan non-0.
		 */
		vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
		/* vlan_id != 0 => 0, vlan_id == 0 => 1 */
		vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

		/*
		 * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
		 * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
		 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i vlan_idx =
			_mm256_or_si256(/* vlan_stripped => bit 0 */
				_mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
					16), 31),
				/* (vlan_id == 0) => bit 1 */
				_mm256_slli_epi32(vlan0_7, 1));
		/*
		 * The index captures 4 cases.
		 * stripped, id = 0   ==> 11b = 3
		 * stripped, id != 0  ==> 01b = 1
		 * not strip, id == 0 ==> 10b = 2
		 * not strip, id != 0 ==> 00b = 0
		 */
		__m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
			vlan_idx);
		/* Combine vlan and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

		/*
		 * Compute non-tunnel PTYPEs.
		 * 17 inst / 8 desc = 2.125 inst/desc
		 */
		/* ETHER and ETHER_VLAN */
		__m256i vlan_ptype =
			_mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
				vlan_idx);
		/* Build the ptype index from flags */
		tcp_udp = _mm256_slli_epi32(flags0_7, 29);
		tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
		__m256i ip4_ip6 =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
		__m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
		__m256i frag_bit =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
		__m256i nonfrag_ptype =
			_mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
		__m256i frag_ptype =
			_mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
		/*
		 * Zero out the unwanted types and combine the remaining bits.
		 * The effect is the same as selecting non-frag or frag types
		 * depending on the frag bit.
		 */
		nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
			_mm256_cmpeq_epi32(zero4, frag_bit));
		frag_ptype = _mm256_and_si256(frag_ptype,
			_mm256_cmpgt_epi32(frag_bit, zero4));
		__m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
		ptype = _mm256_slli_epi32(ptype, 4);
		/*
		 * Compute tunnel PTYPEs.
		 * 15 inst / 8 desc = 1.875 inst/desc
		 */
		__m256i tnl_l3_ptype =
			_mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
		tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
		/*
		 * Shift non-tunnel L4 types to make them tunnel types.
		 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
		 */
		__m256i tnl_l4_ptype =
			_mm256_slli_epi32(_mm256_and_si256(ptype,
				_mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
		__m256i tnl_ptype =
			_mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
		tnl_ptype = _mm256_or_si256(tnl_ptype,
			_mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
				RTE_PTYPE_INNER_L2_ETHER));
		/*
		 * Select non-tunnel or tunnel types by zeroing out the
		 * unwanted ones.
		 */
		__m256i tnl_flags = _mm256_and_si256(overlay_enabled,
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
		tnl_ptype = _mm256_and_si256(tnl_ptype,
			_mm256_sub_epi32(zero4, tnl_flags));
		ptype = _mm256_and_si256(ptype,
			_mm256_cmpeq_epi32(zero4, tnl_flags));
		/*
		 * Combine types and swap to have ptypes in the same order
		 * as desc.
		 * desc: 0 2 4 6 1 3 5 7
		 * 3 inst / 8 desc = 0.375 inst/desc
		 */
		ptype = _mm256_or_si256(ptype, tnl_ptype);
		ptype = _mm256_or_si256(ptype, vlan_ptype);
		ptype = _mm256_permute4x64_epi64(ptype,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Mask packet length.
		 * Use 4 ands: 0.5 instructions/desc
		 */
		cqd01 = _mm256_and_si256(cqd01, mask);
		cqd23 = _mm256_and_si256(cqd23, mask);
		cqd45 = _mm256_and_si256(cqd45, mask);
		cqd67 = _mm256_and_si256(cqd67, mask);
		/*
		 * Shuffle. Two 16B sets of the mbuf fields.
		 * packet_type, pkt_len, data_len, vlan_tci, rss
		 */
		__m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
		__m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
		__m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
		__m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

		/*
		 * Blend in ptypes
		 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
		 */
		rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
		rearm23 = _mm256_blend_epi32(rearm23,
			_mm256_shuffle_epi32(ptype, 1), 0x11);
		rearm45 = _mm256_blend_epi32(rearm45,
			_mm256_shuffle_epi32(ptype, 2), 0x11);
		rearm67 = _mm256_blend_epi32(rearm67,
			_mm256_shuffle_epi32(ptype, 3), 0x11);

		/*
		 * Move rss_flags into ol_flags in mbuf_init.
		 * Use 1 shift and 1 blend for each desc: 2 inst/desc
		 */
		__m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
			rss_flags, 0x44);
		__m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 4), 0x44);
		__m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 8), 0x44);
		__m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
			_mm256_srli_si256(rss_flags, 4), 0x44);

		/*
		 * Build rearm, one per desc.
		 * 8 blends and 4 permutes: 1.5 inst/desc
		 */
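		/*
		 * Only the even-numbered rearm registers need the 128-bit
		 * lane swap below; the earlier 1, 3, 5, 7, 0, 2, 4, 6 permute
		 * of the flags places the odd descriptors' rearm halves in
		 * the low lane, so their blends already come out in store
		 * order.
		 */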
		__m256i rearm0 = _mm256_blend_epi32(rearm01,
			mbuf_init0_1, 0xf0);
		__m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
			rearm01, 0xf0);
		__m256i rearm2 = _mm256_blend_epi32(rearm23,
			mbuf_init2_3, 0xf0);
		__m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
			rearm23, 0xf0);
		/* Swap upper and lower 128 bits */
		rearm0 = _mm256_permute4x64_epi64(rearm0,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm2 = _mm256_permute4x64_epi64(rearm2,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/* Second set of 4 descriptors */
		__m256i rearm4 = _mm256_blend_epi32(rearm45,
			mbuf_init4_5, 0xf0);
		__m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
			rearm45, 0xf0);
		__m256i rearm6 = _mm256_blend_epi32(rearm67,
			mbuf_init6_7, 0xf0);
		__m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
			rearm67, 0xf0);
		rearm4 = _mm256_permute4x64_epi64(rearm4,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm6 = _mm256_permute4x64_epi64(rearm6,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Write out 32B of mbuf fields.
		 * data_off    - off 0 (mbuf_init)
		 * refcnt      - 2     (mbuf_init)
		 * nb_segs     - 4     (mbuf_init)
		 * port        - 6     (mbuf_init)
		 * ol_flag     - 8     (from cqd)
		 * packet_type - 16    (from cqd)
		 * pkt_len     - 20    (from cqd)
		 * data_len    - 24    (from cqd)
		 * vlan_tci    - 26    (from cqd)
		 * rss         - 28    (from cqd)
		 */
		_mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
		_mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
		_mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
		_mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
		_mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
		_mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
		_mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
		_mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);

		max_rx -= 8;
		cqd += 8;
		rx += 8;
		rxmb += 8;
	}

	/*
	 * Step 3: Slow path to handle a small (<8) number of packets and
	 * occasional truncated packets.
	 */
757 */ 758 while (max_rx && ((cqd->type_color & 759 CQ_DESC_COLOR_MASK_NOSHIFT) != color)) { 760 if (unlikely(cqd->bytes_written_flags & 761 CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) { 762 rte_pktmbuf_free(*rxmb++); 763 rte_atomic64_inc(&enic->soft_stats.rx_packet_errors); 764 } else { 765 *rx++ = rx_one(cqd, *rxmb++, enic); 766 } 767 cqd++; 768 max_rx--; 769 } 770 771 /* Number of descriptors visited */ 772 nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx; 773 if (nb_rx == 0) 774 return 0; 775 rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx; 776 rxmb = rq->mbuf_ring + cq_idx; 777 cq_idx += nb_rx; 778 rq->rx_nb_hold += nb_rx; 779 if (unlikely(cq_idx == cq->ring.desc_count)) { 780 cq_idx = 0; 781 cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT; 782 } 783 cq->to_clean = cq_idx; 784 785 /* Step 4: Restock RQ with new mbufs */ 786 memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs, 787 sizeof(struct rte_mbuf *) * nb_rx); 788 rq->num_free_mbufs -= nb_rx; 789 while (nb_rx) { 790 rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM; 791 nb_rx--; 792 rqd++; 793 rxmb++; 794 } 795 if (rq->rx_nb_hold > rq->rx_free_thresh) { 796 rq->posted_index = enic_ring_add(rq->ring.desc_count, 797 rq->posted_index, 798 rq->rx_nb_hold); 799 rq->rx_nb_hold = 0; 800 rte_wmb(); 801 iowrite32_relaxed(rq->posted_index, 802 &rq->ctrl->posted_index); 803 } 804 805 return rx - rx_pkts; 806 } 807 808 bool 809 enic_use_vector_rx_handler(struct enic *enic) 810 { 811 struct rte_eth_dev *eth_dev; 812 struct rte_fdir_conf *fconf; 813 814 eth_dev = enic->rte_dev; 815 /* User needs to request for the avx2 handler */ 816 if (!enic->enable_avx2_rx) 817 return false; 818 /* Do not support scatter Rx */ 819 if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0)) 820 return false; 821 /* Do not support fdir/flow */ 822 fconf = ð_dev->data->dev_conf.fdir_conf; 823 if (fconf->mode != RTE_FDIR_MODE_NONE) 824 return false; 825 if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) { 826 ENICPMD_LOG(DEBUG, " use the non-scatter avx2 Rx handler"); 827 eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts; 828 return true; 829 } 830 return false; 831 } 832