/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc.  All rights reserved.
 * Copyright 2007 Nuova Systems, Inc.  All rights reserved.
 */

#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>

#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>

static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
	bool tnl;

	*(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
	mb->data_len = cqd->bytes_written_flags &
		CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
	mb->pkt_len = mb->data_len;
	tnl = enic->overlay_offload && (cqd->completed_index_flags &
					CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
	mb->packet_type =
		enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
	enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
	/* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
	if (tnl) {
		mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
				     RTE_PTYPE_L4_MASK);
	}
	return mb;
}
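/*
 * rx_one() is the scalar fallback used by the vector receive routine
 * below: once in Step 1 for a single leading descriptor when the
 * completion index is odd (so the 256-bit loads stay aligned), and again
 * in Step 3 for the tail of fewer than 8 descriptors or after a
 * truncated packet.
 */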

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct rte_mbuf **rx, **rxmb;
	uint16_t cq_idx, nb_rx, max_rx;
	struct cq_enet_rq_desc *cqd;
	struct rq_enet_desc *rqd;
	struct vnic_cq *cq;
	struct vnic_rq *rq;
	struct enic *enic;
	uint8_t color;

	rq = rx_queue;
	enic = vnic_dev_priv(rq->vdev);
	cq = &enic->cq[enic_cq_rq(enic, rq->index)];
	cq_idx = cq->to_clean;

	/*
	 * Fill up the reserve of free mbufs. Below, we restock the receive
	 * ring with these mbufs to avoid allocation failures.
	 */
	if (rq->num_free_mbufs == 0) {
		if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
					 ENIC_RX_BURST_MAX))
			return 0;
		rq->num_free_mbufs = ENIC_RX_BURST_MAX;
	}
	/* Receive until the end of the ring, at most. */
	max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
	max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);

	rxmb = rq->mbuf_ring + cq_idx;
	color = cq->last_color;
	cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
	rx = rx_pkts;
	if (max_rx == 0 ||
	    (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
		return 0;

	/* Step 1: Process one packet so the 256-bit loads below are aligned */
	if (cq_idx & 0x1) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}

	const __m256i mask =
		_mm256_set_epi8(/* Second descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0, /* completed_index_flags */
			/* First descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0 /* completed_index_flags */
			);
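	/*
	 * A note on the AND mask above (assuming the little-endian 16-byte
	 * descriptor layout documented further below): the two bytes of
	 * bytes_written_flags keep only 0x3fff, i.e. the
	 * CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK length bits, while
	 * completed_index_flags and checksum_fcoe are cleared entirely.
	 * The flag bits consumed later are taken from the unmasked
	 * descriptor copies (via flags_shuffle_mask), not from the masked
	 * ones, which feed only the rx_descriptor_fields1 shuffle.
	 */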
	const __m256i shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			7, 6, 5, 4,             /* rss = rss_hash */
			11, 10,                 /* vlan_tci = vlan */
			9, 8,                   /* data_len = bytes_written */
			0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
			/* First descriptor */
			7, 6, 5, 4,             /* rss = rss_hash */
			11, 10,                 /* vlan_tci = vlan */
			9, 8,                   /* data_len = bytes_written */
			0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80  /* packet_type = 0 */
			);
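	/*
	 * vpshufb detail: a control byte with its high bit set (0x80 here)
	 * zeroes the corresponding output byte instead of copying one.
	 * That is how the 16-bit bytes_written value is widened into the
	 * 32-bit pkt_len above: control bytes 0x80, 0x80, 9, 8 copy
	 * descriptor bytes 8-9 into the low half of pkt_len and force the
	 * upper half to zero.  packet_type is zeroed the same way and
	 * blended in later from the ptype computation.
	 */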
	/* Used to collect 8 flags from 8 desc into one register */
	const __m256i flags_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/* First descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/*
			 * Byte 3: upper byte of completed_index_flags
			 *         bit 5 = fcoe (tunnel)
			 * Byte 2: upper byte of q_number_rss_type_flags
			 *         bits 2,3,4,5 = rss type
			 *         bit 6 = csum_not_calc
			 * Byte 1: upper byte of bytes_written_flags
			 *         bit 6 = truncated
			 *         bit 7 = vlan stripped
			 * Byte 0: flags
			 */
			1, 3, 9, 14
			);
	/* Used to collect 8 VLAN IDs from 8 desc into one register */
	const __m256i vlan_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			/* First descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10);
	/* PKT_RX_RSS_HASH is 1 << 1, so it fits in an 8-bit integer */
	const __m256i rss_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0, /* rss_types = 0 */
			/* first 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0 /* rss_types = 0 */);
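	/*
	 * The 4-bit RSS type extracted from each descriptor is used as a
	 * vpshufb index into rss_shuffle: index 0 (no hash) selects 0,
	 * every other index selects PKT_RX_RSS_HASH.  Keeping the flag
	 * value byte-sized is what lets a single byte shuffle do the
	 * lookup.
	 */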
	/*
	 * VLAN offload flags.
	 * shuffle index:
	 * vlan_stripped => bit 0
	 * vlan_id == 0  => bit 1
	 */
	const __m256i vlan_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
	/* Use the same shuffle index as vlan_shuffle */
	const __m256i vlan_ptype_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER_VLAN);
	/*
	 * CKSUM flags. Shift right so they fit in 8-bit integers.
	 * shuffle index:
	 * ipv4_csum_ok    => bit 3
	 * ip4             => bit 2
	 * tcp_or_udp      => bit 1
	 * tcp_udp_csum_ok => bit 0
	 */
	const __m256i csum_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			/* 1111 ip4+ip4_ok+l4+l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 1110 ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1),  /* 1010 l4+!l4_ok */
			0, /* 1001 */
			0, /* 1000 */
			/* 0111 !ip4_ok+ip4+l4+l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 0110 !ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),  /* 0101 !ip4_ok+ip4 */
			(PKT_RX_IP_CKSUM_BAD >> 1),  /* 0100 !ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1),  /* 0010 l4+!l4_ok */
			0, /* 0001 */
			0, /* 0000 */
			/* first 128 bits */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0,
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0);
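	/*
	 * Worked example for the checksum table: a clean IPv4/TCP packet
	 * has ipv4_csum_ok, ip4, tcp_or_udp and tcp_udp_csum_ok set,
	 * giving index 1111b, which selects
	 * (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1.  The entries
	 * are stored pre-shifted right by one because not all of the flag
	 * values fit in the 8-bit table slots; the shift is undone after
	 * the shuffle (see csum_flags below).
	 */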
	/*
	 * Non-fragment PTYPEs.
	 * Shuffle 4-bit index:
	 * ip6 => bit 0
	 * ip4 => bit 1
	 * udp => bit 2
	 * tcp => bit 3
	 *   bit
	 * 3 2 1 0
	 * -------
	 * 0 0 0 0 unknown
	 * 0 0 0 1 ip6 | nonfrag
	 * 0 0 1 0 ip4 | nonfrag
	 * 0 0 1 1 unknown
	 * 0 1 0 0 unknown
	 * 0 1 0 1 ip6 | udp
	 * 0 1 1 0 ip4 | udp
	 * 0 1 1 1 unknown
	 * 1 0 0 0 unknown
	 * 1 0 0 1 ip6 | tcp
	 * 1 0 1 0 ip4 | tcp
	 * 1 0 1 1 unknown
	 * 1 1 0 0 unknown
	 * 1 1 0 1 unknown
	 * 1 1 1 0 unknown
	 * 1 1 1 1 unknown
	 *
	 * PTYPEs do not fit in 8 bits, so shift right by 4.
	 */
	const __m256i nonfrag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
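	/*
	 * As with the checksum table, the PTYPE values are stored shifted
	 * right by 4 so each entry fits in one shuffle-table byte; the
	 * shift is undone after the fragment/non-fragment selection
	 * (ptype <<= 4 below).  For example, index 1010b (tcp + ip4)
	 * selects (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4.
	 */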
	/* Fragment PTYPEs. Use the same shuffle index as above. */
	const __m256i frag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
	/*
	 * Tunnel PTYPEs. Use the same shuffle index as above.
	 * L4 types are not part of this table. They come from non-tunnel
	 * types above.
	 */
	const __m256i tnl_l3_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN);

	const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
						    0, enic->mbuf_initializer);

	/*
	 * --- cq desc fields ---    offset
	 * completed_index_flags    - 0   use: fcoe
	 * q_number_rss_type_flags  - 2   use: rss types, csum_not_calc
	 * rss_hash                 - 4   ==> mbuf.hash.rss
	 * bytes_written_flags      - 8   ==> mbuf.pkt_len,data_len
	 *                                use: truncated, vlan_stripped
	 * vlan                     - 10  ==> mbuf.vlan_tci
	 * checksum_fcoe            - 12  (unused)
	 * flags                    - 14  use: all bits
	 * type_color               - 15  (unused)
	 *
	 * --- mbuf fields ---       offset
	 * rearm_data              ---- 16
	 * data_off    - 0      (mbuf_init) -+
	 * refcnt      - 2      (mbuf_init)  |
	 * nb_segs     - 4      (mbuf_init)  | 16B 128b
	 * port        - 6      (mbuf_init)  |
	 * ol_flag     - 8      (from cqd)  -+
	 * rx_descriptor_fields1   ---- 32
	 * packet_type - 0      (from cqd)  -+
	 * pkt_len     - 4      (from cqd)   |
	 * data_len    - 8      (from cqd)   | 16B 128b
	 * vlan_tci    - 10     (from cqd)   |
	 * rss         - 12     (from cqd)  -+
	 */
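	/*
	 * Scalar sketch of what the vector loop below produces for each
	 * mbuf, mirroring rx_one() and the field map above (illustrative
	 * only; the real work is done with 256-bit loads, shuffles and
	 * blends):
	 *
	 *	*(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
	 *	mb->ol_flags |= rss, checksum and VLAN flags from cqd;
	 *	mb->packet_type = ptype derived from cqd flags;
	 *	mb->pkt_len = mb->data_len =
	 *		bytes_written & CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
	 *	mb->vlan_tci = cqd->vlan;
	 *	mb->hash.rss = cqd->rss_hash;
	 */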

	__m256i overlay_enabled =
		_mm256_set1_epi32((uint32_t)enic->overlay_offload);

	/* Step 2: Process 8 packets per loop using SIMD */
	while (max_rx > 7 && (((cqd + 7)->type_color &
			       CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		/* Load 8 16B CQ descriptors */
		__m256i cqd01 = _mm256_load_si256((void *)cqd);
		__m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
		__m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
		__m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
		/* Copy 8 mbuf pointers to rx_pkts */
		_mm256_storeu_si256((void *)rx,
				    _mm256_loadu_si256((void *)rxmb));
		_mm256_storeu_si256((void *)(rx + 4),
				    _mm256_loadu_si256((void *)(rxmb + 4)));
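		/*
		 * The pointer copies above are speculative: if this
		 * iteration bails out at the truncated-packet check below,
		 * rx has not been advanced yet, so the extra pointers are
		 * either overwritten by the slow path or fall beyond the
		 * returned packet count.
		 */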

		/*
		 * Collect 8 flags (each 32 bits) into one register.
		 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
		 */
		__m256i flags01 =
			_mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
		/*
		 * Shuffle above produces 8 x 32-bit flags for 8 descriptors
		 * in this order: 0, 0, 0, 0, 1, 1, 1, 1
		 * The duplicates in each 128-bit lane simplify the blending
		 * below.
		 */
		__m256i flags23 =
			_mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
		__m256i flags45 =
			_mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
		__m256i flags67 =
			_mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
		/* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
		__m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
		/* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
		__m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
		/* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
		/*
		 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
		 * This order simplifies blend operations way below that
		 * produce 'rearm' data for each mbuf.
		 */
		flags0_7 = _mm256_permute4x64_epi64(flags0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
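		/*
		 * The permute4x64 control (1 << 6) + (0 << 4) + (3 << 2) + 2
		 * selects the 64-bit lanes in the order 2, 3, 0, 1, i.e. it
		 * swaps the two 128-bit halves of the register.  The same
		 * constant is used for every half-swap in this function.
		 */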

		/*
		 * Check the truncated bits and bail out early if any are set.
		 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
		 */
		__m256i trunc =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
		trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
			(1 << 6) + (0 << 4) + (3 << 2) + 2));
		/* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
		if (_mm256_extract_epi64(trunc, 0) ||
		    _mm256_extract_epi64(trunc, 1))
			break;
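		/*
		 * About the check above: the shift pair (<< 17, >> 31)
		 * isolates bit 14 of each gathered flags word, i.e. the
		 * truncated bit from the upper byte of bytes_written_flags.
		 * The 64-bit add folds the lanes so that two extracts are
		 * enough to test all 8 descriptors at once.
		 */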

		/*
		 * Compute PKT_RX_RSS_HASH.
		 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
		 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_types =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
		/*
		 * RSS flags (PKT_RX_RSS_HASH) are in
		 * byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);

		/*
		 * Compute CKSUM flags. First build the index and then
		 * use it to shuffle csum_shuffle.
		 * 20 instructions including const loads: 2.5 inst/desc
		 */
		/*
		 * csum_not_calc (bit 22)
		 * csum_not_calc (0) => 0xffffffff
		 * csum_not_calc (1) => 0x0
		 */
		const __m256i zero4 = _mm256_setzero_si256();
		const __m256i mask22 = _mm256_set1_epi32(0x400000);
		__m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
			_mm256_and_si256(flags0_7, mask22));
		/*
		 * (tcp|udp) && !fragment => bit 1
		 * tcp = bit 2, udp = bit 1, frag = bit 6
		 */
		const __m256i mask1 = _mm256_set1_epi32(0x2);
		__m256i tcp_udp =
			_mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
				_mm256_or_si256(flags0_7,
					_mm256_srli_epi32(flags0_7, 1)));
		tcp_udp = _mm256_and_si256(tcp_udp, mask1);
		/* ipv4 (bit 5) => bit 2 */
		const __m256i mask2 = _mm256_set1_epi32(0x4);
		__m256i ipv4 = _mm256_and_si256(mask2,
			_mm256_srli_epi32(flags0_7, 3));
		/*
		 * ipv4_csum_ok (bit 3) => bit 3
		 * tcp_udp_csum_ok (bit 0) => bit 0
		 * 0x9
		 */
		const __m256i mask0_3 = _mm256_set1_epi32(0x9);
		__m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
		csum_idx = _mm256_and_si256(csum_not_calc,
			_mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
				tcp_udp));
		__m256i csum_flags =
			_mm256_shuffle_epi8(csum_shuffle, csum_idx);
		/* Shift left to restore CKSUM flags. See csum_shuffle. */
		csum_flags = _mm256_slli_epi32(csum_flags, 1);
		/* Combine csum flags and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, csum_flags);

		/*
		 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
		 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
		 * 1.25 inst/desc
		 */
		__m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
		__m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
		__m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
		__m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
		__m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
		__m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
		/* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
		/* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
		vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/*
		 * Compare 0 == vlan_id produces 0xffffffff (-1) if
		 * vlan 0 and 0 if vlan non-0. Then subtracting the
		 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
		 * 0 - 0 = 0 for vlan non-0.
		 */
		vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
		/* vlan_id != 0 => 0, vlan_id == 0 => 1 */
		vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

		/*
		 * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
		 * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
		 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i vlan_idx =
			_mm256_or_si256(/* vlan_stripped => bit 0 */
				_mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
					16), 31),
				/* (vlan_id == 0) => bit 1 */
				_mm256_slli_epi32(vlan0_7, 1));
		/*
		 * The index captures 4 cases.
		 * stripped, id = 0   ==> 11b = 3
		 * stripped, id != 0  ==> 01b = 1
		 * not strip, id == 0 ==> 10b = 2
		 * not strip, id != 0 ==> 00b = 0
		 */
		__m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
			vlan_idx);
		/* Combine vlan and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

		/*
		 * Compute non-tunnel PTYPEs.
		 * 17 inst / 8 desc = 2.125 inst/desc
		 */
		/* ETHER and ETHER_VLAN */
		__m256i vlan_ptype =
			_mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
				vlan_idx);
		/* Build the ptype index from flags */
		tcp_udp = _mm256_slli_epi32(flags0_7, 29);
		tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
		__m256i ip4_ip6 =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
		__m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
		__m256i frag_bit =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
		__m256i nonfrag_ptype =
			_mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
		__m256i frag_ptype =
			_mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
		/*
		 * Zero out the unwanted types and combine the remaining bits.
		 * The effect is same as selecting non-frag or frag types
		 * depending on the frag bit.
		 */
		nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
			_mm256_cmpeq_epi32(zero4, frag_bit));
		frag_ptype = _mm256_and_si256(frag_ptype,
			_mm256_cmpgt_epi32(frag_bit, zero4));
		__m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
		ptype = _mm256_slli_epi32(ptype, 4);
		/*
		 * Compute tunnel PTYPEs.
		 * 15 inst / 8 desc = 1.875 inst/desc
		 */
		__m256i tnl_l3_ptype =
			_mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
		tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
		/*
		 * Shift non-tunnel L4 types to make them tunnel types.
		 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
		 */
		__m256i tnl_l4_ptype =
			_mm256_slli_epi32(_mm256_and_si256(ptype,
				_mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
		__m256i tnl_ptype =
			_mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
		tnl_ptype = _mm256_or_si256(tnl_ptype,
			_mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
				RTE_PTYPE_INNER_L2_ETHER));
		/*
		 * Select non-tunnel or tunnel types by zeroing out the
		 * unwanted ones.
		 */
		__m256i tnl_flags = _mm256_and_si256(overlay_enabled,
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
		tnl_ptype = _mm256_and_si256(tnl_ptype,
			_mm256_sub_epi32(zero4, tnl_flags));
		ptype = _mm256_and_si256(ptype,
			_mm256_cmpeq_epi32(zero4, tnl_flags));
		/*
		 * Combine types and swap to have ptypes in the same order
		 * as desc.
		 * desc: 0 2 4 6 1 3 5 7
		 * 3 inst / 8 desc = 0.375 inst/desc
		 */
		ptype = _mm256_or_si256(ptype, tnl_ptype);
		ptype = _mm256_or_si256(ptype, vlan_ptype);
		ptype = _mm256_permute4x64_epi64(ptype,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Mask packet length.
		 * Use 4 ands: 0.5 instructions/desc
		 */
		cqd01 = _mm256_and_si256(cqd01, mask);
		cqd23 = _mm256_and_si256(cqd23, mask);
		cqd45 = _mm256_and_si256(cqd45, mask);
		cqd67 = _mm256_and_si256(cqd67, mask);
		/*
		 * Shuffle. Two 16B sets of the mbuf fields.
		 * packet_type, pkt_len, data_len, vlan_tci, rss
		 */
		__m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
		__m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
		__m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
		__m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

		/*
		 * Blend in ptypes
		 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
		 */
		rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
		rearm23 = _mm256_blend_epi32(rearm23,
			_mm256_shuffle_epi32(ptype, 1), 0x11);
		rearm45 = _mm256_blend_epi32(rearm45,
			_mm256_shuffle_epi32(ptype, 2), 0x11);
		rearm67 = _mm256_blend_epi32(rearm67,
			_mm256_shuffle_epi32(ptype, 3), 0x11);

		/*
		 * Move rss_flags into ol_flags in mbuf_init.
		 * Use 1 shift and 1 blend for each desc: 2 inst/desc
		 */
		__m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
			rss_flags, 0x44);
		__m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 4), 0x44);
		__m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 8), 0x44);
		__m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
			_mm256_srli_si256(rss_flags, 4), 0x44);

		/*
		 * Build rearm, one per desc.
		 * 8 blends and 4 permutes: 1.5 inst/desc
		 */
		__m256i rearm0 = _mm256_blend_epi32(rearm01,
			mbuf_init0_1, 0xf0);
		__m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
			rearm01, 0xf0);
		__m256i rearm2 = _mm256_blend_epi32(rearm23,
			mbuf_init2_3, 0xf0);
		__m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
			rearm23, 0xf0);
		/* Swap the upper and lower 128 bits */
		rearm0 = _mm256_permute4x64_epi64(rearm0,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm2 = _mm256_permute4x64_epi64(rearm2,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/* Second set of 4 descriptors */
		__m256i rearm4 = _mm256_blend_epi32(rearm45,
			mbuf_init4_5, 0xf0);
		__m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
			rearm45, 0xf0);
		__m256i rearm6 = _mm256_blend_epi32(rearm67,
			mbuf_init6_7, 0xf0);
		__m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
			rearm67, 0xf0);
		rearm4 = _mm256_permute4x64_epi64(rearm4,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm6 = _mm256_permute4x64_epi64(rearm6,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
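		/*
		 * Each rearm register now holds the full 32 bytes destined
		 * for one mbuf: rearm_data (mbuf_init plus ol_flags) in the
		 * low 128 bits and rx_descriptor_fields1 in the high 128
		 * bits.  The odd-numbered descriptors come out of the blends
		 * already in that order; the even-numbered ones have the
		 * halves reversed, which is why only rearm0/2/4/6 need the
		 * extra half-swap permute.
		 */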

		/*
		 * Write out 32B of mbuf fields.
		 * data_off    - off 0  (mbuf_init)
		 * refcnt      - 2      (mbuf_init)
		 * nb_segs     - 4      (mbuf_init)
		 * port        - 6      (mbuf_init)
		 * ol_flag     - 8      (from cqd)
		 * packet_type - 16     (from cqd)
		 * pkt_len     - 20     (from cqd)
		 * data_len    - 24     (from cqd)
		 * vlan_tci    - 26     (from cqd)
		 * rss         - 28     (from cqd)
		 */
		_mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
		_mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
		_mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
		_mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
		_mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
		_mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
		_mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
		_mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);

		max_rx -= 8;
		cqd += 8;
		rx += 8;
		rxmb += 8;
	}

	/*
	 * Step 3: Slow path to handle a small (<8) number of packets and
	 * occasional truncated packets.
	 */
	while (max_rx && ((cqd->type_color &
			   CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}

	/* Number of descriptors visited */
	nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
	if (nb_rx == 0)
		return 0;
	rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
	rxmb = rq->mbuf_ring + cq_idx;
	cq_idx += nb_rx;
	rq->rx_nb_hold += nb_rx;
	if (unlikely(cq_idx == cq->ring.desc_count)) {
		cq_idx = 0;
		cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
	}
	cq->to_clean = cq_idx;

	/* Step 4: Restock RQ with new mbufs */
	memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
	       sizeof(struct rte_mbuf *) * nb_rx);
	rq->num_free_mbufs -= nb_rx;
	while (nb_rx) {
		rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
		nb_rx--;
		rqd++;
		rxmb++;
	}
	if (rq->rx_nb_hold > rq->rx_free_thresh) {
		rq->posted_index = enic_ring_add(rq->ring.desc_count,
						 rq->posted_index,
						 rq->rx_nb_hold);
		rq->rx_nb_hold = 0;
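		/*
		 * Ensure the refilled descriptors are visible before the
		 * posted_index write below tells the NIC to use them.
		 */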
		rte_wmb();
		iowrite32_relaxed(rq->posted_index,
				  &rq->ctrl->posted_index);
	}

	return rx - rx_pkts;
}
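
/*
 * Illustrative usage only (not part of the driver): once
 * enic_use_vector_rx_handler() below installs this function as the
 * device's rx_pkt_burst callback, an application receives packets
 * through the normal ethdev API, for example:
 *
 *	struct rte_mbuf *pkts[ENIC_RX_BURST_MAX];
 *	uint16_t n = rte_eth_rx_burst(port_id, queue_id, pkts,
 *				      ENIC_RX_BURST_MAX);
 *
 * where port_id and queue_id stand for the application's configured
 * port and receive queue.
 */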

bool
enic_use_vector_rx_handler(struct enic *enic)
{
	struct rte_eth_dev *eth_dev;
	struct rte_fdir_conf *fconf;

	eth_dev = enic->rte_dev;
	/* The user must explicitly request the avx2 handler */
	if (!enic->enable_avx2_rx)
		return false;
	/* Do not support scatter Rx */
	if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
		return false;
	/* Do not support fdir/flow */
	fconf = &eth_dev->data->dev_conf.fdir_conf;
	if (fconf->mode != RTE_FDIR_MODE_NONE)
		return false;
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
		ENICPMD_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
		eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
		return true;
	}
	return false;
}