xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision a49342abbb5d68fafab1d2ba4c669c0e76e32c65)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <assert.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include <stdlib.h>
38 
39 /* Verbs header. */
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41 #ifdef PEDANTIC
42 #pragma GCC diagnostic ignored "-Wpedantic"
43 #endif
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5_hw.h>
46 #include <infiniband/arch.h>
47 #ifdef PEDANTIC
48 #pragma GCC diagnostic error "-Wpedantic"
49 #endif
50 
51 /* DPDK headers don't like -pedantic. */
52 #ifdef PEDANTIC
53 #pragma GCC diagnostic ignored "-Wpedantic"
54 #endif
55 #include <rte_mbuf.h>
56 #include <rte_mempool.h>
57 #include <rte_prefetch.h>
58 #include <rte_common.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ether.h>
61 #ifdef PEDANTIC
62 #pragma GCC diagnostic error "-Wpedantic"
63 #endif
64 
65 #include "mlx5.h"
66 #include "mlx5_utils.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_autoconf.h"
69 #include "mlx5_defs.h"
70 #include "mlx5_prm.h"
71 
72 #ifndef NDEBUG
73 
74 /**
75  * Verify or set magic value in CQE.
76  *
77  * @param cqe
78  *   Pointer to CQE.
79  *
80  * @return
81  *   0 the first time this CQE is checked (the magic value is then stored), 1 otherwise.
82  */
83 static inline int
84 check_cqe_seen(volatile struct mlx5_cqe *cqe)
85 {
86 	static const uint8_t magic[] = "seen";
87 	volatile uint8_t (*buf)[sizeof(cqe->rsvd3)] = &cqe->rsvd3;
88 	int ret = 1;
89 	unsigned int i;
90 
91 	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
92 		if (!ret || (*buf)[i] != magic[i]) {
93 			ret = 0;
94 			(*buf)[i] = magic[i];
95 		}
96 	return ret;
97 }
98 
99 #endif /* NDEBUG */
100 
101 static inline int
102 check_cqe(volatile struct mlx5_cqe *cqe,
103 	  unsigned int cqes_n, const uint16_t ci)
104 	  __attribute__((always_inline));
105 
106 /**
107  * Check whether CQE is valid.
108  *
109  * @param cqe
110  *   Pointer to CQE.
111  * @param cqes_n
112  *   Size of completion queue.
113  * @param ci
114  *   Consumer index.
115  *
116  * @return
117  *   0 when the CQE can be processed, 1 otherwise (no CQE available or unexpected error).
118  */
119 static inline int
120 check_cqe(volatile struct mlx5_cqe *cqe,
121 	  unsigned int cqes_n, const uint16_t ci)
122 {
123 	uint16_t idx = ci & cqes_n;
124 	uint8_t op_own = cqe->op_own;
125 	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
126 	uint8_t op_code = MLX5_CQE_OPCODE(op_own);
127 
128 	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
129 		return 1; /* No CQE. */
130 #ifndef NDEBUG
131 	if ((op_code == MLX5_CQE_RESP_ERR) ||
132 	    (op_code == MLX5_CQE_REQ_ERR)) {
133 		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
134 		uint8_t syndrome = err_cqe->syndrome;
135 
136 		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
137 		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
138 			return 0;
139 		if (!check_cqe_seen(cqe))
140 			ERROR("unexpected CQE error %u (0x%02x)"
141 			      " syndrome 0x%02x",
142 			      op_code, op_code, syndrome);
143 		return 1;
144 	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
145 		   (op_code != MLX5_CQE_REQ)) {
146 		if (!check_cqe_seen(cqe))
147 			ERROR("unexpected CQE opcode %u (0x%02x)",
148 			      op_code, op_code);
149 		return 1;
150 	}
151 #endif /* NDEBUG */
152 	return 0;
153 }
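
/*
 * Illustrative note (not part of the original sources): the ownership test
 * above relies on cqes_n being a power of two. "ci & cqes_n" isolates the
 * bit just above the index mask, which flips every time the consumer index
 * wraps, so a CQE is only accepted when its owner bit matches the current
 * wrap parity. For example, with a 64-entry CQ (cqes_n == 64):
 *
 *	ci =   0..63  ->  !!(ci & 64) == 0  ->  owner bit must be 0
 *	ci =  64..127 ->  !!(ci & 64) == 1  ->  owner bit must be 1
 *	ci = 128..191 ->  !!(ci & 64) == 0  ->  owner bit must be 0
 */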
154 
155 static inline void
156 txq_complete(struct txq *txq) __attribute__((always_inline));
157 
158 /**
159  * Manage TX completions.
160  *
161  * When sending a burst, mlx5_tx_burst() posts several WRs.
162  *
163  * @param txq
164  *   Pointer to TX queue structure.
165  */
166 static inline void
167 txq_complete(struct txq *txq)
168 {
169 	const unsigned int elts_n = 1 << txq->elts_n;
170 	const unsigned int cqe_n = 1 << txq->cqe_n;
171 	const unsigned int cqe_cnt = cqe_n - 1;
172 	uint16_t elts_free = txq->elts_tail;
173 	uint16_t elts_tail;
174 	uint16_t cq_ci = txq->cq_ci;
175 	volatile struct mlx5_cqe *cqe = NULL;
176 	volatile struct mlx5_wqe *wqe;
177 
178 	do {
179 		volatile struct mlx5_cqe *tmp;
180 
181 		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
182 		if (check_cqe(tmp, cqe_n, cq_ci))
183 			break;
184 		cqe = tmp;
185 #ifndef NDEBUG
186 		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
187 			if (!check_cqe_seen(cqe))
188 				ERROR("unexpected compressed CQE, TX stopped");
189 			return;
190 		}
191 		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
192 		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
193 			if (!check_cqe_seen(cqe))
194 				ERROR("unexpected error CQE, TX stopped");
195 			return;
196 		}
197 #endif /* NDEBUG */
198 		++cq_ci;
199 	} while (1);
200 	if (unlikely(cqe == NULL))
201 		return;
202 	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) &
203 			    ((1 << txq->wqe_n) - 1)].hdr;
204 	elts_tail = wqe->ctrl[3];
205 	assert(elts_tail < (1 << txq->wqe_n));
206 	/* Free buffers. */
207 	while (elts_free != elts_tail) {
208 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
209 		unsigned int elts_free_next =
210 			(elts_free + 1) & (elts_n - 1);
211 		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
212 
213 #ifndef NDEBUG
214 		/* Poisoning. */
215 		memset(&(*txq->elts)[elts_free],
216 		       0x66,
217 		       sizeof((*txq->elts)[elts_free]));
218 #endif
219 		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
220 		/* Only one segment needs to be freed. */
221 		rte_pktmbuf_free_seg(elt);
222 		elts_free = elts_free_next;
223 	}
224 	txq->cq_ci = cq_ci;
225 	txq->elts_tail = elts_tail;
226 	/* Update the consumer index. */
227 	rte_wmb();
228 	*txq->cq_db = htonl(cq_ci);
229 }
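
/*
 * Sketch of the completion handshake implemented above (a reading aid, not
 * new driver logic): when mlx5_tx_burst() crosses MLX5_TX_COMP_THRESH it
 * saves the current elts_head in ctrl[3] of the last WQE and requests a
 * completion on it; txq_complete() then uses cqe->wqe_counter to find that
 * WQE and reuses the saved value as the new elts_tail:
 *
 *	burst time:      wqe->ctrl[2] = htonl(8);   (request completion)
 *	                 wqe->ctrl[3] = elts_head;  (remember ring position)
 *	completion time: elts_tail = wqe->ctrl[3];  (free mbufs up to here)
 */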
230 
231 /**
232  * Get Memory Pool (MP) from mbuf. If the mbuf is indirect, the pool of the
233  * direct mbuf it was cloned from is returned instead.
234  *
235  * @param buf
236  *   Pointer to mbuf.
237  *
238  * @return
239  *   Memory pool where data is located for given mbuf.
240  */
241 static struct rte_mempool *
242 txq_mb2mp(struct rte_mbuf *buf)
243 {
244 	if (unlikely(RTE_MBUF_INDIRECT(buf)))
245 		return rte_mbuf_from_indirect(buf)->pool;
246 	return buf->pool;
247 }
248 
249 static inline uint32_t
250 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
251 	__attribute__((always_inline));
252 
253 /**
254  * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
255  * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
256  * remove an entry first.
257  *
258  * @param txq
259  *   Pointer to TX queue structure.
260  * @param[in] mp
261  *   Memory Pool for which a Memory Region lkey must be returned.
262  *
263  * @return
264  *   mr->lkey on success, (uint32_t)-1 on failure.
265  */
266 static inline uint32_t
267 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
268 {
269 	unsigned int i;
270 	uint32_t lkey = (uint32_t)-1;
271 
272 	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
273 		if (unlikely(txq->mp2mr[i].mp == NULL)) {
274 			/* Unknown MP, add a new MR for it. */
275 			break;
276 		}
277 		if (txq->mp2mr[i].mp == mp) {
278 			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
279 			assert(htonl(txq->mp2mr[i].mr->lkey) ==
280 			       txq->mp2mr[i].lkey);
281 			lkey = txq->mp2mr[i].lkey;
282 			break;
283 		}
284 	}
285 	if (unlikely(lkey == (uint32_t)-1))
286 		lkey = txq_mp2mr_reg(txq, mp, i);
287 	return lkey;
288 }
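
/*
 * Typical call site, as used by the TX burst functions below; the lkey is
 * written straight into a data segment since the cache keeps it in the byte
 * order expected by the hardware (see the htonl() assertion above):
 *
 *	dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
 *
 * The cache is a small linear array, so lookups stay cheap as long as only
 * a handful of mempools feed a given TX queue.
 */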
289 
290 /**
291  * Ring TX queue doorbell.
292  *
293  * @param txq
294  *   Pointer to TX queue structure.
295  */
296 static inline void
297 mlx5_tx_dbrec(struct txq *txq)
298 {
299 	uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
300 	uint32_t data[4] = {
301 		htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
302 		htonl(txq->qp_num_8s),
303 		0,
304 		0,
305 	};
306 	rte_wmb();
307 	*txq->qp_db = htonl(txq->wqe_ci);
308 	/* Ensure ordering between DB record and BF copy. */
309 	rte_wmb();
310 	memcpy(dst, (uint8_t *)data, 16);
311 	txq->bf_offset ^= (1 << txq->bf_buf_size);
312 }
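
/*
 * Reading aid for the doorbell sequence above: the first rte_wmb() makes
 * the posted WQEs visible before the doorbell record update, the second
 * orders the record against the BlueFlame copy. The 16 bytes written to
 * the BlueFlame register carry the opcode/WQE index and the QP number,
 * matching the start of the WQE control segment:
 *
 *	data[0] = htonl((wqe_ci << 8) | MLX5_OPCODE_SEND);  (opcode + index)
 *	data[1] = htonl(qp_num_8s);                         (QP number)
 *	data[2] = data[3] = 0;
 *
 * bf_offset is XORed with (1 << bf_buf_size) so consecutive doorbells
 * alternate between the two halves of the BlueFlame buffer.
 */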
313 
314 /**
315  * Prefetch a CQE.
316  *
317  * @param txq
318  *   Pointer to TX queue structure.
319  * @param cqe_ci
320  *   CQE consumer index.
321  */
322 static inline void
323 tx_prefetch_cqe(struct txq *txq, uint16_t ci)
324 {
325 	volatile struct mlx5_cqe *cqe;
326 
327 	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
328 	rte_prefetch0(cqe);
329 }
330 
331 /**
332  * Prefetch a WQE.
333  *
334  * @param txq
335  *   Pointer to TX queue structure.
336  * @param ci
337  *   WQE consumer index.
338  */
339 static inline void
340 tx_prefetch_wqe(struct txq *txq, uint16_t ci)
341 {
342 	volatile struct mlx5_wqe64 *wqe;
343 
344 	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
345 	rte_prefetch0(wqe);
346 }
347 
348 /**
349  * DPDK callback for TX.
350  *
351  * @param dpdk_txq
352  *   Generic pointer to TX queue structure.
353  * @param[in] pkts
354  *   Packets to transmit.
355  * @param pkts_n
356  *   Number of packets in array.
357  *
358  * @return
359  *   Number of packets successfully transmitted (<= pkts_n).
360  */
361 uint16_t
362 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
363 {
364 	struct txq *txq = (struct txq *)dpdk_txq;
365 	uint16_t elts_head = txq->elts_head;
366 	const unsigned int elts_n = 1 << txq->elts_n;
367 	unsigned int i = 0;
368 	unsigned int j = 0;
369 	unsigned int max;
370 	unsigned int comp;
371 	volatile struct mlx5_wqe *wqe = NULL;
372 	unsigned int segs_n = 0;
373 	struct rte_mbuf *buf = NULL;
374 	uint8_t *raw;
375 
376 	if (unlikely(!pkts_n))
377 		return 0;
378 	/* Prefetch first packet cacheline. */
379 	tx_prefetch_cqe(txq, txq->cq_ci);
380 	tx_prefetch_cqe(txq, txq->cq_ci + 1);
381 	rte_prefetch0(*pkts);
382 	/* Start processing. */
383 	txq_complete(txq);
384 	max = (elts_n - (elts_head - txq->elts_tail));
385 	if (max > elts_n)
386 		max -= elts_n;
387 	do {
388 		volatile struct mlx5_wqe_data_seg *dseg = NULL;
389 		uint32_t length;
390 		unsigned int ds = 0;
391 		uintptr_t addr;
392 #ifdef MLX5_PMD_SOFT_COUNTERS
393 		uint32_t total_length = 0;
394 #endif
395 
396 		/* first_seg */
397 		buf = *(pkts++);
398 		segs_n = buf->nb_segs;
399 		/*
400 		 * Make sure there is enough room to store this packet and
401 		 * that one ring entry remains unused.
402 		 */
403 		assert(segs_n);
404 		if (max < segs_n + 1)
405 			break;
406 		max -= segs_n;
407 		--segs_n;
408 		if (!segs_n)
409 			--pkts_n;
410 		wqe = &(*txq->wqes)[txq->wqe_ci &
411 				    ((1 << txq->wqe_n) - 1)].hdr;
412 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
413 		if (pkts_n > 1)
414 			rte_prefetch0(*pkts);
415 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
416 		length = DATA_LEN(buf);
417 #ifdef MLX5_PMD_SOFT_COUNTERS
418 		total_length = length;
419 #endif
420 		assert(length >= MLX5_WQE_DWORD_SIZE);
421 		/* Update element. */
422 		(*txq->elts)[elts_head] = buf;
423 		elts_head = (elts_head + 1) & (elts_n - 1);
424 		/* Prefetch next buffer data. */
425 		if (pkts_n > 1) {
426 			volatile void *pkt_addr;
427 
428 			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
429 			rte_prefetch0(pkt_addr);
430 		}
431 		/* Should we enable HW CKSUM offload? */
432 		if (buf->ol_flags &
433 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
434 			wqe->eseg.cs_flags =
435 				MLX5_ETH_WQE_L3_CSUM |
436 				MLX5_ETH_WQE_L4_CSUM;
437 		} else {
438 			wqe->eseg.cs_flags = 0;
439 		}
440 		raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
441 		/* Start the known and common part of the WQE structure. */
442 		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
443 		wqe->ctrl[2] = 0;
444 		wqe->ctrl[3] = 0;
445 		wqe->eseg.rsvd0 = 0;
446 		wqe->eseg.rsvd1 = 0;
447 		wqe->eseg.mss = 0;
448 		wqe->eseg.rsvd2 = 0;
449 		/* Start by copying the Ethernet Header. */
450 		memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
451 		length -= MLX5_WQE_DWORD_SIZE;
452 		addr += MLX5_WQE_DWORD_SIZE;
453 		/* Replace the Ethernet type with the VLAN tag if necessary. */
454 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
455 			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
456 
457 			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
458 					   sizeof(vlan)),
459 			       &vlan, sizeof(vlan));
460 			addr -= sizeof(vlan);
461 			length += sizeof(vlan);
462 		}
463 		/* Inline if enough room. */
464 		if (txq->max_inline != 0) {
465 			uintptr_t end =
466 				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
467 			uint16_t max_inline =
468 				txq->max_inline * RTE_CACHE_LINE_SIZE;
469 			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
470 			uint16_t room;
471 
472 			raw += MLX5_WQE_DWORD_SIZE;
473 			room = end - (uintptr_t)raw;
474 			if (room > max_inline) {
475 				uintptr_t addr_end = (addr + max_inline) &
476 					~(RTE_CACHE_LINE_SIZE - 1);
477 				uint16_t copy_b = ((addr_end - addr) > length) ?
478 						  length :
479 						  (addr_end - addr);
480 
481 				rte_memcpy((void *)raw, (void *)addr, copy_b);
482 				addr += copy_b;
483 				length -= copy_b;
484 				pkt_inline_sz += copy_b;
485 				/* Sanity check. */
486 				assert(addr <= addr_end);
487 			}
488 			/* Store the inlined packet size in the WQE. */
489 			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
490 			/*
491 			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
492 			 * the size of the inline part of the packet.
493 			 */
494 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
495 			if (length > 0) {
496 				dseg = (struct mlx5_wqe_data_seg *)
497 					((uintptr_t)wqe +
498 					 (ds * MLX5_WQE_DWORD_SIZE));
499 				if ((uintptr_t)dseg >= end)
500 					dseg = (struct mlx5_wqe_data_seg *)
501 						((uintptr_t)&(*txq->wqes)[0]);
502 				goto use_dseg;
503 			} else if (!segs_n) {
504 				goto next_pkt;
505 			} else {
506 				goto next_seg;
507 			}
508 		} else {
509 			/*
510 			 * No inline has been done in the packet, only the
511 			 * Ethernet Header has been stored.
512 			 */
513 			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
514 			dseg = (struct mlx5_wqe_data_seg *)
515 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
516 			ds = 3;
517 use_dseg:
518 			/* Add the remaining packet as a simple ds. */
519 			*dseg = (struct mlx5_wqe_data_seg) {
520 				.addr = htonll(addr),
521 				.byte_count = htonl(length),
522 				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
523 			};
524 			++ds;
525 			if (!segs_n)
526 				goto next_pkt;
527 		}
528 next_seg:
529 		assert(buf);
530 		assert(ds);
531 		assert(wqe);
532 		/*
533 		 * Spill on next WQE when the current one does not have
534 		 * enough room left. Size of WQE must be a multiple
535 		 * of data segment size.
536 		 */
537 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
538 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
539 			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
540 				((1 << txq->wqe_n) - 1);
541 
542 			dseg = (struct mlx5_wqe_data_seg *)
543 				((uintptr_t)&(*txq->wqes)[n]);
544 			tx_prefetch_wqe(txq, n + 1);
545 		} else {
546 			++dseg;
547 		}
548 		++ds;
549 		buf = buf->next;
550 		assert(buf);
551 		length = DATA_LEN(buf);
552 #ifdef MLX5_PMD_SOFT_COUNTERS
553 		total_length += length;
554 #endif
555 		/* Store segment information. */
556 		*dseg = (struct mlx5_wqe_data_seg) {
557 			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
558 			.byte_count = htonl(length),
559 			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
560 		};
561 		(*txq->elts)[elts_head] = buf;
562 		elts_head = (elts_head + 1) & (elts_n - 1);
563 		++j;
564 		--segs_n;
565 		if (segs_n)
566 			goto next_seg;
567 		else
568 			--pkts_n;
569 next_pkt:
570 		++i;
571 		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
572 		txq->wqe_ci += (ds + 3) / 4;
573 #ifdef MLX5_PMD_SOFT_COUNTERS
574 		/* Increment sent bytes counter. */
575 		txq->stats.obytes += total_length;
576 #endif
577 	} while (pkts_n);
578 	/* Take a shortcut if nothing must be sent. */
579 	if (unlikely(i == 0))
580 		return 0;
581 	/* Check whether completion threshold has been reached. */
582 	comp = txq->elts_comp + i + j;
583 	if (comp >= MLX5_TX_COMP_THRESH) {
584 		/* Request completion on last WQE. */
585 		wqe->ctrl[2] = htonl(8);
586 		/* Save elts_head in unused "immediate" field of WQE. */
587 		wqe->ctrl[3] = elts_head;
588 		txq->elts_comp = 0;
589 	} else {
590 		txq->elts_comp = comp;
591 	}
592 #ifdef MLX5_PMD_SOFT_COUNTERS
593 	/* Increment sent packets counter. */
594 	txq->stats.opackets += i;
595 #endif
596 	/* Ring QP doorbell. */
597 	mlx5_tx_dbrec(txq);
598 	txq->elts_head = elts_head;
599 	return i;
600 }
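
/*
 * Minimal usage sketch (an assumption about the application side, not part
 * of the PMD): with a port bound to this driver, rte_eth_tx_burst()
 * dispatches to mlx5_tx_burst(); port_id, queue_id and nb_to_send are
 * placeholders:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_sent;
 *
 *	nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_to_send);
 *	(packets [nb_sent, nb_to_send) were not sent: retry them later or
 *	 free them explicitly)
 */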
601 
602 /**
603  * Open a MPW session.
604  *
605  * @param txq
606  *   Pointer to TX queue structure.
607  * @param mpw
608  *   Pointer to MPW session structure.
609  * @param length
610  *   Packet length.
611  */
612 static inline void
613 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
614 {
615 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
616 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
617 		(volatile struct mlx5_wqe_data_seg (*)[])
618 		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
619 
620 	mpw->state = MLX5_MPW_STATE_OPENED;
621 	mpw->pkts_n = 0;
622 	mpw->len = length;
623 	mpw->total_len = 0;
624 	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
625 	mpw->wqe->eseg.mss = htons(length);
626 	mpw->wqe->eseg.inline_hdr_sz = 0;
627 	mpw->wqe->eseg.rsvd0 = 0;
628 	mpw->wqe->eseg.rsvd1 = 0;
629 	mpw->wqe->eseg.rsvd2 = 0;
630 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
631 				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
632 	mpw->wqe->ctrl[2] = 0;
633 	mpw->wqe->ctrl[3] = 0;
634 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
635 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
636 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
637 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
638 	mpw->data.dseg[2] = &(*dseg)[0];
639 	mpw->data.dseg[3] = &(*dseg)[1];
640 	mpw->data.dseg[4] = &(*dseg)[2];
641 }
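
/*
 * Layout implied by the pointer arithmetic above (16-byte dwords, 64-byte
 * WQEs):
 *
 *	WQE[idx]     : ctrl seg | eth seg | dseg[0] | dseg[1]
 *	WQE[idx + 1] : dseg[2]  | dseg[3] | dseg[4] | (unused)
 *
 * i.e. an MPW session spans two consecutive WQEs and provides room for the
 * five data segments initialized here (MLX5_MPW_DSEG_MAX), which is why
 * mlx5_mpw_close() advances wqe_ci by one or two WQEs.
 */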
642 
643 /**
644  * Close a MPW session.
645  *
646  * @param txq
647  *   Pointer to TX queue structure.
648  * @param mpw
649  *   Pointer to MPW session structure.
650  */
651 static inline void
652 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
653 {
654 	unsigned int num = mpw->pkts_n;
655 
656 	/*
657 	 * Store size as a multiple of 16 bytes. Control and Ethernet segments
658 	 * count as 2.
659 	 */
660 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
661 	mpw->state = MLX5_MPW_STATE_CLOSED;
662 	if (num < 3)
663 		++txq->wqe_ci;
664 	else
665 		txq->wqe_ci += 2;
666 	tx_prefetch_wqe(txq, txq->wqe_ci);
667 	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
668 }
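
/*
 * Worked example for the bookkeeping above: with num == 3 packets the
 * session holds 2 (control + Ethernet) + 3 data segments, so ctrl[1]
 * encodes DS = 5 and the descriptors spill into the second 64-byte WQE,
 * hence wqe_ci += 2; with num <= 2 everything fits in a single WQE and
 * wqe_ci is advanced by one.
 */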
669 
670 /**
671  * DPDK callback for TX with MPW support.
672  *
673  * @param dpdk_txq
674  *   Generic pointer to TX queue structure.
675  * @param[in] pkts
676  *   Packets to transmit.
677  * @param pkts_n
678  *   Number of packets in array.
679  *
680  * @return
681  *   Number of packets successfully transmitted (<= pkts_n).
682  */
683 uint16_t
684 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
685 {
686 	struct txq *txq = (struct txq *)dpdk_txq;
687 	uint16_t elts_head = txq->elts_head;
688 	const unsigned int elts_n = 1 << txq->elts_n;
689 	unsigned int i = 0;
690 	unsigned int j = 0;
691 	unsigned int max;
692 	unsigned int comp;
693 	struct mlx5_mpw mpw = {
694 		.state = MLX5_MPW_STATE_CLOSED,
695 	};
696 
697 	if (unlikely(!pkts_n))
698 		return 0;
699 	/* Prefetch first packet cacheline. */
700 	tx_prefetch_cqe(txq, txq->cq_ci);
701 	tx_prefetch_wqe(txq, txq->wqe_ci);
702 	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
703 	/* Start processing. */
704 	txq_complete(txq);
705 	max = (elts_n - (elts_head - txq->elts_tail));
706 	if (max > elts_n)
707 		max -= elts_n;
708 	do {
709 		struct rte_mbuf *buf = *(pkts++);
710 		unsigned int elts_head_next;
711 		uint32_t length;
712 		unsigned int segs_n = buf->nb_segs;
713 		uint32_t cs_flags = 0;
714 
715 		/*
716 		 * Make sure there is enough room to store this packet and
717 		 * that one ring entry remains unused.
718 		 */
719 		assert(segs_n);
720 		if (max < segs_n + 1)
721 			break;
722 		/* Do not bother with large packets MPW cannot handle. */
723 		if (segs_n > MLX5_MPW_DSEG_MAX)
724 			break;
725 		max -= segs_n;
726 		--pkts_n;
727 		/* Should we enable HW CKSUM offload? */
728 		if (buf->ol_flags &
729 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
730 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
731 		/* Retrieve packet information. */
732 		length = PKT_LEN(buf);
733 		assert(length);
734 		/* Start new session if packet differs. */
735 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
736 		    ((mpw.len != length) ||
737 		     (segs_n != 1) ||
738 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
739 			mlx5_mpw_close(txq, &mpw);
740 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
741 			mlx5_mpw_new(txq, &mpw, length);
742 			mpw.wqe->eseg.cs_flags = cs_flags;
743 		}
744 		/* Multi-segment packets must be alone in their MPW. */
745 		assert((segs_n == 1) || (mpw.pkts_n == 0));
746 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
747 		length = 0;
748 #endif
749 		do {
750 			volatile struct mlx5_wqe_data_seg *dseg;
751 			uintptr_t addr;
752 
753 			elts_head_next = (elts_head + 1) & (elts_n - 1);
754 			assert(buf);
755 			(*txq->elts)[elts_head] = buf;
756 			dseg = mpw.data.dseg[mpw.pkts_n];
757 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
758 			*dseg = (struct mlx5_wqe_data_seg){
759 				.byte_count = htonl(DATA_LEN(buf)),
760 				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
761 				.addr = htonll(addr),
762 			};
763 			elts_head = elts_head_next;
764 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
765 			length += DATA_LEN(buf);
766 #endif
767 			buf = buf->next;
768 			++mpw.pkts_n;
769 			++j;
770 		} while (--segs_n);
771 		assert(length == mpw.len);
772 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
773 			mlx5_mpw_close(txq, &mpw);
774 		elts_head = elts_head_next;
775 #ifdef MLX5_PMD_SOFT_COUNTERS
776 		/* Increment sent bytes counter. */
777 		txq->stats.obytes += length;
778 #endif
779 		++i;
780 	} while (pkts_n);
781 	/* Take a shortcut if nothing must be sent. */
782 	if (unlikely(i == 0))
783 		return 0;
784 	/* Check whether completion threshold has been reached. */
785 	/* "j" includes both packets and segments. */
786 	comp = txq->elts_comp + j;
787 	if (comp >= MLX5_TX_COMP_THRESH) {
788 		volatile struct mlx5_wqe *wqe = mpw.wqe;
789 
790 		/* Request completion on last WQE. */
791 		wqe->ctrl[2] = htonl(8);
792 		/* Save elts_head in unused "immediate" field of WQE. */
793 		wqe->ctrl[3] = elts_head;
794 		txq->elts_comp = 0;
795 	} else {
796 		txq->elts_comp = comp;
797 	}
798 #ifdef MLX5_PMD_SOFT_COUNTERS
799 	/* Increment sent packets counter. */
800 	txq->stats.opackets += i;
801 #endif
802 	/* Ring QP doorbell. */
803 	if (mpw.state == MLX5_MPW_STATE_OPENED)
804 		mlx5_mpw_close(txq, &mpw);
805 	mlx5_tx_dbrec(txq);
806 	txq->elts_head = elts_head;
807 	return i;
808 }
809 
810 /**
811  * Open a MPW inline session.
812  *
813  * @param txq
814  *   Pointer to TX queue structure.
815  * @param mpw
816  *   Pointer to MPW session structure.
817  * @param length
818  *   Packet length.
819  */
820 static inline void
821 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
822 {
823 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
824 	struct mlx5_wqe_inl_small *inl;
825 
826 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
827 	mpw->pkts_n = 0;
828 	mpw->len = length;
829 	mpw->total_len = 0;
830 	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
831 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
832 				  (txq->wqe_ci << 8) |
833 				  MLX5_OPCODE_TSO);
834 	mpw->wqe->ctrl[2] = 0;
835 	mpw->wqe->ctrl[3] = 0;
836 	mpw->wqe->eseg.mss = htons(length);
837 	mpw->wqe->eseg.inline_hdr_sz = 0;
838 	mpw->wqe->eseg.cs_flags = 0;
839 	mpw->wqe->eseg.rsvd0 = 0;
840 	mpw->wqe->eseg.rsvd1 = 0;
841 	mpw->wqe->eseg.rsvd2 = 0;
842 	inl = (struct mlx5_wqe_inl_small *)
843 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
844 	mpw->data.raw = (uint8_t *)&inl->raw;
845 }
846 
847 /**
848  * Close a MPW inline session.
849  *
850  * @param txq
851  *   Pointer to TX queue structure.
852  * @param mpw
853  *   Pointer to MPW session structure.
854  */
855 static inline void
856 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
857 {
858 	unsigned int size;
859 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
860 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
861 
862 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
863 	/*
864 	 * Store size as a multiple of 16 bytes. Control and Ethernet segments
865 	 * count as 2.
866 	 */
867 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
868 	mpw->state = MLX5_MPW_STATE_CLOSED;
869 	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
870 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
871 }
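
/*
 * Illustrative arithmetic (assuming the usual 64-byte MLX5_WQE_SIZE): the
 * final "(size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE" rounds the session
 * up to whole WQEs, e.g. a 130-byte session advances wqe_ci by 3, while
 * MLX5_WQE_DS(size) expresses the same size in the 16-byte units reported
 * in ctrl[1].
 */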
872 
873 /**
874  * DPDK callback for TX with MPW inline support.
875  *
876  * @param dpdk_txq
877  *   Generic pointer to TX queue structure.
878  * @param[in] pkts
879  *   Packets to transmit.
880  * @param pkts_n
881  *   Number of packets in array.
882  *
883  * @return
884  *   Number of packets successfully transmitted (<= pkts_n).
885  */
886 uint16_t
887 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
888 			 uint16_t pkts_n)
889 {
890 	struct txq *txq = (struct txq *)dpdk_txq;
891 	uint16_t elts_head = txq->elts_head;
892 	const unsigned int elts_n = 1 << txq->elts_n;
893 	unsigned int i = 0;
894 	unsigned int j = 0;
895 	unsigned int max;
896 	unsigned int comp;
897 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
898 	struct mlx5_mpw mpw = {
899 		.state = MLX5_MPW_STATE_CLOSED,
900 	};
901 
902 	if (unlikely(!pkts_n))
903 		return 0;
904 	/* Prefetch first packet cacheline. */
905 	tx_prefetch_cqe(txq, txq->cq_ci);
906 	tx_prefetch_wqe(txq, txq->wqe_ci);
907 	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
908 	/* Start processing. */
909 	txq_complete(txq);
910 	max = (elts_n - (elts_head - txq->elts_tail));
911 	if (max > elts_n)
912 		max -= elts_n;
913 	do {
914 		struct rte_mbuf *buf = *(pkts++);
915 		unsigned int elts_head_next;
916 		uintptr_t addr;
917 		uint32_t length;
918 		unsigned int segs_n = buf->nb_segs;
919 		uint32_t cs_flags = 0;
920 
921 		/*
922 		 * Make sure there is enough room to store this packet and
923 		 * that one ring entry remains unused.
924 		 */
925 		assert(segs_n);
926 		if (max < segs_n + 1)
927 			break;
928 		/* Do not bother with large packets MPW cannot handle. */
929 		if (segs_n > MLX5_MPW_DSEG_MAX)
930 			break;
931 		max -= segs_n;
932 		--pkts_n;
933 		/* Should we enable HW CKSUM offload? */
934 		if (buf->ol_flags &
935 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
936 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
937 		/* Retrieve packet information. */
938 		length = PKT_LEN(buf);
939 		/* Start new session if packet differs. */
940 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
941 			if ((mpw.len != length) ||
942 			    (segs_n != 1) ||
943 			    (mpw.wqe->eseg.cs_flags != cs_flags))
944 				mlx5_mpw_close(txq, &mpw);
945 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
946 			if ((mpw.len != length) ||
947 			    (segs_n != 1) ||
948 			    (length > inline_room) ||
949 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
950 				mlx5_mpw_inline_close(txq, &mpw);
951 				inline_room =
952 					txq->max_inline * RTE_CACHE_LINE_SIZE;
953 			}
954 		}
955 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
956 			if ((segs_n != 1) ||
957 			    (length > inline_room)) {
958 				mlx5_mpw_new(txq, &mpw, length);
959 				mpw.wqe->eseg.cs_flags = cs_flags;
960 			} else {
961 				mlx5_mpw_inline_new(txq, &mpw, length);
962 				mpw.wqe->eseg.cs_flags = cs_flags;
963 			}
964 		}
965 		/* Multi-segment packets must be alone in their MPW. */
966 		assert((segs_n == 1) || (mpw.pkts_n == 0));
967 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
968 			assert(inline_room ==
969 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
970 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
971 			length = 0;
972 #endif
973 			do {
974 				volatile struct mlx5_wqe_data_seg *dseg;
975 
976 				elts_head_next =
977 					(elts_head + 1) & (elts_n - 1);
978 				assert(buf);
979 				(*txq->elts)[elts_head] = buf;
980 				dseg = mpw.data.dseg[mpw.pkts_n];
981 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
982 				*dseg = (struct mlx5_wqe_data_seg){
983 					.byte_count = htonl(DATA_LEN(buf)),
984 					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
985 					.addr = htonll(addr),
986 				};
987 				elts_head = elts_head_next;
988 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
989 				length += DATA_LEN(buf);
990 #endif
991 				buf = buf->next;
992 				++mpw.pkts_n;
993 				++j;
994 			} while (--segs_n);
995 			assert(length == mpw.len);
996 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
997 				mlx5_mpw_close(txq, &mpw);
998 		} else {
999 			unsigned int max;
1000 
1001 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1002 			assert(length <= inline_room);
1003 			assert(length == DATA_LEN(buf));
1004 			elts_head_next = (elts_head + 1) & (elts_n - 1);
1005 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1006 			(*txq->elts)[elts_head] = buf;
1007 			/* Maximum number of bytes before wrapping. */
1008 			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
1009 			       (uintptr_t)mpw.data.raw);
1010 			if (length > max) {
1011 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1012 					   (void *)addr,
1013 					   max);
1014 				mpw.data.raw =
1015 					(volatile void *)&(*txq->wqes)[0];
1016 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1017 					   (void *)(addr + max),
1018 					   length - max);
1019 				mpw.data.raw += length - max;
1020 			} else {
1021 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1022 					   (void *)addr,
1023 					   length);
1024 				mpw.data.raw += length;
1025 			}
1026 			if ((uintptr_t)mpw.data.raw ==
1027 			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
1028 				mpw.data.raw =
1029 					(volatile void *)&(*txq->wqes)[0];
1030 			++mpw.pkts_n;
1031 			++j;
1032 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1033 				mlx5_mpw_inline_close(txq, &mpw);
1034 				inline_room =
1035 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1036 			} else {
1037 				inline_room -= length;
1038 			}
1039 		}
1040 		mpw.total_len += length;
1041 		elts_head = elts_head_next;
1042 #ifdef MLX5_PMD_SOFT_COUNTERS
1043 		/* Increment sent bytes counter. */
1044 		txq->stats.obytes += length;
1045 #endif
1046 		++i;
1047 	} while (pkts_n);
1048 	/* Take a shortcut if nothing must be sent. */
1049 	if (unlikely(i == 0))
1050 		return 0;
1051 	/* Check whether completion threshold has been reached. */
1052 	/* "j" includes both packets and segments. */
1053 	comp = txq->elts_comp + j;
1054 	if (comp >= MLX5_TX_COMP_THRESH) {
1055 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1056 
1057 		/* Request completion on last WQE. */
1058 		wqe->ctrl[2] = htonl(8);
1059 		/* Save elts_head in unused "immediate" field of WQE. */
1060 		wqe->ctrl[3] = elts_head;
1061 		txq->elts_comp = 0;
1062 	} else {
1063 		txq->elts_comp = comp;
1064 	}
1065 #ifdef MLX5_PMD_SOFT_COUNTERS
1066 	/* Increment sent packets counter. */
1067 	txq->stats.opackets += i;
1068 #endif
1069 	/* Ring QP doorbell. */
1070 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1071 		mlx5_mpw_inline_close(txq, &mpw);
1072 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1073 		mlx5_mpw_close(txq, &mpw);
1074 	mlx5_tx_dbrec(txq);
1075 	txq->elts_head = elts_head;
1076 	return i;
1077 }
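
/*
 * Note on the inline copy above: mpw.data.raw walks the WQE ring linearly,
 * so a packet that would run past &(*txq->wqes)[1 << wqe_n] is split into
 * two rte_memcpy() calls, the second one restarting at &(*txq->wqes)[0].
 * For instance, with 48 bytes left before the end of the ring, a 100-byte
 * packet is copied as 48 bytes followed by 52 bytes.
 */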
1078 
1079 /**
1080  * Translate RX completion flags to packet type.
1081  *
1082  * @param[in] cqe
1083  *   Pointer to CQE.
1084  *
1085  * @note: keep mlx5_dev_supported_ptypes_get() in sync with any change made here.
1086  *
1087  * @return
1088  *   Packet type for struct rte_mbuf.
1089  */
1090 static inline uint32_t
1091 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1092 {
1093 	uint32_t pkt_type;
1094 	uint8_t flags = cqe->l4_hdr_type_etc;
1095 
1096 	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET)
1097 		pkt_type =
1098 			TRANSPOSE(flags,
1099 				  MLX5_CQE_RX_OUTER_IPV4_PACKET,
1100 				  RTE_PTYPE_L3_IPV4) |
1101 			TRANSPOSE(flags,
1102 				  MLX5_CQE_RX_OUTER_IPV6_PACKET,
1103 				  RTE_PTYPE_L3_IPV6) |
1104 			TRANSPOSE(flags,
1105 				  MLX5_CQE_RX_IPV4_PACKET,
1106 				  RTE_PTYPE_INNER_L3_IPV4) |
1107 			TRANSPOSE(flags,
1108 				  MLX5_CQE_RX_IPV6_PACKET,
1109 				  RTE_PTYPE_INNER_L3_IPV6);
1110 	else
1111 		pkt_type =
1112 			TRANSPOSE(flags,
1113 				  MLX5_CQE_L3_HDR_TYPE_IPV6,
1114 				  RTE_PTYPE_L3_IPV6) |
1115 			TRANSPOSE(flags,
1116 				  MLX5_CQE_L3_HDR_TYPE_IPV4,
1117 				  RTE_PTYPE_L3_IPV4);
1118 	return pkt_type;
1119 }
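
/*
 * Reading aid (an assumption about TRANSPOSE() from mlx5_utils.h): it moves
 * the bits selected by its second argument to the position of its third
 * one, so each TRANSPOSE() above contributes the RTE_PTYPE_* value of its
 * last argument only when the matching completion flag is set, and 0
 * otherwise; pkt_type is simply the OR of those contributions.
 */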
1120 
1121 /**
1122  * Get size of the next packet for a given CQE. For compressed CQEs, the
1123  * consumer index is only updated once all the packets of the compressed
1124  * session have been processed.
1125  *
1126  * @param rxq
1127  *   Pointer to RX queue.
1128  * @param cqe
1129  *   CQE to process.
1130  * @param[out] rss_hash
1131  *   Packet RSS Hash result.
1132  *
1133  * @return
1134  *   Packet size in bytes (0 if there is none), -1 in case of completion
1135  *   with error.
1136  */
1137 static inline int
1138 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
1139 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1140 {
1141 	struct rxq_zip *zip = &rxq->zip;
1142 	uint16_t cqe_n = cqe_cnt + 1;
1143 	int len = 0;
1144 
1145 	/* Process compressed data in the CQE and mini arrays. */
1146 	if (zip->ai) {
1147 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1148 			(volatile struct mlx5_mini_cqe8 (*)[8])
1149 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);
1150 
1151 		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
1152 		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
1153 		if ((++zip->ai & 7) == 0) {
1154 			/*
1155 			 * Increment consumer index to skip the number of
1156 			 * CQEs consumed. Hardware leaves holes in the CQ
1157 			 * ring for software use.
1158 			 */
1159 			zip->ca = zip->na;
1160 			zip->na += 8;
1161 		}
1162 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1163 			uint16_t idx = rxq->cq_ci;
1164 			uint16_t end = zip->cq_ci;
1165 
1166 			while (idx != end) {
1167 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1168 					MLX5_CQE_INVALIDATE;
1169 				++idx;
1170 			}
1171 			rxq->cq_ci = zip->cq_ci;
1172 			zip->ai = 0;
1173 		}
1174 	/* No compressed data, get next CQE and verify if it is compressed. */
1175 	} else {
1176 		int ret;
1177 		int8_t op_own;
1178 
1179 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1180 		if (unlikely(ret == 1))
1181 			return 0;
1182 		++rxq->cq_ci;
1183 		op_own = cqe->op_own;
1184 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1185 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1186 				(volatile struct mlx5_mini_cqe8 (*)[8])
1187 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1188 							  cqe_cnt]);
1189 
1190 			/* Fix endianness. */
1191 			zip->cqe_cnt = ntohl(cqe->byte_cnt);
1192 			/*
1193 			 * Current mini array position is the one returned by
1194 			 * check_cqe().
1195 			 *
1196 			 * If completion comprises several mini arrays, as a
1197 			 * special case the second one is located 7 CQEs after
1198 			 * the initial CQE instead of 8 for subsequent ones.
1199 			 */
1200 			zip->ca = rxq->cq_ci & cqe_cnt;
1201 			zip->na = zip->ca + 7;
1202 			/* Compute the next non compressed CQE. */
1203 			--rxq->cq_ci;
1204 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1205 			/* Get packet size to return. */
1206 			len = ntohl((*mc)[0].byte_cnt);
1207 			*rss_hash = ntohl((*mc)[0].rx_hash_result);
1208 			zip->ai = 1;
1209 		} else {
1210 			len = ntohl(cqe->byte_cnt);
1211 			*rss_hash = ntohl(cqe->rx_hash_res);
1212 		}
1213 		/* Error while receiving packet. */
1214 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1215 			return -1;
1216 	}
1217 	return len;
1218 }
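
/*
 * Worked example for the compression path above (values are illustrative):
 * if the CQE at cq_ci == 10 is compressed and its byte_cnt announces 20
 * completions, the first mini-CQE array sits in slot 11, so zip->ca becomes
 * 11 and zip->na 18 (the second array is only 7 slots away, later ones 8);
 * cq_ci is rolled back to 10 and zip->cq_ci is set to 30, the first regular
 * CQE after the session. The next 19 calls drain the mini arrays through
 * zip->ai; once it reaches 20, slots 10..29 are marked MLX5_CQE_INVALIDATE
 * and cq_ci jumps straight to 30.
 */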
1219 
1220 /**
1221  * Translate RX completion flags to offload flags.
1222  *
1223  * @param[in] rxq
1224  *   Pointer to RX queue structure.
1225  * @param[in] cqe
1226  *   Pointer to CQE.
1227  *
1228  * @return
1229  *   Offload flags (ol_flags) for struct rte_mbuf.
1230  */
1231 static inline uint32_t
1232 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
1233 {
1234 	uint32_t ol_flags = 0;
1235 	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
1236 	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
1237 
1238 	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
1239 	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
1240 		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
1241 				      MLX5_CQE_L3_OK,
1242 				      PKT_RX_IP_CKSUM_GOOD);
1243 	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
1244 	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
1245 	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
1246 	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
1247 		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
1248 				      MLX5_CQE_L4_OK,
1249 				      PKT_RX_L4_CKSUM_GOOD);
1250 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1251 		ol_flags |=
1252 			TRANSPOSE(cqe->l4_hdr_type_etc,
1253 				  MLX5_CQE_RX_OUTER_IP_CSUM_OK,
1254 				  PKT_RX_IP_CKSUM_GOOD) |
1255 			TRANSPOSE(cqe->l4_hdr_type_etc,
1256 				  MLX5_CQE_RX_OUTER_TCP_UDP_CSUM_OK,
1257 				  PKT_RX_L4_CKSUM_GOOD);
1258 	return ol_flags;
1259 }
1260 
1261 /**
1262  * DPDK callback for RX.
1263  *
1264  * @param dpdk_rxq
1265  *   Generic pointer to RX queue structure.
1266  * @param[out] pkts
1267  *   Array to store received packets.
1268  * @param pkts_n
1269  *   Maximum number of packets in array.
1270  *
1271  * @return
1272  *   Number of packets successfully received (<= pkts_n).
1273  */
1274 uint16_t
1275 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1276 {
1277 	struct rxq *rxq = dpdk_rxq;
1278 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1279 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1280 	const unsigned int sges_n = rxq->sges_n;
1281 	struct rte_mbuf *pkt = NULL;
1282 	struct rte_mbuf *seg = NULL;
1283 	volatile struct mlx5_cqe *cqe =
1284 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1285 	unsigned int i = 0;
1286 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1287 	int len; /* keep its value across iterations. */
1288 
1289 	while (pkts_n) {
1290 		unsigned int idx = rq_ci & wqe_cnt;
1291 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1292 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1293 		uint32_t rss_hash_res = 0;
1294 
1295 		if (pkt)
1296 			NEXT(seg) = rep;
1297 		seg = rep;
1298 		rte_prefetch0(seg);
1299 		rte_prefetch0(cqe);
1300 		rte_prefetch0(wqe);
1301 		rep = rte_mbuf_raw_alloc(rxq->mp);
1302 		if (unlikely(rep == NULL)) {
1303 			++rxq->stats.rx_nombuf;
1304 			if (!pkt) {
1305 				/*
1306 				 * no buffers before we even started,
1307 				 * bail out silently.
1308 				 */
1309 				break;
1310 			}
1311 			while (pkt != seg) {
1312 				assert(pkt != (*rxq->elts)[idx]);
1313 				seg = NEXT(pkt);
1314 				rte_mbuf_refcnt_set(pkt, 0);
1315 				__rte_mbuf_raw_free(pkt);
1316 				pkt = seg;
1317 			}
1318 			break;
1319 		}
1320 		if (!pkt) {
1321 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1322 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1323 					       &rss_hash_res);
1324 			if (!len) {
1325 				rte_mbuf_refcnt_set(rep, 0);
1326 				__rte_mbuf_raw_free(rep);
1327 				break;
1328 			}
1329 			if (unlikely(len == -1)) {
1330 				/* RX error, packet is likely too large. */
1331 				rte_mbuf_refcnt_set(rep, 0);
1332 				__rte_mbuf_raw_free(rep);
1333 				++rxq->stats.idropped;
1334 				goto skip;
1335 			}
1336 			pkt = seg;
1337 			assert(len >= (rxq->crc_present << 2));
1338 			/* Update packet information. */
1339 			pkt->packet_type = 0;
1340 			pkt->ol_flags = 0;
1341 			if (rxq->rss_hash) {
1342 				pkt->hash.rss = rss_hash_res;
1343 				pkt->ol_flags = PKT_RX_RSS_HASH;
1344 			}
1345 			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
1346 			    rxq->crc_present) {
1347 				if (rxq->csum) {
1348 					pkt->packet_type =
1349 						rxq_cq_to_pkt_type(cqe);
1350 					pkt->ol_flags |=
1351 						rxq_cq_to_ol_flags(rxq, cqe);
1352 				}
1353 				if (cqe->l4_hdr_type_etc &
1354 				    MLX5_CQE_VLAN_STRIPPED) {
1355 					pkt->ol_flags |= PKT_RX_VLAN_PKT |
1356 						PKT_RX_VLAN_STRIPPED;
1357 					pkt->vlan_tci = ntohs(cqe->vlan_info);
1358 				}
1359 				if (rxq->crc_present)
1360 					len -= ETHER_CRC_LEN;
1361 			}
1362 			PKT_LEN(pkt) = len;
1363 		}
1364 		DATA_LEN(rep) = DATA_LEN(seg);
1365 		PKT_LEN(rep) = PKT_LEN(seg);
1366 		SET_DATA_OFF(rep, DATA_OFF(seg));
1367 		NB_SEGS(rep) = NB_SEGS(seg);
1368 		PORT(rep) = PORT(seg);
1369 		NEXT(rep) = NULL;
1370 		(*rxq->elts)[idx] = rep;
1371 		/*
1372 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1373 		 * of the buffers are already known, only the buffer address
1374 		 * changes.
1375 		 */
1376 		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
1377 		if (len > DATA_LEN(seg)) {
1378 			len -= DATA_LEN(seg);
1379 			++NB_SEGS(pkt);
1380 			++rq_ci;
1381 			continue;
1382 		}
1383 		DATA_LEN(seg) = len;
1384 #ifdef MLX5_PMD_SOFT_COUNTERS
1385 		/* Increment bytes counter. */
1386 		rxq->stats.ibytes += PKT_LEN(pkt);
1387 #endif
1388 		/* Return packet. */
1389 		*(pkts++) = pkt;
1390 		pkt = NULL;
1391 		--pkts_n;
1392 		++i;
1393 skip:
1394 		/* Align consumer index to the next stride. */
1395 		rq_ci >>= sges_n;
1396 		++rq_ci;
1397 		rq_ci <<= sges_n;
1398 	}
1399 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1400 		return 0;
1401 	/* Update the consumer index. */
1402 	rxq->rq_ci = rq_ci >> sges_n;
1403 	rte_wmb();
1404 	*rxq->cq_db = htonl(rxq->cq_ci);
1405 	rte_wmb();
1406 	*rxq->rq_db = htonl(rxq->rq_ci);
1407 #ifdef MLX5_PMD_SOFT_COUNTERS
1408 	/* Increment packets counter. */
1409 	rxq->stats.ipackets += i;
1410 #endif
1411 	return i;
1412 }
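
/*
 * Minimal usage sketch (an assumption about the application side, not part
 * of the PMD): rte_eth_rx_burst() dispatches to mlx5_rx_burst() when the
 * port is bound to this driver; port_id, queue_id and handle_packet() are
 * placeholders:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx;
 *	uint16_t k;
 *
 *	nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *	for (k = 0; k != nb_rx; ++k)
 *		handle_packet(pkts[k]);
 */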
1413 
1414 /**
1415  * Dummy DPDK callback for TX.
1416  *
1417  * This function is used to temporarily replace the real callback during
1418  * unsafe control operations on the queue, or in case of error.
1419  *
1420  * @param dpdk_txq
1421  *   Generic pointer to TX queue structure.
1422  * @param[in] pkts
1423  *   Packets to transmit.
1424  * @param pkts_n
1425  *   Number of packets in array.
1426  *
1427  * @return
1428  *   Number of packets successfully transmitted (<= pkts_n).
1429  */
1430 uint16_t
1431 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1432 {
1433 	(void)dpdk_txq;
1434 	(void)pkts;
1435 	(void)pkts_n;
1436 	return 0;
1437 }
1438 
1439 /**
1440  * Dummy DPDK callback for RX.
1441  *
1442  * This function is used to temporarily replace the real callback during
1443  * unsafe control operations on the queue, or in case of error.
1444  *
1445  * @param dpdk_rxq
1446  *   Generic pointer to RX queue structure.
1447  * @param[out] pkts
1448  *   Array to store received packets.
1449  * @param pkts_n
1450  *   Maximum number of packets in array.
1451  *
1452  * @return
1453  *   Number of packets successfully received (<= pkts_n).
1454  */
1455 uint16_t
1456 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1457 {
1458 	(void)dpdk_rxq;
1459 	(void)pkts;
1460 	(void)pkts_n;
1461 	return 0;
1462 }
1463
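
/*
 * Hedged note on the two dummy callbacks above: the control path of this
 * PMD swaps them in while a queue is being reconfigured or released so
 * that concurrent datapath calls become harmless no-ops, along the lines
 * of:
 *
 *	dev->rx_pkt_burst = removed_rx_burst;
 *	dev->tx_pkt_burst = removed_tx_burst;
 *
 * The actual call sites live in the control-path files (mlx5.c and the
 * queue setup code), not here.
 */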