xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 8205e241b2b01c05f2cffe5158c053d614d1f68c)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <assert.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include <stdlib.h>
38 
39 /* Verbs header. */
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41 #ifdef PEDANTIC
42 #pragma GCC diagnostic ignored "-pedantic"
43 #endif
44 #include <infiniband/verbs.h>
45 #ifdef PEDANTIC
46 #pragma GCC diagnostic error "-pedantic"
47 #endif
48 
49 /* DPDK headers don't like -pedantic. */
50 #ifdef PEDANTIC
51 #pragma GCC diagnostic ignored "-pedantic"
52 #endif
53 #include <rte_mbuf.h>
54 #include <rte_mempool.h>
55 #include <rte_prefetch.h>
56 #include <rte_common.h>
57 #include <rte_branch_prediction.h>
58 #ifdef PEDANTIC
59 #pragma GCC diagnostic error "-pedantic"
60 #endif
61 
62 #include "mlx5.h"
63 #include "mlx5_utils.h"
64 #include "mlx5_rxtx.h"
65 #include "mlx5_defs.h"
66 
67 /**
68  * Manage TX completions.
69  *
70  * When sending a burst, mlx5_tx_burst() posts several WRs.
71  * To improve performance, a completion event is only required once every
72  * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
73  * for other WRs, but this information would not be used anyway.
74  *
75  * @param txq
76  *   Pointer to TX queue structure.
77  *
78  * @return
79  *   0 on success, -1 on failure.
80  */
81 static int
82 txq_complete(struct txq *txq)
83 {
84 	unsigned int elts_comp = txq->elts_comp;
85 	unsigned int elts_tail = txq->elts_tail;
86 	const unsigned int elts_n = txq->elts_n;
87 	int wcs_n;
88 
89 	if (unlikely(elts_comp == 0))
90 		return 0;
91 #ifdef DEBUG_SEND
92 	DEBUG("%p: processing %u work request completions",
93 	      (void *)txq, elts_comp);
94 #endif
95 	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
96 	if (unlikely(wcs_n == 0))
97 		return 0;
98 	if (unlikely(wcs_n < 0)) {
99 		DEBUG("%p: poll_cnt() failed (wcs_n=%d)",
100 		      (void *)txq, wcs_n);
101 		return -1;
102 	}
103 	elts_comp -= wcs_n;
104 	assert(elts_comp <= txq->elts_comp);
105 	/*
106 	 * Assume WC status is successful as nothing can be done about it
107 	 * anyway.
108 	 */
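	/*
	 * Completions are requested only once every elts_comp_cd_init sends
	 * (see mlx5_tx_burst()), so each polled completion accounts for that
	 * many ring entries: advance the tail accordingly and wrap it
	 * manually below.
	 */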
109 	elts_tail += wcs_n * txq->elts_comp_cd_init;
110 	if (elts_tail >= elts_n)
111 		elts_tail -= elts_n;
112 	txq->elts_tail = elts_tail;
113 	txq->elts_comp = elts_comp;
114 	return 0;
115 }
116 
117 /**
118  * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
119  * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
120  * remove an entry first.
121  *
122  * @param txq
123  *   Pointer to TX queue structure.
124  * @param[in] mp
125  *   Memory Pool for which a Memory Region lkey must be returned.
126  *
127  * @return
128  *   mr->lkey on success, (uint32_t)-1 on failure.
129  */
130 static uint32_t
131 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
132 {
133 	unsigned int i;
134 	struct ibv_mr *mr;
135 
136 	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
137 		if (unlikely(txq->mp2mr[i].mp == NULL)) {
138 			/* Unknown MP, add a new MR for it. */
139 			break;
140 		}
141 		if (txq->mp2mr[i].mp == mp) {
142 			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
143 			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
144 			return txq->mp2mr[i].lkey;
145 		}
146 	}
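	/*
	 * No cached entry matched: "i" now points either to the first free
	 * slot or one past the last entry when the table is full. The table
	 * is searched linearly on the data path, so it is kept small and
	 * entries are stored in insertion order (index 0 is the oldest).
	 */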
147 	/* Add a new entry, register MR first. */
148 	DEBUG("%p: discovered new memory pool %p", (void *)txq, (void *)mp);
149 	mr = ibv_reg_mr(txq->priv->pd,
150 			(void *)mp->elt_va_start,
151 			(mp->elt_va_end - mp->elt_va_start),
152 			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
153 	if (unlikely(mr == NULL)) {
154 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
155 		      (void *)txq);
156 		return (uint32_t)-1;
157 	}
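	/*
	 * When the table is full, evict in FIFO order: deregister the oldest
	 * MR (index 0), shift the remaining entries down and reuse the freed
	 * last slot for the new registration.
	 */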
158 	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
159 		/* Table is full, remove oldest entry. */
160 		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
161 		      (void *)txq);
162 		--i;
163 		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
164 		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
165 			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
166 	}
167 	/* Store the new entry. */
168 	txq->mp2mr[i].mp = mp;
169 	txq->mp2mr[i].mr = mr;
170 	txq->mp2mr[i].lkey = mr->lkey;
171 	DEBUG("%p: new MR lkey for MP %p: 0x%08" PRIx32,
172 	      (void *)txq, (void *)mp, txq->mp2mr[i].lkey);
173 	return txq->mp2mr[i].lkey;
174 }
175 
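/*
 * The scattered TX support below is only compiled in when more than one
 * scatter/gather element per work request is configured (MLX5_PMD_SGE_WR_N).
 */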
176 #if MLX5_PMD_SGE_WR_N > 1
177 
178 /**
179  * Copy scattered mbuf contents to a single linear buffer.
180  *
181  * @param[out] linear
182  *   Linear output buffer.
183  * @param[in] buf
184  *   Scattered input buffer.
185  *
186  * @return
187  *   Number of bytes copied to the output buffer or 0 if not large enough.
188  */
189 static unsigned int
190 linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
191 {
192 	unsigned int size = 0;
193 	unsigned int offset;
194 
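	/* Copy each segment back to back into the caller's linear buffer,
	 * bailing out if the total length would overflow it. */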
195 	do {
196 		unsigned int len = DATA_LEN(buf);
197 
198 		offset = size;
199 		size += len;
200 		if (unlikely(size > sizeof(*linear)))
201 			return 0;
202 		memcpy(&(*linear)[offset],
203 		       rte_pktmbuf_mtod(buf, uint8_t *),
204 		       len);
205 		buf = NEXT(buf);
206 	} while (buf != NULL);
207 	return size;
208 }
209 
210 /**
211  * Handle scattered buffers for mlx5_tx_burst().
212  *
213  * @param txq
214  *   TX queue structure.
215  * @param segs
216  *   Number of segments in buf.
217  * @param elt
218  *   TX queue element to fill.
219  * @param[in] buf
220  *   Buffer to process.
221  * @param elts_head
222  *   Index of the linear buffer to use if necessary (normally txq->elts_head).
223  * @param[out] sges
224  *   Array filled with SGEs on success.
225  *
226  * @return
227  *   A structure containing the processed packet size in bytes and the
228  *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
229  *   failure.
230  */
231 static struct tx_burst_sg_ret {
232 	unsigned int length;
233 	unsigned int num;
234 }
235 tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
236 	    struct rte_mbuf *buf, unsigned int elts_head,
237 	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
238 {
239 	unsigned int sent_size = 0;
240 	unsigned int j;
241 	int linearize = 0;
242 
243 	/* When there are too many segments, extra segments are
244 	 * linearized in the last SGE. */
245 	if (unlikely(segs > RTE_DIM(*sges))) {
246 		segs = (RTE_DIM(*sges) - 1);
247 		linearize = 1;
248 	}
249 	/* Update element. */
250 	elt->buf = buf;
251 	/* Register segments as SGEs. */
252 	for (j = 0; (j != segs); ++j) {
253 		struct ibv_sge *sge = &(*sges)[j];
254 		uint32_t lkey;
255 
256 		/* Retrieve Memory Region key for this memory pool. */
257 		lkey = txq_mp2mr(txq, buf->pool);
258 		if (unlikely(lkey == (uint32_t)-1)) {
259 			/* MR does not exist. */
260 			DEBUG("%p: unable to get MP <-> MR association",
261 			      (void *)txq);
262 			/* Clean up TX element. */
263 			elt->buf = NULL;
264 			goto stop;
265 		}
266 		/* Update SGE. */
267 		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
268 		if (txq->priv->vf)
269 			rte_prefetch0((volatile void *)
270 				      (uintptr_t)sge->addr);
271 		sge->length = DATA_LEN(buf);
272 		sge->lkey = lkey;
273 		sent_size += sge->length;
274 		buf = NEXT(buf);
275 	}
276 	/* If buf is not NULL here and is not going to be linearized,
277 	 * nb_segs is not valid. */
278 	assert(j == segs);
279 	assert((buf == NULL) || (linearize));
280 	/* Linearize extra segments. */
281 	if (linearize) {
282 		struct ibv_sge *sge = &(*sges)[segs];
283 		linear_t *linear = &(*txq->elts_linear)[elts_head];
284 		unsigned int size = linearize_mbuf(linear, buf);
285 
286 		assert(segs == (RTE_DIM(*sges) - 1));
287 		if (size == 0) {
288 			/* Invalid packet. */
289 			DEBUG("%p: packet too large to be linearized.",
290 			      (void *)txq);
291 			/* Clean up TX element. */
292 			elt->buf = NULL;
293 			goto stop;
294 		}
295 		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
296 		if (RTE_DIM(*sges) == 1) {
297 			do {
298 				struct rte_mbuf *next = NEXT(buf);
299 
300 				rte_pktmbuf_free_seg(buf);
301 				buf = next;
302 			} while (buf != NULL);
303 			elt->buf = NULL;
304 		}
305 		/* Update SGE. */
306 		sge->addr = (uintptr_t)&(*linear)[0];
307 		sge->length = size;
308 		sge->lkey = txq->mr_linear->lkey;
309 		sent_size += size;
310 	}
311 	return (struct tx_burst_sg_ret){
312 		.length = sent_size,
313 		.num = segs,
314 	};
315 stop:
316 	return (struct tx_burst_sg_ret){
317 		.length = -1,
318 		.num = -1,
319 	};
320 }
321 
322 #endif /* MLX5_PMD_SGE_WR_N > 1 */
323 
324 /**
325  * DPDK callback for TX.
326  *
327  * @param dpdk_txq
328  *   Generic pointer to TX queue structure.
329  * @param[in] pkts
330  *   Packets to transmit.
331  * @param pkts_n
332  *   Number of packets in array.
333  *
334  * @return
335  *   Number of packets successfully transmitted (<= pkts_n).
336  */
337 uint16_t
338 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
339 {
340 	struct txq *txq = (struct txq *)dpdk_txq;
341 	unsigned int elts_head = txq->elts_head;
342 	const unsigned int elts_tail = txq->elts_tail;
343 	const unsigned int elts_n = txq->elts_n;
344 	unsigned int elts_comp_cd = txq->elts_comp_cd;
345 	unsigned int elts_comp = 0;
346 	unsigned int i;
347 	unsigned int max;
348 	int err;
349 
350 	assert(elts_comp_cd != 0);
351 	txq_complete(txq);
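	/*
	 * Compute the number of free ring entries. Both indexes stay within
	 * [0, elts_n), so the subtraction relies on unsigned wraparound when
	 * elts_head is numerically smaller than elts_tail (i.e. the head has
	 * wrapped around the ring); the check below folds the result back
	 * into range.
	 */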
352 	max = (elts_n - (elts_head - elts_tail));
353 	if (max > elts_n)
354 		max -= elts_n;
355 	assert(max >= 1);
356 	assert(max <= elts_n);
357 	/* Always leave one free entry in the ring. */
358 	--max;
359 	if (max == 0)
360 		return 0;
361 	if (max > pkts_n)
362 		max = pkts_n;
363 	for (i = 0; (i != max); ++i) {
364 		struct rte_mbuf *buf = pkts[i];
365 		unsigned int elts_head_next =
366 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
367 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
368 		struct txq_elt *elt = &(*txq->elts)[elts_head];
369 		unsigned int segs = NB_SEGS(buf);
370 #ifdef MLX5_PMD_SOFT_COUNTERS
371 		unsigned int sent_size = 0;
372 #endif
373 		uint32_t send_flags = 0;
374 
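		/*
		 * This slot was last used elts_n packets ago; its mbuf can be
		 * freed now because the free-entry accounting above (based on
		 * elts_tail updated by txq_complete()) guarantees that the
		 * corresponding send has completed.
		 */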
375 		/* Clean up old buffer. */
376 		if (likely(elt->buf != NULL)) {
377 			struct rte_mbuf *tmp = elt->buf;
378 
379 			/* Faster than rte_pktmbuf_free(). */
380 			do {
381 				struct rte_mbuf *next = NEXT(tmp);
382 
383 				rte_pktmbuf_free_seg(tmp);
384 				tmp = next;
385 			} while (tmp != NULL);
386 		}
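		/*
		 * elts_comp_cd counts down once per posted packet; when it
		 * reaches zero the WR is flagged as signaled, so that
		 * txq_complete() receives one completion for every
		 * elts_comp_cd_init packets.
		 */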
387 		/* Request TX completion. */
388 		if (unlikely(--elts_comp_cd == 0)) {
389 			elts_comp_cd = txq->elts_comp_cd_init;
390 			++elts_comp;
391 			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
392 		}
393 		/* Should we enable HW CKSUM offload? */
394 		if (buf->ol_flags &
395 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
396 			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
397 			/* HW does not support checksum offloads at arbitrary
398 			 * offsets but automatically recognizes the packet
399 			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
400 			 * tunnels are currently supported. */
401 			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
402 				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
403 		}
404 		if (likely(segs == 1)) {
405 			uintptr_t addr;
406 			uint32_t length;
407 			uint32_t lkey;
408 
409 			/* Retrieve buffer information. */
410 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
411 			length = DATA_LEN(buf);
412 			/* Retrieve Memory Region key for this memory pool. */
413 			lkey = txq_mp2mr(txq, buf->pool);
414 			if (unlikely(lkey == (uint32_t)-1)) {
415 				/* MR does not exist. */
416 				DEBUG("%p: unable to get MP <-> MR"
417 				      " association", (void *)txq);
418 				/* Clean up TX element. */
419 				elt->buf = NULL;
420 				goto stop;
421 			}
422 			/* Update element. */
423 			elt->buf = buf;
424 			if (txq->priv->vf)
425 				rte_prefetch0((volatile void *)
426 					      (uintptr_t)addr);
427 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
428 			/* Put packet into send queue. */
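			/*
			 * Packets no longer than txq->max_inline are copied
			 * into the WQE itself (send_pending_inline), so the
			 * NIC does not have to DMA-read the mbuf; larger
			 * packets are posted by address, length and lkey.
			 */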
429 #if MLX5_PMD_MAX_INLINE > 0
430 			if (length <= txq->max_inline)
431 				err = txq->if_qp->send_pending_inline
432 					(txq->qp,
433 					 (void *)addr,
434 					 length,
435 					 send_flags);
436 			else
437 #endif
438 				err = txq->if_qp->send_pending
439 					(txq->qp,
440 					 addr,
441 					 length,
442 					 lkey,
443 					 send_flags);
444 			if (unlikely(err))
445 				goto stop;
446 #ifdef MLX5_PMD_SOFT_COUNTERS
447 			sent_size += length;
448 #endif
449 		} else {
450 #if MLX5_PMD_SGE_WR_N > 1
451 			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
452 			struct tx_burst_sg_ret ret;
453 
454 			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
455 					  &sges);
456 			if (ret.length == (unsigned int)-1)
457 				goto stop;
458 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
459 			/* Put SG list into send queue. */
460 			err = txq->if_qp->send_pending_sg_list
461 				(txq->qp,
462 				 sges,
463 				 ret.num,
464 				 send_flags);
465 			if (unlikely(err))
466 				goto stop;
467 #ifdef MLX5_PMD_SOFT_COUNTERS
468 			sent_size += ret.length;
469 #endif
470 #else /* MLX5_PMD_SGE_WR_N > 1 */
471 			DEBUG("%p: TX scattered buffers support not"
472 			      " compiled in", (void *)txq);
473 			goto stop;
474 #endif /* MLX5_PMD_SGE_WR_N > 1 */
475 		}
476 		elts_head = elts_head_next;
477 #ifdef MLX5_PMD_SOFT_COUNTERS
478 		/* Increment sent bytes counter. */
479 		txq->stats.obytes += sent_size;
480 #endif
481 	}
482 stop:
483 	/* Take a shortcut if nothing must be sent. */
484 	if (unlikely(i == 0))
485 		return 0;
486 #ifdef MLX5_PMD_SOFT_COUNTERS
487 	/* Increment sent packets counter. */
488 	txq->stats.opackets += i;
489 #endif
490 	/* Ring QP doorbell. */
491 	err = txq->if_qp->send_flush(txq->qp);
492 	if (unlikely(err)) {
493 		/* A nonzero value is not supposed to be returned.
494 		 * Nothing can be done about it. */
495 		DEBUG("%p: send_flush() failed with error %d",
496 		      (void *)txq, err);
497 	}
498 	txq->elts_head = elts_head;
499 	txq->elts_comp += elts_comp;
500 	txq->elts_comp_cd = elts_comp_cd;
501 	return i;
502 }
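
/*
 * Usage sketch (application side, not part of this file): once the PMD has
 * installed mlx5_tx_burst() as the device tx_pkt_burst callback, it is
 * reached through the generic burst API, e.g.:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n;	// number of valid entries in pkts[]
 *	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, n);
 *	// mbufs not accepted (pkts[sent]..pkts[n - 1]) remain owned by the
 *	// caller and may be retried or freed.
 */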
503 
504 /**
505  * Translate RX completion flags to packet type.
506  *
507  * @param flags
508  *   RX completion flags returned by poll_length_flags().
509  *
510  * @return
511  *   Packet type for struct rte_mbuf.
512  */
513 static inline uint32_t
514 rxq_cq_to_pkt_type(uint32_t flags)
515 {
516 	uint32_t pkt_type;
517 
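	/*
	 * TRANSPOSE() moves a single flag bit from its position in the
	 * completion flags to the corresponding RTE_PTYPE_* bit. Tunnelled
	 * packets report outer and inner L3 information separately, hence
	 * the two mappings below.
	 */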
518 	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
519 		pkt_type =
520 			TRANSPOSE(flags,
521 				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
522 				  RTE_PTYPE_L3_IPV4) |
523 			TRANSPOSE(flags,
524 				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
525 				  RTE_PTYPE_L3_IPV6) |
526 			TRANSPOSE(flags,
527 				  IBV_EXP_CQ_RX_IPV4_PACKET,
528 				  RTE_PTYPE_INNER_L3_IPV4) |
529 			TRANSPOSE(flags,
530 				  IBV_EXP_CQ_RX_IPV6_PACKET,
531 				  RTE_PTYPE_INNER_L3_IPV6);
532 	else
533 		pkt_type =
534 			TRANSPOSE(flags,
535 				  IBV_EXP_CQ_RX_IPV4_PACKET,
536 				  RTE_PTYPE_L3_IPV4) |
537 			TRANSPOSE(flags,
538 				  IBV_EXP_CQ_RX_IPV6_PACKET,
539 				  RTE_PTYPE_L3_IPV6);
540 	return pkt_type;
541 }
542 
543 /**
544  * Translate RX completion flags to offload flags.
545  *
546  * @param[in] rxq
547  *   Pointer to RX queue structure.
548  * @param flags
549  *   RX completion flags returned by poll_length_flags().
550  *
551  * @return
552  *   Offload flags (ol_flags) for struct rte_mbuf.
553  */
554 static inline uint32_t
555 rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
556 {
557 	uint32_t ol_flags = 0;
558 
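	/*
	 * The hardware reports "checksum OK" bits; transposing the
	 * complemented flags turns a missing OK bit into the matching
	 * PKT_RX_*_CKSUM_BAD flag.
	 */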
559 	if (rxq->csum)
560 		ol_flags |=
561 			TRANSPOSE(~flags,
562 				  IBV_EXP_CQ_RX_IP_CSUM_OK,
563 				  PKT_RX_IP_CKSUM_BAD) |
564 			TRANSPOSE(~flags,
565 				  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
566 				  PKT_RX_L4_CKSUM_BAD);
567 	/*
568 	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
569 	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
570 	 * (its value is 0).
571 	 */
572 	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
573 		ol_flags |=
574 			TRANSPOSE(~flags,
575 				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
576 				  PKT_RX_IP_CKSUM_BAD) |
577 			TRANSPOSE(~flags,
578 				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
579 				  PKT_RX_L4_CKSUM_BAD);
580 	return ol_flags;
581 }
582 
583 /**
584  * DPDK callback for RX with scattered packets support.
585  *
586  * @param dpdk_rxq
587  *   Generic pointer to RX queue structure.
588  * @param[out] pkts
589  *   Array to store received packets.
590  * @param pkts_n
591  *   Maximum number of packets in array.
592  *
593  * @return
594  *   Number of packets successfully received (<= pkts_n).
595  */
596 uint16_t
597 mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
598 {
599 	struct rxq *rxq = (struct rxq *)dpdk_rxq;
600 	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
601 	const unsigned int elts_n = rxq->elts_n;
602 	unsigned int elts_head = rxq->elts_head;
603 	unsigned int i;
604 	unsigned int pkts_ret = 0;
605 	int ret;
606 
607 	if (unlikely(!rxq->sp))
608 		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
609 	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
610 		return 0;
611 	for (i = 0; (i != pkts_n); ++i) {
612 		struct rxq_elt_sp *elt = &(*elts)[elts_head];
613 		unsigned int len;
614 		unsigned int pkt_buf_len;
615 		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
616 		struct rte_mbuf **pkt_buf_next = &pkt_buf;
617 		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
618 		unsigned int j = 0;
619 		uint32_t flags;
620 
621 		/* Sanity checks. */
622 		assert(elts_head < rxq->elts_n);
623 		assert(rxq->elts_head < rxq->elts_n);
624 		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
625 						    &flags);
626 		if (unlikely(ret < 0)) {
627 			struct ibv_wc wc;
628 			int wcs_n;
629 
630 			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
631 			      (void *)rxq, ret);
632 			/* ibv_poll_cq() must be used in case of failure. */
633 			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
634 			if (unlikely(wcs_n == 0))
635 				break;
636 			if (unlikely(wcs_n < 0)) {
637 				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
638 				      (void *)rxq, wcs_n);
639 				break;
640 			}
641 			assert(wcs_n == 1);
642 			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
643 				/* Whatever, just repost the offending WR. */
644 				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
645 				      " completion status (%d): %s",
646 				      (void *)rxq, wc.wr_id, wc.status,
647 				      ibv_wc_status_str(wc.status));
648 #ifdef MLX5_PMD_SOFT_COUNTERS
649 				/* Increment dropped packets counter. */
650 				++rxq->stats.idropped;
651 #endif
652 				goto repost;
653 			}
654 			ret = wc.byte_len;
655 		}
656 		if (ret == 0)
657 			break;
658 		len = ret;
659 		pkt_buf_len = len;
660 		/*
661 		 * Replace spent segments with new ones, concatenate and
662 		 * return them as pkt_buf.
663 		 */
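		/*
		 * If allocating a replacement mbuf fails below, the partially
		 * built packet is freed and the element is reposted: the
		 * packet is dropped but no ring entry is lost.
		 */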
664 		while (1) {
665 			struct ibv_sge *sge = &elt->sges[j];
666 			struct rte_mbuf *seg = elt->bufs[j];
667 			struct rte_mbuf *rep;
668 			unsigned int seg_tailroom;
669 
670 			assert(seg != NULL);
671 			/*
672 			 * Fetch initial bytes of packet descriptor into a
673 			 * cacheline while allocating rep.
674 			 */
675 			rte_prefetch0(seg);
676 			rep = __rte_mbuf_raw_alloc(rxq->mp);
677 			if (unlikely(rep == NULL)) {
678 				/*
679 				 * Unable to allocate a replacement mbuf,
680 				 * repost WR.
681 				 */
682 				DEBUG("rxq=%p: can't allocate a new mbuf",
683 				      (void *)rxq);
684 				if (pkt_buf != NULL) {
685 					*pkt_buf_next = NULL;
686 					rte_pktmbuf_free(pkt_buf);
687 				}
688 				/* Increment out of memory counters. */
689 				++rxq->stats.rx_nombuf;
690 				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
691 				goto repost;
692 			}
693 #ifndef NDEBUG
694 			/* Poison user-modifiable fields in rep. */
695 			NEXT(rep) = (void *)((uintptr_t)-1);
696 			SET_DATA_OFF(rep, 0xdead);
697 			DATA_LEN(rep) = 0xd00d;
698 			PKT_LEN(rep) = 0xdeadd00d;
699 			NB_SEGS(rep) = 0x2a;
700 			PORT(rep) = 0x2a;
701 			rep->ol_flags = -1;
702 #endif
703 			assert(rep->buf_len == seg->buf_len);
704 			assert(rep->buf_len == rxq->mb_len);
705 			/* Reconfigure sge to use rep instead of seg. */
706 			assert(sge->lkey == rxq->mr->lkey);
707 			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
708 			elt->bufs[j] = rep;
709 			++j;
710 			/* Update pkt_buf if it's the first segment, or link
711 			 * seg to the previous one and update pkt_buf_next. */
712 			*pkt_buf_next = seg;
713 			pkt_buf_next = &NEXT(seg);
714 			/* Update seg information. */
715 			seg_tailroom = (seg->buf_len - seg_headroom);
716 			assert(sge->length == seg_tailroom);
717 			SET_DATA_OFF(seg, seg_headroom);
718 			if (likely(len <= seg_tailroom)) {
719 				/* Last segment. */
720 				DATA_LEN(seg) = len;
721 				PKT_LEN(seg) = len;
722 				/* Sanity check. */
723 				assert(rte_pktmbuf_headroom(seg) ==
724 				       seg_headroom);
725 				assert(rte_pktmbuf_tailroom(seg) ==
726 				       (seg_tailroom - len));
727 				break;
728 			}
729 			DATA_LEN(seg) = seg_tailroom;
730 			PKT_LEN(seg) = seg_tailroom;
731 			/* Sanity check. */
732 			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
733 			assert(rte_pktmbuf_tailroom(seg) == 0);
734 			/* Fix len and clear headroom for next segments. */
735 			len -= seg_tailroom;
736 			seg_headroom = 0;
737 		}
738 		/* Update head and tail segments. */
739 		*pkt_buf_next = NULL;
740 		assert(pkt_buf != NULL);
741 		assert(j != 0);
742 		NB_SEGS(pkt_buf) = j;
743 		PORT(pkt_buf) = rxq->port_id;
744 		PKT_LEN(pkt_buf) = pkt_buf_len;
745 		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
746 		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
747 
748 		/* Return packet. */
749 		*(pkts++) = pkt_buf;
750 		++pkts_ret;
751 #ifdef MLX5_PMD_SOFT_COUNTERS
752 		/* Increment bytes counter. */
753 		rxq->stats.ibytes += pkt_buf_len;
754 #endif
755 repost:
756 		ret = rxq->if_wq->recv_sg_list(rxq->wq,
757 					       elt->sges,
758 					       RTE_DIM(elt->sges));
759 		if (unlikely(ret)) {
760 			/* Inability to repost WRs is fatal. */
761 			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
762 			      (void *)rxq->priv,
763 			      ret);
764 			abort();
765 		}
766 		if (++elts_head >= elts_n)
767 			elts_head = 0;
768 		continue;
769 	}
770 	if (unlikely(i == 0))
771 		return 0;
772 	rxq->elts_head = elts_head;
773 #ifdef MLX5_PMD_SOFT_COUNTERS
774 	/* Increment packets counter. */
775 	rxq->stats.ipackets += pkts_ret;
776 #endif
777 	return pkts_ret;
778 }
779 
780 /**
781  * DPDK callback for RX.
782  *
783  * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
784  * manage scattered packets. It improves performance when the MRU is lower
785  * than the size of the first segment.
786  *
787  * @param dpdk_rxq
788  *   Generic pointer to RX queue structure.
789  * @param[out] pkts
790  *   Array to store received packets.
791  * @param pkts_n
792  *   Maximum number of packets in array.
793  *
794  * @return
795  *   Number of packets successfully received (<= pkts_n).
796  */
797 uint16_t
798 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
799 {
800 	struct rxq *rxq = (struct rxq *)dpdk_rxq;
801 	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
802 	const unsigned int elts_n = rxq->elts_n;
803 	unsigned int elts_head = rxq->elts_head;
804 	struct ibv_sge sges[pkts_n];
805 	unsigned int i;
806 	unsigned int pkts_ret = 0;
807 	int ret;
808 
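	/*
	 * SGEs of processed elements are accumulated in sges[] and reposted
	 * in a single recv_burst() call after the loop; on a completion
	 * error or an mbuf allocation failure the element keeps its current
	 * buffer and its existing SGE is reposted unchanged.
	 */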
809 	if (unlikely(rxq->sp))
810 		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
811 	for (i = 0; (i != pkts_n); ++i) {
812 		struct rxq_elt *elt = &(*elts)[elts_head];
813 		unsigned int len;
814 		struct rte_mbuf *seg = elt->buf;
815 		struct rte_mbuf *rep;
816 		uint32_t flags;
817 
818 		/* Sanity checks. */
819 		assert(seg != NULL);
820 		assert(elts_head < rxq->elts_n);
821 		assert(rxq->elts_head < rxq->elts_n);
822 		/*
823 		 * Fetch initial bytes of packet descriptor into a
824 		 * cacheline while allocating rep.
825 		 */
826 		rte_prefetch0(seg);
827 		rte_prefetch0(&seg->cacheline1);
828 		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
829 						    &flags);
830 		if (unlikely(ret < 0)) {
831 			struct ibv_wc wc;
832 			int wcs_n;
833 
834 			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
835 			      (void *)rxq, ret);
836 			/* ibv_poll_cq() must be used in case of failure. */
837 			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
838 			if (unlikely(wcs_n == 0))
839 				break;
840 			if (unlikely(wcs_n < 0)) {
841 				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
842 				      (void *)rxq, wcs_n);
843 				break;
844 			}
845 			assert(wcs_n == 1);
846 			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
847 				/* Whatever, just repost the offending WR. */
848 				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
849 				      " completion status (%d): %s",
850 				      (void *)rxq, wc.wr_id, wc.status,
851 				      ibv_wc_status_str(wc.status));
852 #ifdef MLX5_PMD_SOFT_COUNTERS
853 				/* Increment dropped packets counter. */
854 				++rxq->stats.idropped;
855 #endif
856 				/* Add SGE to array for repost. */
857 				sges[i] = elt->sge;
858 				goto repost;
859 			}
860 			ret = wc.byte_len;
861 		}
862 		if (ret == 0)
863 			break;
864 		len = ret;
865 		rep = __rte_mbuf_raw_alloc(rxq->mp);
866 		if (unlikely(rep == NULL)) {
867 			/*
868 			 * Unable to allocate a replacement mbuf,
869 			 * repost WR.
870 			 */
871 			DEBUG("rxq=%p: can't allocate a new mbuf",
872 			      (void *)rxq);
873 			/* Increment out of memory counters. */
874 			++rxq->stats.rx_nombuf;
875 			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			/* Add SGE to array for repost, as in the completion
			 * error path above. */
			sges[i] = elt->sge;
876 			goto repost;
877 		}
878 
879 		/* Reconfigure sge to use rep instead of seg. */
880 		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
881 		assert(elt->sge.lkey == rxq->mr->lkey);
882 		elt->buf = rep;
883 
884 		/* Add SGE to array for repost. */
885 		sges[i] = elt->sge;
886 
887 		/* Update seg information. */
888 		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
889 		NB_SEGS(seg) = 1;
890 		PORT(seg) = rxq->port_id;
891 		NEXT(seg) = NULL;
892 		PKT_LEN(seg) = len;
893 		DATA_LEN(seg) = len;
894 		seg->packet_type = rxq_cq_to_pkt_type(flags);
895 		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
896 
897 		/* Return packet. */
898 		*(pkts++) = seg;
899 		++pkts_ret;
900 #ifdef MLX5_PMD_SOFT_COUNTERS
901 		/* Increment bytes counter. */
902 		rxq->stats.ibytes += len;
903 #endif
904 repost:
905 		if (++elts_head >= elts_n)
906 			elts_head = 0;
907 		continue;
908 	}
909 	if (unlikely(i == 0))
910 		return 0;
911 	/* Repost WRs. */
912 #ifdef DEBUG_RECV
913 	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
914 #endif
915 	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
916 	if (unlikely(ret)) {
917 		/* Inability to repost WRs is fatal. */
918 		DEBUG("%p: recv_burst(): failed (ret=%d)",
919 		      (void *)rxq->priv,
920 		      ret);
921 		abort();
922 	}
923 	rxq->elts_head = elts_head;
924 #ifdef MLX5_PMD_SOFT_COUNTERS
925 	/* Increment packets counter. */
926 	rxq->stats.ipackets += pkts_ret;
927 #endif
928 	return pkts_ret;
929 }
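
/*
 * Usage sketch (application side, not part of this file): the Rx queue setup
 * code selects either mlx5_rx_burst() or mlx5_rx_burst_sp() as the device
 * rx_pkt_burst callback depending on whether scattered RX is enabled;
 * applications reach it through the generic burst API, e.g.:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_eth_rx_burst(port_id, queue_id, pkts, RTE_DIM(pkts));
 *	// pkts[0]..pkts[n - 1] are now owned by the caller.
 */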
930 
931 /**
932  * Dummy DPDK callback for TX.
933  *
934  * This function is used to temporarily replace the real callback during
935  * unsafe control operations on the queue, or in case of error.
936  *
937  * @param dpdk_txq
938  *   Generic pointer to TX queue structure.
939  * @param[in] pkts
940  *   Packets to transmit.
941  * @param pkts_n
942  *   Number of packets in array.
943  *
944  * @return
945  *   Number of packets successfully transmitted (<= pkts_n).
946  */
947 uint16_t
948 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
949 {
950 	(void)dpdk_txq;
951 	(void)pkts;
952 	(void)pkts_n;
953 	return 0;
954 }
955 
956 /**
957  * Dummy DPDK callback for RX.
958  *
959  * This function is used to temporarily replace the real callback during
960  * unsafe control operations on the queue, or in case of error.
961  *
962  * @param dpdk_rxq
963  *   Generic pointer to RX queue structure.
964  * @param[out] pkts
965  *   Array to store received packets.
966  * @param pkts_n
967  *   Maximum number of packets in array.
968  *
969  * @return
970  *   Number of packets successfully received (<= pkts_n).
971  */
972 uint16_t
973 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
974 {
975 	(void)dpdk_rxq;
976 	(void)pkts;
977 	(void)pkts_n;
978 	return 0;
979 }
980