xref: /dpdk/drivers/net/mlx5/mlx5_txpp.c (revision 10b71caecbe1cddcbb65c050ca775fba575e88db)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2020 Mellanox Technologies, Ltd
3  */
4 #include <fcntl.h>
5 #include <stdint.h>
6 
7 #include <rte_ether.h>
8 #include <rte_ethdev_driver.h>
9 #include <rte_interrupts.h>
10 #include <rte_alarm.h>
11 #include <rte_malloc.h>
12 #include <rte_cycles.h>
13 #include <rte_eal_paging.h>
14 
15 #include <mlx5_malloc.h>
16 
17 #include "mlx5.h"
18 #include "mlx5_rxtx.h"
19 #include "mlx5_common_os.h"
20 
21 static const char * const mlx5_txpp_stat_names[] = {
22 	"txpp_err_miss_int", /* Missed service interrupt. */
23 	"txpp_err_rearm_queue",	/* Rearm Queue errors. */
24 	"txpp_err_clock_queue", /* Clock Queue errors. */
25 	"txpp_err_ts_past", /* Timestamp in the past. */
26 	"txpp_err_ts_future", /* Timestamp in the distant future. */
27 	"txpp_jitter", /* Timestamp jitter (one Clock Queue completion). */
28 	"txpp_wander", /* Timestamp jitter (half of Clock Queue completions). */
29 	"txpp_sync_lost", /* Scheduling synchronization lost. */
30 };
31 
32 /* Destroy Event Queue Notification Channel. */
33 static void
34 mlx5_txpp_destroy_eqn(struct mlx5_dev_ctx_shared *sh)
35 {
36 	if (sh->txpp.echan) {
37 		mlx5_glue->devx_destroy_event_channel(sh->txpp.echan);
38 		sh->txpp.echan = NULL;
39 	}
40 	sh->txpp.eqn = 0;
41 }
42 
43 /* Create Event Queue Notification Channel. */
44 static int
45 mlx5_txpp_create_eqn(struct mlx5_dev_ctx_shared *sh)
46 {
47 	uint32_t lcore;
48 
49 	MLX5_ASSERT(!sh->txpp.echan);
50 	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
51 	if (mlx5_glue->devx_query_eqn(sh->ctx, lcore, &sh->txpp.eqn)) {
52 		rte_errno = errno;
53 		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
54 		sh->txpp.eqn = 0;
55 		return -rte_errno;
56 	}
57 	sh->txpp.echan = mlx5_glue->devx_create_event_channel(sh->ctx,
58 			MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
59 	if (!sh->txpp.echan) {
60 		sh->txpp.eqn = 0;
61 		rte_errno = errno;
62 		DRV_LOG(ERR, "Failed to create event channel %d.",
63 			rte_errno);
64 		return -rte_errno;
65 	}
66 	return 0;
67 }
68 
69 static void
70 mlx5_txpp_free_pp_index(struct mlx5_dev_ctx_shared *sh)
71 {
72 	if (sh->txpp.pp) {
73 		mlx5_glue->dv_free_pp(sh->txpp.pp);
74 		sh->txpp.pp = NULL;
75 		sh->txpp.pp_id = 0;
76 	}
77 }
78 
79 /* Allocate Packet Pacing index from kernel via mlx5dv call. */
80 static int
81 mlx5_txpp_alloc_pp_index(struct mlx5_dev_ctx_shared *sh)
82 {
83 #ifdef HAVE_MLX5DV_PP_ALLOC
84 	uint32_t pp[MLX5_ST_SZ_DW(set_pp_rate_limit_context)];
85 	uint64_t rate;
86 
87 	MLX5_ASSERT(!sh->txpp.pp);
88 	memset(&pp, 0, sizeof(pp));
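	/*
	 * The scheduling granularity (tick) is given in nanoseconds, so the
	 * pacing rate below is the number of scheduling periods per second
	 * (converted to kilobits per second in test mode).
	 */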
89 	rate = NS_PER_S / sh->txpp.tick;
90 	if (rate * sh->txpp.tick != NS_PER_S)
91 		DRV_LOG(WARNING, "Packet pacing frequency is not precise.");
92 	if (sh->txpp.test) {
93 		uint32_t len;
94 
95 		len = RTE_MAX(MLX5_TXPP_TEST_PKT_SIZE,
96 			      (size_t)RTE_ETHER_MIN_LEN);
97 		MLX5_SET(set_pp_rate_limit_context, &pp,
98 			 burst_upper_bound, len);
99 		MLX5_SET(set_pp_rate_limit_context, &pp,
100 			 typical_packet_size, len);
101 		/* Convert the packet rate into kilobits per second. */
102 		rate = (rate * len) / (1000ul / CHAR_BIT);
103 		DRV_LOG(INFO, "Packet pacing rate set to %" PRIu64, rate);
104 	}
105 	MLX5_SET(set_pp_rate_limit_context, &pp, rate_limit, rate);
106 	MLX5_SET(set_pp_rate_limit_context, &pp, rate_mode,
107 		 sh->txpp.test ? MLX5_DATA_RATE : MLX5_WQE_RATE);
108 	sh->txpp.pp = mlx5_glue->dv_alloc_pp
109 				(sh->ctx, sizeof(pp), &pp,
110 				 MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX);
111 	if (sh->txpp.pp == NULL) {
112 		DRV_LOG(ERR, "Failed to allocate packet pacing index.");
113 		rte_errno = errno;
114 		return -errno;
115 	}
116 	if (!sh->txpp.pp->index) {
117 		DRV_LOG(ERR, "Zero packet pacing index allocated.");
118 		mlx5_txpp_free_pp_index(sh);
119 		rte_errno = ENOTSUP;
120 		return -ENOTSUP;
121 	}
122 	sh->txpp.pp_id = sh->txpp.pp->index;
123 	return 0;
124 #else
125 	RTE_SET_USED(sh);
126 	DRV_LOG(ERR, "Allocating pacing index is not supported.");
127 	rte_errno = ENOTSUP;
128 	return -ENOTSUP;
129 #endif
130 }
131 
132 static void
133 mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)
134 {
135 	if (wq->sq)
136 		claim_zero(mlx5_devx_cmd_destroy(wq->sq));
137 	if (wq->sq_umem)
138 		claim_zero(mlx5_glue->devx_umem_dereg(wq->sq_umem));
139 	if (wq->sq_buf)
140 		mlx5_free((void *)(uintptr_t)wq->sq_buf);
141 	if (wq->cq)
142 		claim_zero(mlx5_devx_cmd_destroy(wq->cq));
143 	if (wq->cq_umem)
144 		claim_zero(mlx5_glue->devx_umem_dereg(wq->cq_umem));
145 	if (wq->cq_buf)
146 		mlx5_free((void *)(uintptr_t)wq->cq_buf);
147 	memset(wq, 0, sizeof(*wq));
148 }
149 
150 static void
151 mlx5_txpp_destroy_rearm_queue(struct mlx5_dev_ctx_shared *sh)
152 {
153 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
154 
155 	mlx5_txpp_destroy_send_queue(wq);
156 }
157 
158 static void
159 mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
160 {
161 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
162 
163 	mlx5_txpp_destroy_send_queue(wq);
164 	if (sh->txpp.tsa) {
165 		mlx5_free(sh->txpp.tsa);
166 		sh->txpp.tsa = NULL;
167 	}
168 }
169 
170 static void
171 mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared *sh, uint16_t ci)
172 {
173 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
174 	union {
175 		uint32_t w32[2];
176 		uint64_t w64;
177 	} cs;
178 
179 	wq->sq_ci = ci + 1;
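	/*
	 * Compose the 64-bit doorbell value from the first two words of the
	 * control segment of the WQE being rung, patching the WQE index
	 * field in the first word.
	 */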
180 	cs.w32[0] = rte_cpu_to_be_32(rte_be_to_cpu_32
181 		   (wq->wqes[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
182 	cs.w32[1] = wq->wqes[ci & (wq->sq_size - 1)].ctrl[1];
183 	/* Update SQ doorbell record with new SQ ci. */
184 	rte_compiler_barrier();
185 	*wq->sq_dbrec = rte_cpu_to_be_32(wq->sq_ci);
186 	/* Make sure the doorbell record is updated. */
187 	rte_wmb();
188 	/* Write to the doorbell register to start processing. */
189 	__mlx5_uar_write64_relaxed(cs.w64, sh->tx_uar->reg_addr, NULL);
190 	rte_wmb();
191 }
192 
193 static void
194 mlx5_txpp_fill_cqe_rearm_queue(struct mlx5_dev_ctx_shared *sh)
195 {
196 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
197 	struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
198 	uint32_t i;
199 
200 	for (i = 0; i < MLX5_TXPP_REARM_CQ_SIZE; i++) {
201 		cqe->op_own = (MLX5_CQE_INVALID << 4) | MLX5_CQE_OWNER_MASK;
202 		++cqe;
203 	}
204 }
205 
206 static void
207 mlx5_txpp_fill_wqe_rearm_queue(struct mlx5_dev_ctx_shared *sh)
208 {
209 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
210 	struct mlx5_wqe *wqe = (struct mlx5_wqe *)(uintptr_t)wq->wqes;
211 	uint32_t i;
212 
213 	for (i = 0; i < wq->sq_size; i += 2) {
214 		struct mlx5_wqe_cseg *cs;
215 		struct mlx5_wqe_qseg *qs;
216 		uint32_t index;
217 
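		/*
		 * Each iteration builds a SEND_EN/WAIT WQE pair: the SEND_EN
		 * enables Clock Queue WQEs up to one rearm period ahead of
		 * this pair, the WAIT blocks until the Clock Queue CQ index
		 * reaches half a rearm period ahead, so the pairs complete
		 * at the pace of the Clock Queue.
		 */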
218 		/* Build SEND_EN request with slave WQE index. */
219 		cs = &wqe[i + 0].cseg;
220 		cs->opcode = RTE_BE32(MLX5_OPCODE_SEND_EN | 0);
221 		cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) | 2);
222 		cs->flags = RTE_BE32(MLX5_COMP_ALWAYS <<
223 				     MLX5_COMP_MODE_OFFSET);
224 		cs->misc = RTE_BE32(0);
225 		qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
226 		index = (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM) &
227 			((1 << MLX5_WQ_INDEX_WIDTH) - 1);
228 		qs->max_index = rte_cpu_to_be_32(index);
229 		qs->qpn_cqn = rte_cpu_to_be_32(sh->txpp.clock_queue.sq->id);
230 		/* Build WAIT request with slave CQE index. */
231 		cs = &wqe[i + 1].cseg;
232 		cs->opcode = RTE_BE32(MLX5_OPCODE_WAIT | 0);
233 		cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) | 2);
234 		cs->flags = RTE_BE32(MLX5_COMP_ONLY_ERR <<
235 				     MLX5_COMP_MODE_OFFSET);
236 		cs->misc = RTE_BE32(0);
237 		qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
238 		index = (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM / 2) &
239 			((1 << MLX5_CQ_INDEX_WIDTH) - 1);
240 		qs->max_index = rte_cpu_to_be_32(index);
241 		qs->qpn_cqn = rte_cpu_to_be_32(sh->txpp.clock_queue.cq->id);
242 	}
243 }
244 
245 /* Creates the Rearm Queue to fire requests to the Clock Queue in real time. */
246 static int
247 mlx5_txpp_create_rearm_queue(struct mlx5_dev_ctx_shared *sh)
248 {
249 	struct mlx5_devx_create_sq_attr sq_attr = { 0 };
250 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
251 	struct mlx5_devx_cq_attr cq_attr = { 0 };
252 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
253 	size_t page_size;
254 	uint32_t umem_size, umem_dbrec;
255 	int ret;
256 
257 	page_size = rte_mem_page_size();
258 	if (page_size == (size_t)-1) {
259 		DRV_LOG(ERR, "Failed to get mem page size");
260 		return -ENOMEM;
261 	}
262 	/* Allocate memory buffer for CQEs and doorbell record. */
263 	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_REARM_CQ_SIZE;
264 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
265 	umem_size += MLX5_DBR_SIZE;
266 	wq->cq_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
267 				 page_size, sh->numa_node);
268 	if (!wq->cq_buf) {
269 		DRV_LOG(ERR, "Failed to allocate memory for Rearm Queue.");
270 		return -ENOMEM;
271 	}
272 	/* Register allocated buffer in user space with DevX. */
273 	wq->cq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
274 					       (void *)(uintptr_t)wq->cq_buf,
275 					       umem_size,
276 					       IBV_ACCESS_LOCAL_WRITE);
277 	if (!wq->cq_umem) {
278 		rte_errno = errno;
279 		DRV_LOG(ERR, "Failed to register umem for Rearm Queue.");
280 		goto error;
281 	}
282 	/* Create completion queue object for Rearm Queue. */
283 	cq_attr.cqe_size = (sizeof(struct mlx5_cqe) == 128) ?
284 			    MLX5_CQE_SIZE_128B : MLX5_CQE_SIZE_64B;
285 	cq_attr.uar_page_id = sh->tx_uar->page_id;
286 	cq_attr.eqn = sh->txpp.eqn;
287 	cq_attr.q_umem_valid = 1;
288 	cq_attr.q_umem_offset = 0;
289 	cq_attr.q_umem_id = mlx5_os_get_umem_id(wq->cq_umem);
290 	cq_attr.db_umem_valid = 1;
291 	cq_attr.db_umem_offset = umem_dbrec;
292 	cq_attr.db_umem_id = mlx5_os_get_umem_id(wq->cq_umem);
293 	cq_attr.log_cq_size = rte_log2_u32(MLX5_TXPP_REARM_CQ_SIZE);
294 	cq_attr.log_page_size = rte_log2_u32(page_size);
295 	wq->cq = mlx5_devx_cmd_create_cq(sh->ctx, &cq_attr);
296 	if (!wq->cq) {
297 		rte_errno = errno;
298 		DRV_LOG(ERR, "Failed to create CQ for Rearm Queue.");
299 		goto error;
300 	}
301 	wq->cq_dbrec = RTE_PTR_ADD(wq->cq_buf, umem_dbrec);
302 	wq->cq_ci = 0;
303 	wq->arm_sn = 0;
304 	/* Mark all CQEs initially as invalid. */
305 	mlx5_txpp_fill_cqe_rearm_queue(sh);
306 	/*
307 	 * Allocate memory buffer for Send Queue WQEs.
308 	 * There should be no WQE leftovers in the cyclic queue.
309 	 */
310 	wq->sq_size = MLX5_TXPP_REARM_SQ_SIZE;
311 	MLX5_ASSERT(wq->sq_size == (1 << log2above(wq->sq_size)));
312 	umem_size = MLX5_WQE_SIZE * wq->sq_size;
313 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
314 	umem_size += MLX5_DBR_SIZE;
315 	wq->sq_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
316 				 page_size, sh->numa_node);
317 	if (!wq->sq_buf) {
318 		DRV_LOG(ERR, "Failed to allocate memory for Rearm Queue.");
319 		rte_errno = ENOMEM;
320 		goto error;
321 	}
322 	/* Register allocated buffer in user space with DevX. */
323 	wq->sq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
324 					       (void *)(uintptr_t)wq->sq_buf,
325 					       umem_size,
326 					       IBV_ACCESS_LOCAL_WRITE);
327 	if (!wq->sq_umem) {
328 		rte_errno = errno;
329 		DRV_LOG(ERR, "Failed to register umem for Rearm Queue.");
330 		goto error;
331 	}
332 	/* Create send queue object for Rearm Queue. */
333 	sq_attr.state = MLX5_SQC_STATE_RST;
334 	sq_attr.tis_lst_sz = 1;
335 	sq_attr.tis_num = sh->tis->id;
336 	sq_attr.cqn = wq->cq->id;
337 	sq_attr.cd_master = 1;
338 	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
339 	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
340 	sq_attr.wq_attr.pd = sh->pdn;
341 	sq_attr.wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
342 	sq_attr.wq_attr.log_wq_sz = rte_log2_u32(wq->sq_size);
343 	sq_attr.wq_attr.dbr_umem_valid = 1;
344 	sq_attr.wq_attr.dbr_addr = umem_dbrec;
345 	sq_attr.wq_attr.dbr_umem_id = mlx5_os_get_umem_id(wq->sq_umem);
346 	sq_attr.wq_attr.wq_umem_valid = 1;
347 	sq_attr.wq_attr.wq_umem_id = mlx5_os_get_umem_id(wq->sq_umem);
348 	sq_attr.wq_attr.wq_umem_offset = 0;
349 	wq->sq = mlx5_devx_cmd_create_sq(sh->ctx, &sq_attr);
350 	if (!wq->sq) {
351 		rte_errno = errno;
352 		DRV_LOG(ERR, "Failed to create SQ for Rearm Queue.");
353 		goto error;
354 	}
355 	wq->sq_dbrec = RTE_PTR_ADD(wq->sq_buf, umem_dbrec +
356 				   MLX5_SND_DBR * sizeof(uint32_t));
357 	/* Build the WQEs in the Send Queue before going to the Ready state. */
358 	mlx5_txpp_fill_wqe_rearm_queue(sh);
359 	/* Change queue state to ready. */
360 	msq_attr.sq_state = MLX5_SQC_STATE_RST;
361 	msq_attr.state = MLX5_SQC_STATE_RDY;
362 	ret = mlx5_devx_cmd_modify_sq(wq->sq, &msq_attr);
363 	if (ret) {
364 		DRV_LOG(ERR, "Failed to set SQ ready state Rearm Queue.");
365 		goto error;
366 	}
367 	return 0;
368 error:
369 	ret = -rte_errno;
370 	mlx5_txpp_destroy_rearm_queue(sh);
371 	rte_errno = -ret;
372 	return ret;
373 }
374 
375 static void
376 mlx5_txpp_fill_wqe_clock_queue(struct mlx5_dev_ctx_shared *sh)
377 {
378 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
379 	struct mlx5_wqe *wqe = (struct mlx5_wqe *)(uintptr_t)wq->wqes;
380 	struct mlx5_wqe_cseg *cs = &wqe->cseg;
381 	uint32_t wqe_size, opcode, i;
382 	uint8_t *dst;
383 
384 	/* For test purposes fill the WQ with SEND inline packet. */
385 	if (sh->txpp.test) {
386 		wqe_size = RTE_ALIGN(MLX5_TXPP_TEST_PKT_SIZE +
387 				     MLX5_WQE_CSEG_SIZE +
388 				     2 * MLX5_WQE_ESEG_SIZE -
389 				     MLX5_ESEG_MIN_INLINE_SIZE,
390 				     MLX5_WSEG_SIZE);
391 		opcode = MLX5_OPCODE_SEND;
392 	} else {
393 		wqe_size = MLX5_WSEG_SIZE;
394 		opcode = MLX5_OPCODE_NOP;
395 	}
396 	cs->opcode = rte_cpu_to_be_32(opcode | 0); /* Index is ignored. */
397 	cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) |
398 				     (wqe_size / MLX5_WSEG_SIZE));
399 	cs->flags = RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET);
400 	cs->misc = RTE_BE32(0);
401 	wqe_size = RTE_ALIGN(wqe_size, MLX5_WQE_SIZE);
402 	if (sh->txpp.test) {
403 		struct mlx5_wqe_eseg *es = &wqe->eseg;
404 		struct rte_ether_hdr *eth_hdr;
405 		struct rte_ipv4_hdr *ip_hdr;
406 		struct rte_udp_hdr *udp_hdr;
407 
408 		/* Build the inline test packet pattern. */
409 		MLX5_ASSERT(wqe_size <= MLX5_WQE_SIZE_MAX);
410 		MLX5_ASSERT(MLX5_TXPP_TEST_PKT_SIZE >=
411 				(sizeof(struct rte_ether_hdr) +
412 				 sizeof(struct rte_ipv4_hdr)));
413 		es->flags = 0;
414 		es->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
415 		es->swp_offs = 0;
416 		es->metadata = 0;
417 		es->swp_flags = 0;
418 		es->mss = 0;
419 		es->inline_hdr_sz = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE);
420 		/* Build test packet L2 header (Ethernet). */
421 		dst = (uint8_t *)&es->inline_data;
422 		eth_hdr = (struct rte_ether_hdr *)dst;
423 		rte_eth_random_addr(&eth_hdr->d_addr.addr_bytes[0]);
424 		rte_eth_random_addr(&eth_hdr->s_addr.addr_bytes[0]);
425 		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
426 		/* Build test packet L3 header (IP v4). */
427 		dst += sizeof(struct rte_ether_hdr);
428 		ip_hdr = (struct rte_ipv4_hdr *)dst;
429 		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
430 		ip_hdr->type_of_service = 0;
431 		ip_hdr->fragment_offset = 0;
432 		ip_hdr->time_to_live = 64;
433 		ip_hdr->next_proto_id = IPPROTO_UDP;
434 		ip_hdr->packet_id = 0;
435 		ip_hdr->total_length = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE -
436 						sizeof(struct rte_ether_hdr));
437 		/* Use RFC 5735 / RFC 2544 reserved network test addresses. */
438 		ip_hdr->src_addr = RTE_BE32((198U << 24) | (18 << 16) |
439 					    (0 << 8) | 1);
440 		ip_hdr->dst_addr = RTE_BE32((198U << 24) | (18 << 16) |
441 					    (0 << 8) | 2);
442 		if (MLX5_TXPP_TEST_PKT_SIZE <
443 					(sizeof(struct rte_ether_hdr) +
444 					 sizeof(struct rte_ipv4_hdr) +
445 					 sizeof(struct rte_udp_hdr)))
446 			goto wcopy;
447 		/* Build test packet L4 header (UDP). */
448 		dst += sizeof(struct rte_ipv4_hdr);
449 		udp_hdr = (struct rte_udp_hdr *)dst;
450 		udp_hdr->src_port = RTE_BE16(9); /* RFC863 Discard. */
451 		udp_hdr->dst_port = RTE_BE16(9);
452 		udp_hdr->dgram_len = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE -
453 					      sizeof(struct rte_ether_hdr) -
454 					      sizeof(struct rte_ipv4_hdr));
455 		udp_hdr->dgram_cksum = 0;
456 		/* Fill the test packet data. */
457 		dst += sizeof(struct rte_udp_hdr);
458 		for (i = sizeof(struct rte_ether_hdr) +
459 			sizeof(struct rte_ipv4_hdr) +
460 			sizeof(struct rte_udp_hdr);
461 				i < MLX5_TXPP_TEST_PKT_SIZE; i++)
462 			*dst++ = (uint8_t)(i & 0xFF);
463 	}
464 wcopy:
465 	/* Duplicate the pattern to the next WQEs. */
466 	dst = (uint8_t *)(uintptr_t)wq->sq_buf;
467 	for (i = 1; i < MLX5_TXPP_CLKQ_SIZE; i++) {
468 		dst += wqe_size;
469 		rte_memcpy(dst, (void *)(uintptr_t)wq->sq_buf, wqe_size);
470 	}
471 }
472 
473 /* Creates the Clock Queue for packet pacing, returns zero on success. */
474 static int
475 mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
476 {
477 	struct mlx5_devx_create_sq_attr sq_attr = { 0 };
478 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
479 	struct mlx5_devx_cq_attr cq_attr = { 0 };
480 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
481 	size_t page_size;
482 	uint32_t umem_size, umem_dbrec;
483 	int ret;
484 
485 	page_size = rte_mem_page_size();
486 	if (page_size == (size_t)-1) {
487 		DRV_LOG(ERR, "Failed to get mem page size");
488 		return -ENOMEM;
489 	}
490 	sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
491 				   MLX5_TXPP_REARM_SQ_SIZE *
492 				   sizeof(struct mlx5_txpp_ts),
493 				   0, sh->numa_node);
494 	if (!sh->txpp.tsa) {
495 		DRV_LOG(ERR, "Failed to allocate memory for CQ stats.");
496 		return -ENOMEM;
497 	}
498 	sh->txpp.ts_p = 0;
499 	sh->txpp.ts_n = 0;
500 	/* Allocate memory buffer for CQEs and doorbell record. */
501 	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
502 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
503 	umem_size += MLX5_DBR_SIZE;
504 	wq->cq_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
505 					page_size, sh->numa_node);
506 	if (!wq->cq_buf) {
507 		DRV_LOG(ERR, "Failed to allocate memory for Clock Queue.");
508 		return -ENOMEM;
509 	}
510 	/* Register allocated buffer in user space with DevX. */
511 	wq->cq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
512 					       (void *)(uintptr_t)wq->cq_buf,
513 					       umem_size,
514 					       IBV_ACCESS_LOCAL_WRITE);
515 	if (!wq->cq_umem) {
516 		rte_errno = errno;
517 		DRV_LOG(ERR, "Failed to register umem for Clock Queue.");
518 		goto error;
519 	}
520 	/* Create completion queue object for Clock Queue. */
521 	cq_attr.cqe_size = (sizeof(struct mlx5_cqe) == 128) ?
522 			    MLX5_CQE_SIZE_128B : MLX5_CQE_SIZE_64B;
523 	cq_attr.use_first_only = 1;
524 	cq_attr.overrun_ignore = 1;
525 	cq_attr.uar_page_id = sh->tx_uar->page_id;
526 	cq_attr.eqn = sh->txpp.eqn;
527 	cq_attr.q_umem_valid = 1;
528 	cq_attr.q_umem_offset = 0;
529 	cq_attr.q_umem_id = wq->cq_umem->umem_id;
530 	cq_attr.db_umem_valid = 1;
531 	cq_attr.db_umem_offset = umem_dbrec;
532 	cq_attr.db_umem_id = wq->cq_umem->umem_id;
533 	cq_attr.log_cq_size = rte_log2_u32(MLX5_TXPP_CLKQ_SIZE);
534 	cq_attr.log_page_size = rte_log2_u32(page_size);
535 	wq->cq = mlx5_devx_cmd_create_cq(sh->ctx, &cq_attr);
536 	if (!wq->cq) {
537 		rte_errno = errno;
538 		DRV_LOG(ERR, "Failed to create CQ for Clock Queue.");
539 		goto error;
540 	}
541 	wq->cq_dbrec = RTE_PTR_ADD(wq->cq_buf, umem_dbrec);
542 	wq->cq_ci = 0;
543 	/* Allocate memory buffer for Send Queue WQEs. */
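	/*
	 * In test mode every Clock Queue entry carries a multi-WQEBB SEND
	 * with the inline test packet, so the SQ size is scaled by the
	 * number of WQEBBs per entry.
	 */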
544 	if (sh->txpp.test) {
545 		wq->sq_size = RTE_ALIGN(MLX5_TXPP_TEST_PKT_SIZE +
546 					MLX5_WQE_CSEG_SIZE +
547 					2 * MLX5_WQE_ESEG_SIZE -
548 					MLX5_ESEG_MIN_INLINE_SIZE,
549 					MLX5_WQE_SIZE) / MLX5_WQE_SIZE;
550 		wq->sq_size *= MLX5_TXPP_CLKQ_SIZE;
551 	} else {
552 		wq->sq_size = MLX5_TXPP_CLKQ_SIZE;
553 	}
554 	/* There should not be WQE leftovers in the cyclic queue. */
555 	MLX5_ASSERT(wq->sq_size == (1 << log2above(wq->sq_size)));
556 	umem_size = MLX5_WQE_SIZE * wq->sq_size;
557 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
558 	umem_size += MLX5_DBR_SIZE;
559 	wq->sq_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
560 				 page_size, sh->numa_node);
561 	if (!wq->sq_buf) {
562 		DRV_LOG(ERR, "Failed to allocate memory for Clock Queue.");
563 		rte_errno = ENOMEM;
564 		goto error;
565 	}
566 	/* Register allocated buffer in user space with DevX. */
567 	wq->sq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
568 					       (void *)(uintptr_t)wq->sq_buf,
569 					       umem_size,
570 					       IBV_ACCESS_LOCAL_WRITE);
571 	if (!wq->sq_umem) {
572 		rte_errno = errno;
573 		DRV_LOG(ERR, "Failed to register umem for Clock Queue.");
574 		goto error;
575 	}
576 	/* Create send queue object for Clock Queue. */
577 	if (sh->txpp.test) {
578 		sq_attr.tis_lst_sz = 1;
579 		sq_attr.tis_num = sh->tis->id;
580 		sq_attr.non_wire = 0;
581 		sq_attr.static_sq_wq = 1;
582 	} else {
583 		sq_attr.non_wire = 1;
584 		sq_attr.static_sq_wq = 1;
585 	}
586 	sq_attr.state = MLX5_SQC_STATE_RST;
587 	sq_attr.cqn = wq->cq->id;
588 	sq_attr.packet_pacing_rate_limit_index = sh->txpp.pp_id;
589 	sq_attr.wq_attr.cd_slave = 1;
590 	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
591 	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
592 	sq_attr.wq_attr.pd = sh->pdn;
593 	sq_attr.wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
594 	sq_attr.wq_attr.log_wq_sz = rte_log2_u32(wq->sq_size);
595 	sq_attr.wq_attr.dbr_umem_valid = 1;
596 	sq_attr.wq_attr.dbr_addr = umem_dbrec;
597 	sq_attr.wq_attr.dbr_umem_id = wq->sq_umem->umem_id;
598 	sq_attr.wq_attr.wq_umem_valid = 1;
599 	sq_attr.wq_attr.wq_umem_id = wq->sq_umem->umem_id;
600 	/* umem_offset must be zero for static_sq_wq queue. */
601 	sq_attr.wq_attr.wq_umem_offset = 0;
602 	wq->sq = mlx5_devx_cmd_create_sq(sh->ctx, &sq_attr);
603 	if (!wq->sq) {
604 		rte_errno = errno;
605 		DRV_LOG(ERR, "Failed to create SQ for Clock Queue.");
606 		goto error;
607 	}
608 	wq->sq_dbrec = RTE_PTR_ADD(wq->sq_buf, umem_dbrec +
609 				   MLX5_SND_DBR * sizeof(uint32_t));
610 	/* Build the WQEs in the Send Queue before going to the Ready state. */
611 	mlx5_txpp_fill_wqe_clock_queue(sh);
612 	/* Change queue state to ready. */
613 	msq_attr.sq_state = MLX5_SQC_STATE_RST;
614 	msq_attr.state = MLX5_SQC_STATE_RDY;
615 	wq->sq_ci = 0;
616 	ret = mlx5_devx_cmd_modify_sq(wq->sq, &msq_attr);
617 	if (ret) {
618 		DRV_LOG(ERR, "Failed to set SQ ready state Clock Queue.");
619 		goto error;
620 	}
621 	return 0;
622 error:
623 	ret = -rte_errno;
624 	mlx5_txpp_destroy_clock_queue(sh);
625 	rte_errno = -ret;
626 	return ret;
627 }
628 
629 /* Enable notification from the Rearm Queue CQ. */
630 static inline void
631 mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
632 {
633 	struct mlx5_txpp_wq *aq = &sh->txpp.rearm_queue;
634 	uint32_t arm_sn = aq->arm_sn << MLX5_CQ_SQN_OFFSET;
635 	uint32_t db_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | aq->cq_ci;
636 	uint64_t db_be = rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq->id);
637 	uint32_t *addr = RTE_PTR_ADD(sh->tx_uar->base_addr, MLX5_CQ_DOORBELL);
638 
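	/* Update the arm doorbell record first, then ring the CQ doorbell. */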
639 	rte_compiler_barrier();
640 	aq->cq_dbrec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(db_hi);
641 	rte_wmb();
642 #ifdef RTE_ARCH_64
643 	*(uint64_t *)addr = db_be;
644 #else
645 	*(uint32_t *)addr = db_be;
646 	rte_io_wmb();
647 	*((uint32_t *)addr + 1) = db_be >> 32;
648 #endif
649 	aq->arm_sn++;
650 }
651 
652 static inline void
653 mlx5_atomic_read_cqe(rte_int128_t *from, rte_int128_t *ts)
654 {
655 	/*
656 	 * The only CQE of the Clock Queue is continuously
657 	 * updated by the hardware at the specified rate. We have to
658 	 * read the timestamp and WQE completion index atomically.
659 	 */
660 #if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
661 	rte_int128_t src;
662 
663 	memset(&src, 0, sizeof(src));
664 	*ts = src;
665 	/* if (*from == *ts) *from = *src else *ts = *from; */
666 	rte_atomic128_cmp_exchange(from, ts, &src, 0,
667 				   __ATOMIC_RELAXED, __ATOMIC_RELAXED);
668 #else
669 	rte_atomic64_t *cqe = (rte_atomic64_t *)from;
670 
671 	/* Power architecture does not support 16B compare-and-swap. */
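	/*
	 * Read both 64-bit halves, then re-read and retry until neither half
	 * has changed in between, giving a consistent 128-bit snapshot.
	 */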
672 	for (;;) {
673 		int64_t tm, op;
674 		int64_t *ps;
675 
676 		rte_compiler_barrier();
677 		tm = rte_atomic64_read(cqe + 0);
678 		op = rte_atomic64_read(cqe + 1);
679 		rte_compiler_barrier();
680 		if (tm != rte_atomic64_read(cqe + 0))
681 			continue;
682 		if (op != rte_atomic64_read(cqe + 1))
683 			continue;
684 		ps = (int64_t *)ts;
685 		ps[0] = tm;
686 		ps[1] = op;
687 		return;
688 	}
689 #endif
690 }
691 
692 /* Stores timestamp in the cache structure to share data with datapath. */
693 static inline void
694 mlx5_txpp_cache_timestamp(struct mlx5_dev_ctx_shared *sh,
695 			   uint64_t ts, uint64_t ci)
696 {
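	/*
	 * Pack the completion index into the upper MLX5_CQ_INDEX_WIDTH bits
	 * of ci and keep the lower bits of the timestamp in the rest, so a
	 * reader can detect a torn update of the two 64-bit fields.
	 */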
697 	ci = ci << (64 - MLX5_CQ_INDEX_WIDTH);
698 	ci |= (ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
699 	rte_compiler_barrier();
700 	rte_atomic64_set(&sh->txpp.ts.ts, ts);
701 	rte_atomic64_set(&sh->txpp.ts.ci_ts, ci);
702 	rte_wmb();
703 }
704 
705 /* Reads timestamp from Clock Queue CQE and stores in the cache. */
706 static inline void
707 mlx5_txpp_update_timestamp(struct mlx5_dev_ctx_shared *sh)
708 {
709 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
710 	struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
711 	union {
712 		rte_int128_t u128;
713 		struct mlx5_cqe_ts cts;
714 	} to;
715 	uint64_t ts;
716 	uint16_t ci;
717 
718 	static_assert(sizeof(struct mlx5_cqe_ts) == sizeof(rte_int128_t),
719 		      "Wrong timestamp CQE part size");
720 	mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
721 	if (to.cts.op_own >> 4) {
722 		DRV_LOG(DEBUG, "Clock Queue error sync lost.");
723 		rte_atomic32_inc(&sh->txpp.err_clock_queue);
724 		sh->txpp.sync_lost = 1;
725 		return;
726 	}
727 	ci = rte_be_to_cpu_16(to.cts.wqe_counter);
728 	ts = rte_be_to_cpu_64(to.cts.timestamp);
729 	ts = mlx5_txpp_convert_rx_ts(sh, ts);
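	/* Advance cq_ci by the number of completions seen since the last read. */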
730 	wq->cq_ci += (ci - wq->sq_ci) & UINT16_MAX;
731 	wq->sq_ci = ci;
732 	mlx5_txpp_cache_timestamp(sh, ts, wq->cq_ci);
733 }
734 
735 /* Waits for the first completion on Clock Queue to init timestamp. */
736 static inline void
737 mlx5_txpp_init_timestamp(struct mlx5_dev_ctx_shared *sh)
738 {
739 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
740 	uint32_t wait;
741 
742 	sh->txpp.ts_p = 0;
743 	sh->txpp.ts_n = 0;
744 	for (wait = 0; wait < MLX5_TXPP_WAIT_INIT_TS; wait++) {
745 		struct timespec onems;
746 
747 		mlx5_txpp_update_timestamp(sh);
748 		if (wq->sq_ci)
749 			return;
750 		/* Wait one millisecond and try again. */
751 		onems.tv_sec = 0;
752 		onems.tv_nsec = NS_PER_S / MS_PER_S;
753 		nanosleep(&onems, 0);
754 	}
755 	DRV_LOG(ERR, "Unable to initialize timestamp.");
756 	sh->txpp.sync_lost = 1;
757 }
758 
759 #ifdef HAVE_IBV_DEVX_EVENT
760 /* Gather statistics for timestamp from Clock Queue CQE. */
761 static inline void
762 mlx5_txpp_gather_timestamp(struct mlx5_dev_ctx_shared *sh)
763 {
764 	/* Check whether we have a valid timestamp. */
765 	if (!sh->txpp.clock_queue.sq_ci && !sh->txpp.ts_n)
766 		return;
767 	MLX5_ASSERT(sh->txpp.ts_p < MLX5_TXPP_REARM_SQ_SIZE);
768 	sh->txpp.tsa[sh->txpp.ts_p] = sh->txpp.ts;
769 	if (++sh->txpp.ts_p >= MLX5_TXPP_REARM_SQ_SIZE)
770 		sh->txpp.ts_p = 0;
771 	if (sh->txpp.ts_n < MLX5_TXPP_REARM_SQ_SIZE)
772 		++sh->txpp.ts_n;
773 }
774 
775 /* Handles Rearm Queue completions in periodic service. */
776 static __rte_always_inline void
777 mlx5_txpp_handle_rearm_queue(struct mlx5_dev_ctx_shared *sh)
778 {
779 	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
780 	uint32_t cq_ci = wq->cq_ci;
781 	bool error = false;
782 	int ret;
783 
784 	do {
785 		volatile struct mlx5_cqe *cqe;
786 
787 		cqe = &wq->cqes[cq_ci & (MLX5_TXPP_REARM_CQ_SIZE - 1)];
788 		ret = check_cqe(cqe, MLX5_TXPP_REARM_CQ_SIZE, cq_ci);
789 		switch (ret) {
790 		case MLX5_CQE_STATUS_ERR:
791 			error = true;
792 			++cq_ci;
793 			break;
794 		case MLX5_CQE_STATUS_SW_OWN:
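			/* Each completion covers one SEND_EN/WAIT WQE pair. */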
795 			wq->sq_ci += 2;
796 			++cq_ci;
797 			break;
798 		case MLX5_CQE_STATUS_HW_OWN:
799 			break;
800 		default:
801 			MLX5_ASSERT(false);
802 			break;
803 		}
804 	} while (ret != MLX5_CQE_STATUS_HW_OWN);
805 	if (likely(cq_ci != wq->cq_ci)) {
806 		/* Check whether we have missed interrupts. */
807 		if (cq_ci - wq->cq_ci != 1) {
808 			DRV_LOG(DEBUG, "Rearm Queue missed interrupt.");
809 			rte_atomic32_inc(&sh->txpp.err_miss_int);
810 			/* Check sync lost on wqe index. */
811 			if (cq_ci - wq->cq_ci >=
812 				(((1UL << MLX5_WQ_INDEX_WIDTH) /
813 				  MLX5_TXPP_REARM) - 1))
814 				error = 1;
815 		}
816 		/* Update doorbell record to notify hardware. */
817 		rte_compiler_barrier();
818 		*wq->cq_dbrec = rte_cpu_to_be_32(cq_ci);
819 		rte_wmb();
820 		wq->cq_ci = cq_ci;
821 		/* Fire new requests to Rearm Queue. */
822 		if (error) {
823 			DRV_LOG(DEBUG, "Rearm Queue error sync lost.");
824 			rte_atomic32_inc(&sh->txpp.err_rearm_queue);
825 			sh->txpp.sync_lost = 1;
826 		}
827 	}
828 }
829 
830 /* Handles Clock Queue completions in periodic service. */
831 static __rte_always_inline void
832 mlx5_txpp_handle_clock_queue(struct mlx5_dev_ctx_shared *sh)
833 {
834 	mlx5_txpp_update_timestamp(sh);
835 	mlx5_txpp_gather_timestamp(sh);
836 }
837 #endif
838 
839 /* Invoked periodically on Rearm Queue completions. */
840 void
841 mlx5_txpp_interrupt_handler(void *cb_arg)
842 {
843 #ifndef HAVE_IBV_DEVX_EVENT
844 	RTE_SET_USED(cb_arg);
845 	return;
846 #else
847 	struct mlx5_dev_ctx_shared *sh = cb_arg;
848 	union {
849 		struct mlx5dv_devx_async_event_hdr event_resp;
850 		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
851 	} out;
852 
853 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
854 	/* Process events in the loop. Only rearm completions are expected. */
855 	while (mlx5_glue->devx_get_event
856 				(sh->txpp.echan,
857 				 &out.event_resp,
858 				 sizeof(out.buf)) >=
859 				 (ssize_t)sizeof(out.event_resp.cookie)) {
860 		mlx5_txpp_handle_rearm_queue(sh);
861 		mlx5_txpp_handle_clock_queue(sh);
862 		mlx5_txpp_cq_arm(sh);
863 		mlx5_txpp_doorbell_rearm_queue
864 					(sh, sh->txpp.rearm_queue.sq_ci - 1);
865 	}
866 #endif /* HAVE_IBV_DEVX_EVENT */
867 }
868 
869 static void
870 mlx5_txpp_stop_service(struct mlx5_dev_ctx_shared *sh)
871 {
872 	if (!sh->txpp.intr_handle.fd)
873 		return;
874 	mlx5_intr_callback_unregister(&sh->txpp.intr_handle,
875 				      mlx5_txpp_interrupt_handler, sh);
876 	sh->txpp.intr_handle.fd = 0;
877 }
878 
879 /* Attaches the interrupt handler and fires the first request to the Rearm Queue. */
880 static int
881 mlx5_txpp_start_service(struct mlx5_dev_ctx_shared *sh)
882 {
883 	uint16_t event_nums[1] = {0};
884 	int flags;
885 	int ret;
886 
887 	rte_atomic32_set(&sh->txpp.err_miss_int, 0);
888 	rte_atomic32_set(&sh->txpp.err_rearm_queue, 0);
889 	rte_atomic32_set(&sh->txpp.err_clock_queue, 0);
890 	rte_atomic32_set(&sh->txpp.err_ts_past, 0);
891 	rte_atomic32_set(&sh->txpp.err_ts_future, 0);
892 	/* Attach interrupt handler to process Rearm Queue completions. */
893 	flags = fcntl(sh->txpp.echan->fd, F_GETFL);
894 	ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
895 	if (ret) {
896 		DRV_LOG(ERR, "Failed to change event channel FD.");
897 		rte_errno = errno;
898 		return -rte_errno;
899 	}
900 	memset(&sh->txpp.intr_handle, 0, sizeof(sh->txpp.intr_handle));
901 	sh->txpp.intr_handle.fd = sh->txpp.echan->fd;
902 	sh->txpp.intr_handle.type = RTE_INTR_HANDLE_EXT;
903 	if (rte_intr_callback_register(&sh->txpp.intr_handle,
904 				       mlx5_txpp_interrupt_handler, sh)) {
905 		sh->txpp.intr_handle.fd = 0;
906 		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
907 		return -rte_errno;
908 	}
909 	/* Subscribe CQ event to the event channel controlled by the driver. */
910 	ret = mlx5_glue->devx_subscribe_devx_event(sh->txpp.echan,
911 						   sh->txpp.rearm_queue.cq->obj,
912 						   sizeof(event_nums),
913 						   event_nums, 0);
914 	if (ret) {
915 		DRV_LOG(ERR, "Failed to subscribe CQE event.");
916 		rte_errno = errno;
917 		return -errno;
918 	}
919 	/* Enable interrupts in the CQ. */
920 	mlx5_txpp_cq_arm(sh);
921 	/* Fire the first request on Rearm Queue. */
922 	mlx5_txpp_doorbell_rearm_queue(sh, sh->txpp.rearm_queue.sq_size - 1);
923 	mlx5_txpp_init_timestamp(sh);
924 	return 0;
925 }
926 
927 /*
928  * The routine initializes the packet pacing infrastructure:
929  * - allocates the packet pacing context
930  * - creates the Clock CQ/SQ
931  * - creates the Rearm CQ/SQ
932  * - attaches the Rearm interrupt handler
933  * - starts the Clock Queue
934  *
935  * Returns 0 on success, negative errno value otherwise.
936  */
937 static int
938 mlx5_txpp_create(struct mlx5_dev_ctx_shared *sh, struct mlx5_priv *priv)
939 {
940 	int tx_pp = priv->config.tx_pp;
941 	int ret;
942 
943 	/* Store the requested pacing parameters. */
944 	sh->txpp.tick = tx_pp >= 0 ? tx_pp : -tx_pp;
945 	sh->txpp.test = !!(tx_pp < 0);
946 	sh->txpp.skew = priv->config.tx_skew;
947 	sh->txpp.freq = priv->config.hca_attr.dev_freq_khz;
948 	ret = mlx5_txpp_create_eqn(sh);
949 	if (ret)
950 		goto exit;
951 	ret = mlx5_txpp_alloc_pp_index(sh);
952 	if (ret)
953 		goto exit;
954 	ret = mlx5_txpp_create_clock_queue(sh);
955 	if (ret)
956 		goto exit;
957 	ret = mlx5_txpp_create_rearm_queue(sh);
958 	if (ret)
959 		goto exit;
960 	ret = mlx5_txpp_start_service(sh);
961 	if (ret)
962 		goto exit;
963 exit:
964 	if (ret) {
965 		mlx5_txpp_stop_service(sh);
966 		mlx5_txpp_destroy_rearm_queue(sh);
967 		mlx5_txpp_destroy_clock_queue(sh);
968 		mlx5_txpp_free_pp_index(sh);
969 		mlx5_txpp_destroy_eqn(sh);
970 		sh->txpp.tick = 0;
971 		sh->txpp.test = 0;
972 		sh->txpp.skew = 0;
973 	}
974 	return ret;
975 }
976 
977 /*
978  * The routine destroys the packet pacing infrastructure:
979  * - detaches the Rearm interrupt handler
980  * - destroys the Rearm CQ/SQ
981  * - destroys the Clock CQ/SQ
982  * - frees the packet pacing context
983  */
984 static void
985 mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
986 {
987 	mlx5_txpp_stop_service(sh);
988 	mlx5_txpp_destroy_rearm_queue(sh);
989 	mlx5_txpp_destroy_clock_queue(sh);
990 	mlx5_txpp_free_pp_index(sh);
991 	mlx5_txpp_destroy_eqn(sh);
992 	sh->txpp.tick = 0;
993 	sh->txpp.test = 0;
994 	sh->txpp.skew = 0;
995 }
996 
997 /**
998  * Creates and starts packet pacing infrastructure on specified device.
999  *
1000  * @param dev
1001  *   Pointer to Ethernet device structure.
1002  *
1003  * @return
1004  *   0 on success, a negative errno value otherwise and rte_errno is set.
1005  */
1006 int
1007 mlx5_txpp_start(struct rte_eth_dev *dev)
1008 {
1009 	struct mlx5_priv *priv = dev->data->dev_private;
1010 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1011 	int err = 0;
1012 	int ret;
1013 
1014 	if (!priv->config.tx_pp) {
1015 		/* Packet pacing is not requested for the device. */
1016 		MLX5_ASSERT(priv->txpp_en == 0);
1017 		return 0;
1018 	}
1019 	if (priv->txpp_en) {
1020 		/* Packet pacing is already enabled for the device. */
1021 		MLX5_ASSERT(sh->txpp.refcnt);
1022 		return 0;
1023 	}
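	/*
	 * Scheduling on timestamps requires the application to register the
	 * dynamic Tx timestamp flag; if it is not registered yet, there is
	 * nothing to schedule and pacing is not enabled.
	 */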
1024 	if (priv->config.tx_pp > 0) {
1025 		ret = rte_mbuf_dynflag_lookup
1026 				(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
1027 		if (ret < 0)
1028 			return 0;
1029 	}
1030 	ret = pthread_mutex_lock(&sh->txpp.mutex);
1031 	MLX5_ASSERT(!ret);
1032 	RTE_SET_USED(ret);
1033 	if (sh->txpp.refcnt) {
1034 		priv->txpp_en = 1;
1035 		++sh->txpp.refcnt;
1036 	} else {
1037 		err = mlx5_txpp_create(sh, priv);
1038 		if (!err) {
1039 			MLX5_ASSERT(sh->txpp.tick);
1040 			priv->txpp_en = 1;
1041 			sh->txpp.refcnt = 1;
1042 		} else {
1043 			rte_errno = -err;
1044 		}
1045 	}
1046 	ret = pthread_mutex_unlock(&sh->txpp.mutex);
1047 	MLX5_ASSERT(!ret);
1048 	RTE_SET_USED(ret);
1049 	return err;
1050 }
1051 
1052 /**
1053  * Stops and destroys packet pacing infrastructure on specified device.
1054  *
1055  * @param dev
1056  *   Pointer to Ethernet device structure.
1060  */
1061 void
1062 mlx5_txpp_stop(struct rte_eth_dev *dev)
1063 {
1064 	struct mlx5_priv *priv = dev->data->dev_private;
1065 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1066 	int ret;
1067 
1068 	if (!priv->txpp_en) {
1069 		/* Packet pacing is already disabled for the device. */
1070 		return;
1071 	}
1072 	priv->txpp_en = 0;
1073 	ret = pthread_mutex_lock(&sh->txpp.mutex);
1074 	MLX5_ASSERT(!ret);
1075 	RTE_SET_USED(ret);
1076 	MLX5_ASSERT(sh->txpp.refcnt);
1077 	if (!sh->txpp.refcnt || --sh->txpp.refcnt)
1078 		return;
1079 	/* No references any more, do actual destroy. */
1080 	mlx5_txpp_destroy(sh);
1081 	ret = pthread_mutex_unlock(&sh->txpp.mutex);
1082 	MLX5_ASSERT(!ret);
1083 	RTE_SET_USED(ret);
1084 }
1085 
1086 /*
1087  * Read the current clock counter of an Ethernet device
1088  *
1089  * This returns the current raw clock value of an Ethernet device. It is
1090  * a raw count of ticks, with no given time reference.
1091  * The value returned here is from the same clock as the one
1092  * filling the timestamp field of Rx/Tx packets when using the hardware
1093  * timestamp offload. Therefore it can be used to compute a precise
1094  * conversion of the device clock to real time.
1095  *
1096  * @param dev
1097  *   Pointer to Ethernet device structure.
1098  * @param timestamp
1099  *   Pointer to the uint64_t that receives the raw clock value.
1100  *
1101  * @return
1102  *   - 0: Success.
1103  *   - -ENOTSUP: The function is not supported in this mode. Requires
1104  *   - -ENOTSUP: The function is not supported in this mode. Requires
1105  *     the packet pacing module to be configured and started (tx_pp devarg).
1106 int
1107 mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
1108 {
1109 	struct mlx5_priv *priv = dev->data->dev_private;
1110 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1111 	int ret;
1112 
1113 	if (sh->txpp.refcnt) {
1114 		struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
1115 		struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
1116 		union {
1117 			rte_int128_t u128;
1118 			struct mlx5_cqe_ts cts;
1119 		} to;
1120 		uint64_t ts;
1121 
1122 		mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
1123 		if (to.cts.op_own >> 4) {
1124 			DRV_LOG(DEBUG, "Clock Queue error sync lost.");
1125 			rte_atomic32_inc(&sh->txpp.err_clock_queue);
1126 			sh->txpp.sync_lost = 1;
1127 			return -EIO;
1128 		}
1129 		ts = rte_be_to_cpu_64(to.cts.timestamp);
1130 		ts = mlx5_txpp_convert_rx_ts(sh, ts);
1131 		*timestamp = ts;
1132 		return 0;
1133 	}
1134 	/* Not supported in isolated mode - kernel does not see the CQEs. */
1135 	if (priv->isolated || rte_eal_process_type() != RTE_PROC_PRIMARY)
1136 		return -ENOTSUP;
1137 	ret = mlx5_read_clock(dev, timestamp);
1138 	return ret;
1139 }
1140 
1141 /**
1142  * DPDK callback to clear device extended statistics.
1143  *
1144  * @param dev
1145  *   Pointer to Ethernet device structure.
1146  *
1147  * @return
1148  *   0 on success and stats is reset, negative errno value otherwise and
1149  *   rte_errno is set.
1150  */
1151 int mlx5_txpp_xstats_reset(struct rte_eth_dev *dev)
1152 {
1153 	struct mlx5_priv *priv = dev->data->dev_private;
1154 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1155 
1156 	rte_atomic32_set(&sh->txpp.err_miss_int, 0);
1157 	rte_atomic32_set(&sh->txpp.err_rearm_queue, 0);
1158 	rte_atomic32_set(&sh->txpp.err_clock_queue, 0);
1159 	rte_atomic32_set(&sh->txpp.err_ts_past, 0);
1160 	rte_atomic32_set(&sh->txpp.err_ts_future, 0);
1161 	return 0;
1162 }
1163 
1164 /**
1165  * Routine to retrieve names of extended device statistics
1166  * for packet send scheduling. It appends the specific stats names
1167  * after the parts filled by preceding modules (eth stats, etc.)
1168  *
1169  * @param dev
1170  *   Pointer to Ethernet device structure.
1171  * @param[out] xstats_names
1172  *   Buffer to insert names into.
1173  * @param n
1174  *   Number of names.
1175  * @param n_used
1176  *   Number of names filled by preceding statistics modules.
1177  *
1178  * @return
1179  *   Number of xstats names.
1180  */
1181 int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
1182 			       struct rte_eth_xstat_name *xstats_names,
1183 			       unsigned int n, unsigned int n_used)
1184 {
1185 	unsigned int n_txpp = RTE_DIM(mlx5_txpp_stat_names);
1186 	unsigned int i;
1187 
1188 	if (n >= n_used + n_txpp && xstats_names) {
1189 		for (i = 0; i < n_txpp; ++i) {
1190 			strncpy(xstats_names[i + n_used].name,
1191 				mlx5_txpp_stat_names[i],
1192 				RTE_ETH_XSTATS_NAME_SIZE);
1193 			xstats_names[i + n_used].name
1194 					[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0;
1195 		}
1196 	}
1197 	return n_used + n_txpp;
1198 }
1199 
1200 static inline void
1201 mlx5_txpp_read_tsa(struct mlx5_dev_txpp *txpp,
1202 		   struct mlx5_txpp_ts *tsa, uint16_t idx)
1203 {
1204 	do {
1205 		int64_t ts, ci;
1206 
1207 		ts = rte_atomic64_read(&txpp->tsa[idx].ts);
1208 		ci = rte_atomic64_read(&txpp->tsa[idx].ci_ts);
1209 		rte_compiler_barrier();
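		/*
		 * The lower bits of ci_ts duplicate the timestamp; a mismatch
		 * means the writer updated only one of the two words.
		 */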
1210 		if ((ci ^ ts) << MLX5_CQ_INDEX_WIDTH != 0)
1211 			continue;
1212 		if (rte_atomic64_read(&txpp->tsa[idx].ts) != ts)
1213 			continue;
1214 		if (rte_atomic64_read(&txpp->tsa[idx].ci_ts) != ci)
1215 			continue;
1216 		rte_atomic64_set(&tsa->ts, ts);
1217 		rte_atomic64_set(&tsa->ci_ts, ci);
1218 		return;
1219 	} while (true);
1220 }
1221 
1222 /*
1223  * Jitter reflects the clock change between
1224  * neighbours Clock Queue completions.
1225  * neighbouring Clock Queue completions.
1226 static uint64_t
1227 mlx5_txpp_xstats_jitter(struct mlx5_dev_txpp *txpp)
1228 {
1229 	struct mlx5_txpp_ts tsa0, tsa1;
1230 	int64_t dts, dci;
1231 	uint16_t ts_p;
1232 
1233 	if (txpp->ts_n < 2) {
1234 		/* Not enough reports gathered yet. */
1235 		return 0;
1236 	}
1237 	do {
1238 		int ts_0, ts_1;
1239 
1240 		ts_p = txpp->ts_p;
1241 		rte_compiler_barrier();
1242 		ts_0 = ts_p - 2;
1243 		if (ts_0 < 0)
1244 			ts_0 += MLX5_TXPP_REARM_SQ_SIZE;
1245 		ts_1 = ts_p - 1;
1246 		if (ts_1 < 0)
1247 			ts_1 += MLX5_TXPP_REARM_SQ_SIZE;
1248 		mlx5_txpp_read_tsa(txpp, &tsa0, ts_0);
1249 		mlx5_txpp_read_tsa(txpp, &tsa1, ts_1);
1250 		rte_compiler_barrier();
1251 	} while (ts_p != txpp->ts_p);
1252 	/* We have two neighbor reports, calculate the jitter. */
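	/*
	 * dts is the measured timestamp delta, dci * tick is the delta
	 * expected from the completion index difference; the jitter is
	 * the absolute difference of the two.
	 */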
1253 	dts = rte_atomic64_read(&tsa1.ts) - rte_atomic64_read(&tsa0.ts);
1254 	dci = (rte_atomic64_read(&tsa1.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH)) -
1255 	      (rte_atomic64_read(&tsa0.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH));
1256 	if (dci < 0)
1257 		dci += 1 << MLX5_CQ_INDEX_WIDTH;
1258 	dci *= txpp->tick;
1259 	return (dts > dci) ? dts - dci : dci - dts;
1260 }
1261 
1262 /*
1263  * Wander reflects the long-term clock change
1264  * over the entire length of all Clock Queue completions.
1265  */
1266 static uint64_t
1267 mlx5_txpp_xstats_wander(struct mlx5_dev_txpp *txpp)
1268 {
1269 	struct mlx5_txpp_ts tsa0, tsa1;
1270 	int64_t dts, dci;
1271 	uint16_t ts_p;
1272 
1273 	if (txpp->ts_n < MLX5_TXPP_REARM_SQ_SIZE) {
1274 		/* Not enough reports gathered yet. */
1275 		return 0;
1276 	}
1277 	do {
1278 		int ts_0, ts_1;
1279 
1280 		ts_p = txpp->ts_p;
1281 		rte_compiler_barrier();
1282 		ts_0 = ts_p - MLX5_TXPP_REARM_SQ_SIZE / 2 - 1;
1283 		if (ts_0 < 0)
1284 			ts_0 += MLX5_TXPP_REARM_SQ_SIZE;
1285 		ts_1 = ts_p - 1;
1286 		if (ts_1 < 0)
1287 			ts_1 += MLX5_TXPP_REARM_SQ_SIZE;
1288 		mlx5_txpp_read_tsa(txpp, &tsa0, ts_0);
1289 		mlx5_txpp_read_tsa(txpp, &tsa1, ts_1);
1290 		rte_compiler_barrier();
1291 	} while (ts_p != txpp->ts_p);
1292 	/* We have two reports half the queue apart, calculate the wander. */
1293 	dts = rte_atomic64_read(&tsa1.ts) - rte_atomic64_read(&tsa0.ts);
1294 	dci = (rte_atomic64_read(&tsa1.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH)) -
1295 	      (rte_atomic64_read(&tsa0.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH));
1296 	dci += 1 << MLX5_CQ_INDEX_WIDTH;
1297 	dci *= txpp->tick;
1298 	return (dts > dci) ? dts - dci : dci - dts;
1299 }
1300 
1301 /**
1302  * Routine to retrieve extended device statistics
1303  * for packet send scheduling. It appends the specific statistics
1304  * after the parts filled by preceding modules (eth stats, etc.)
1305  *
1306  * @param dev
1307  *   Pointer to Ethernet device.
1308  * @param[out] stats
1309  *   Pointer to rte extended stats table.
1310  * @param n
1311  *   The size of the stats table.
1312  * @param n_used
1313  *   Number of stats filled by preceding statistics modules.
1314  *
1315  * @return
1316  *   Number of extended stats on success and stats is filled,
1317  *   negative on error and rte_errno is set.
1318  */
1319 int
1320 mlx5_txpp_xstats_get(struct rte_eth_dev *dev,
1321 		     struct rte_eth_xstat *stats,
1322 		     unsigned int n, unsigned int n_used)
1323 {
1324 	unsigned int n_txpp = RTE_DIM(mlx5_txpp_stat_names);
1325 
1326 	if (n >= n_used + n_txpp && stats) {
1327 		struct mlx5_priv *priv = dev->data->dev_private;
1328 		struct mlx5_dev_ctx_shared *sh = priv->sh;
1329 		unsigned int i;
1330 
1331 		for (i = 0; i < n_txpp; ++i)
1332 			stats[n_used + i].id = n_used + i;
1333 		stats[n_used + 0].value =
1334 				rte_atomic32_read(&sh->txpp.err_miss_int);
1335 		stats[n_used + 1].value =
1336 				rte_atomic32_read(&sh->txpp.err_rearm_queue);
1337 		stats[n_used + 2].value =
1338 				rte_atomic32_read(&sh->txpp.err_clock_queue);
1339 		stats[n_used + 3].value =
1340 				rte_atomic32_read(&sh->txpp.err_ts_past);
1341 		stats[n_used + 4].value =
1342 				rte_atomic32_read(&sh->txpp.err_ts_future);
1343 		stats[n_used + 5].value = mlx5_txpp_xstats_jitter(&sh->txpp);
1344 		stats[n_used + 6].value = mlx5_txpp_xstats_wander(&sh->txpp);
1345 		stats[n_used + 7].value = sh->txpp.sync_lost;
1346 	}
1347 	return n_used + n_txpp;
1348 }
1349