xref: /dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c (revision 68a03efeed657e6e05f281479b33b51102797e15)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2019 Mellanox Technologies, Ltd
3  */
4 #include <unistd.h>
5 #include <stdint.h>
6 #include <sched.h>
7 #include <fcntl.h>
8 #include <sys/eventfd.h>
9 
10 #include <rte_malloc.h>
11 #include <rte_memory.h>
12 #include <rte_errno.h>
13 #include <rte_lcore.h>
14 #include <rte_atomic.h>
15 #include <rte_common.h>
16 #include <rte_io.h>
17 #include <rte_alarm.h>
18 
19 #include <mlx5_common.h>
20 #include <mlx5_common_os.h>
21 #include <mlx5_common_devx.h>
22 #include <mlx5_glue.h>
23 
24 #include "mlx5_vdpa_utils.h"
25 #include "mlx5_vdpa.h"
26 
27 
28 #define MLX5_VDPA_ERROR_TIME_SEC 3u
29 
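/*
 * Release the global event resources: the UAR used for CQ doorbells and
 * the DEVX event channel (after draining any events still pending on it).
 */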
30 void
31 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
32 {
33 	if (priv->uar) {
34 		mlx5_glue->devx_free_uar(priv->uar);
35 		priv->uar = NULL;
36 	}
37 #ifdef HAVE_IBV_DEVX_EVENT
38 	if (priv->eventc) {
39 		union {
40 			struct mlx5dv_devx_async_event_hdr event_resp;
41 			uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr)
42 									 + 128];
43 		} out;
44 
45 		/* Clean all pending events. */
46 		while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp,
47 		       sizeof(out.buf)) >=
48 		       (ssize_t)sizeof(out.event_resp.cookie))
49 			;
50 		mlx5_os_devx_destroy_event_channel(priv->eventc);
51 		priv->eventc = NULL;
52 	}
53 #endif
54 }
55 
56 /* Prepare all the global resources for all the event objects. */
57 static int
58 mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv *priv)
59 {
60 	int flags, ret;
61 
62 	if (priv->eventc)
63 		return 0;
64 	priv->eventc = mlx5_os_devx_create_event_channel(priv->ctx,
65 			   MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
66 	if (!priv->eventc) {
67 		rte_errno = errno;
68 		DRV_LOG(ERR, "Failed to create event channel %d.",
69 			rte_errno);
70 		goto error;
71 	}
72 	flags = fcntl(priv->eventc->fd, F_GETFL);
73 	ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK);
74 	if (ret) {
75 		DRV_LOG(ERR, "Failed to set event channel FD to non-blocking.");
76 		goto error;
77 	}
78 	/*
79 	 * This PMD always issues a write memory barrier before writing
80 	 * to UAR registers, so it is safe to allocate the UAR with any
81 	 * memory mapping type.
82 	 */
83 	priv->uar = mlx5_devx_alloc_uar(priv->ctx, -1);
84 	if (!priv->uar) {
85 		rte_errno = errno;
86 		DRV_LOG(ERR, "Failed to allocate UAR.");
87 		goto error;
88 	}
89 	return 0;
90 error:
91 	mlx5_vdpa_event_qp_global_release(priv);
92 	return -1;
93 }
94 
95 static void
96 mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq)
97 {
98 	mlx5_devx_cq_destroy(&cq->cq_obj);
99 	memset(cq, 0, sizeof(*cq));
100 }
101 
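/*
 * Arm the CQ for the next event: write the arm sequence number and the CQ
 * consumer index to the CQ arm doorbell record, then ring the UAR CQ
 * doorbell (a single 64-bit write on 64-bit hosts, two ordered 32-bit
 * writes otherwise).
 */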
102 static inline void __rte_unused
103 mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq)
104 {
105 	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
106 	uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK;
107 	uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
108 	uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq_obj.cq->id;
109 	uint64_t db_be = rte_cpu_to_be_64(doorbell);
110 	uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr, MLX5_CQ_DOORBELL);
111 
112 	rte_io_wmb();
113 	cq->cq_obj.db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
114 	rte_wmb();
115 #ifdef RTE_ARCH_64
116 	*(uint64_t *)addr = db_be;
117 #else
118 	*(uint32_t *)addr = db_be;
119 	rte_io_wmb();
120 	*((uint32_t *)addr + 1) = db_be >> 32;
121 #endif
122 	cq->arm_sn++;
123 	cq->armed = 1;
124 }
125 
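/*
 * Create a collapsed CQ (only the first CQE is used), subscribe it to the
 * driver event channel, mark the first CQE as HW-owned and arm the CQ for
 * the first completion event.
 */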
126 static int
127 mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
128 		    int callfd, struct mlx5_vdpa_cq *cq)
129 {
130 	struct mlx5_devx_cq_attr attr = {
131 		.use_first_only = 1,
132 		.uar_page_id = priv->uar->page_id,
133 	};
134 	uint16_t event_nums[1] = {0};
135 	int ret;
136 
137 	ret = mlx5_devx_cq_create(priv->ctx, &cq->cq_obj, log_desc_n, &attr,
138 				  SOCKET_ID_ANY);
139 	if (ret)
140 		goto error;
141 	cq->cq_ci = 0;
142 	cq->log_desc_n = log_desc_n;
143 	rte_spinlock_init(&cq->sl);
144 	/* Subscribe to CQ events on the event channel controlled by the driver. */
145 	ret = mlx5_os_devx_subscribe_devx_event(priv->eventc,
146 						cq->cq_obj.cq->obj,
147 						sizeof(event_nums), event_nums,
148 						(uint64_t)(uintptr_t)cq);
149 	if (ret) {
150 		DRV_LOG(ERR, "Failed to subscribe CQE event.");
151 		rte_errno = errno;
152 		goto error;
153 	}
154 	cq->callfd = callfd;
155 	/* Set the first CQE to all ones so that HW owns the CQ at the start. */
156 	cq->cq_obj.cqes[0].op_own = MLX5_CQE_OWNER_MASK;
157 	cq->cq_obj.cqes[0].wqe_counter = rte_cpu_to_be_16(UINT16_MAX);
158 	/* First arming. */
159 	mlx5_vdpa_cq_arm(priv, cq);
160 	return 0;
161 error:
162 	mlx5_vdpa_cq_destroy(cq);
163 	return -1;
164 }
165 
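/*
 * Poll the collapsed CQ: the number of new completions is derived from the
 * WQE counter reported in the single CQE. When completions are found,
 * update the CQ consumer index and ring both the CQ and the SW QP doorbell
 * records so more RQ WQEs become available to HW.
 */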
166 static inline uint32_t
167 mlx5_vdpa_cq_poll(struct mlx5_vdpa_cq *cq)
168 {
169 	struct mlx5_vdpa_event_qp *eqp =
170 				container_of(cq, struct mlx5_vdpa_event_qp, cq);
171 	const unsigned int cq_size = 1 << cq->log_desc_n;
172 	union {
173 		struct {
174 			uint16_t wqe_counter;
175 			uint8_t rsvd5;
176 			uint8_t op_own;
177 		};
178 		uint32_t word;
179 	} last_word;
180 	uint16_t next_wqe_counter = cq->cq_ci;
181 	uint16_t cur_wqe_counter;
182 	uint16_t comp;
183 
184 	last_word.word = rte_read32(&cq->cq_obj.cqes[0].wqe_counter);
185 	cur_wqe_counter = rte_be_to_cpu_16(last_word.wqe_counter);
186 	comp = cur_wqe_counter + (uint16_t)1 - next_wqe_counter;
187 	if (comp) {
188 		cq->cq_ci += comp;
189 		MLX5_ASSERT(MLX5_CQE_OPCODE(last_word.op_own) !=
190 			    MLX5_CQE_INVALID);
191 		if (unlikely(!(MLX5_CQE_OPCODE(last_word.op_own) ==
192 			       MLX5_CQE_RESP_ERR ||
193 			       MLX5_CQE_OPCODE(last_word.op_own) ==
194 			       MLX5_CQE_REQ_ERR)))
195 			cq->errors++;
196 		rte_io_wmb();
197 		/* Ring CQ doorbell record. */
198 		cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
199 		rte_io_wmb();
200 		/* Ring SW QP doorbell record. */
201 		eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cq_size);
202 	}
203 	return comp;
204 }
205 
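/* Arm all the created CQs that are not currently armed. */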
206 static void
207 mlx5_vdpa_arm_all_cqs(struct mlx5_vdpa_priv *priv)
208 {
209 	struct mlx5_vdpa_cq *cq;
210 	int i;
211 
212 	for (i = 0; i < priv->nr_virtqs; i++) {
213 		cq = &priv->virtqs[i].eqp.cq;
214 		if (cq->cq_obj.cq && !cq->armed)
215 			mlx5_vdpa_cq_arm(priv, cq);
216 	}
217 }
218 
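/*
 * Sleep between polling iterations. In dynamic timer mode the delay grows
 * by event_us when no completion was found and is divided by the
 * completion count otherwise; with a zero delay the CPU is yielded instead.
 */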
219 static void
220 mlx5_vdpa_timer_sleep(struct mlx5_vdpa_priv *priv, uint32_t max)
221 {
222 	if (priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER) {
223 		switch (max) {
224 		case 0:
225 			priv->timer_delay_us += priv->event_us;
226 			break;
227 		case 1:
228 			break;
229 		default:
230 			priv->timer_delay_us /= max;
231 			break;
232 		}
233 	}
234 	if (priv->timer_delay_us)
235 		usleep(priv->timer_delay_us);
236 	else
237 		/* Give up the CPU to improve polling thread scheduling. */
238 		sched_yield();
239 }
240 
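/*
 * Dedicated polling thread routine: wait until the timer is turned on,
 * then poll all the virtq CQs in a loop. When no traffic is seen for
 * no_traffic_time_s seconds, arm all the CQs, switch back to interrupt
 * mode and block until the interrupt handler turns the timer on again.
 */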
241 static void *
242 mlx5_vdpa_poll_handle(void *arg)
243 {
244 	struct mlx5_vdpa_priv *priv = arg;
245 	int i;
246 	struct mlx5_vdpa_cq *cq;
247 	uint32_t max;
248 	uint64_t current_tic;
249 
250 	pthread_mutex_lock(&priv->timer_lock);
251 	while (!priv->timer_on)
252 		pthread_cond_wait(&priv->timer_cond, &priv->timer_lock);
253 	pthread_mutex_unlock(&priv->timer_lock);
254 	priv->timer_delay_us = priv->event_mode ==
255 					    MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER ?
256 					      MLX5_VDPA_DEFAULT_TIMER_DELAY_US :
257 								 priv->event_us;
258 	while (1) {
259 		max = 0;
260 		pthread_mutex_lock(&priv->vq_config_lock);
261 		for (i = 0; i < priv->nr_virtqs; i++) {
262 			cq = &priv->virtqs[i].eqp.cq;
263 			if (cq->cq_obj.cq && !cq->armed) {
264 				uint32_t comp = mlx5_vdpa_cq_poll(cq);
265 
266 				if (comp) {
267 					/* Notify guest of consumed descriptors. */
268 					if (cq->callfd != -1)
269 						eventfd_write(cq->callfd,
270 							      (eventfd_t)1);
271 					if (comp > max)
272 						max = comp;
273 				}
274 			}
275 		}
276 		current_tic = rte_rdtsc();
277 		if (!max) {
278 			/* No traffic? Stop the timer and switch to interrupts. */
279 			if (current_tic - priv->last_traffic_tic >=
280 			    rte_get_timer_hz() * priv->no_traffic_time_s) {
281 				DRV_LOG(DEBUG, "Device %s traffic was stopped.",
282 					priv->vdev->device->name);
283 				mlx5_vdpa_arm_all_cqs(priv);
284 				pthread_mutex_unlock(&priv->vq_config_lock);
285 				pthread_mutex_lock(&priv->timer_lock);
286 				priv->timer_on = 0;
287 				while (!priv->timer_on)
288 					pthread_cond_wait(&priv->timer_cond,
289 							  &priv->timer_lock);
290 				pthread_mutex_unlock(&priv->timer_lock);
291 				priv->timer_delay_us = priv->event_mode ==
292 					    MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER ?
293 					      MLX5_VDPA_DEFAULT_TIMER_DELAY_US :
294 								 priv->event_us;
295 				continue;
296 			}
297 		} else {
298 			priv->last_traffic_tic = current_tic;
299 		}
300 		pthread_mutex_unlock(&priv->vq_config_lock);
301 		mlx5_vdpa_timer_sleep(priv, max);
302 	}
303 	return NULL;
304 }
305 
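/*
 * CQ event channel interrupt handler: drain the pending CQ events, poll
 * each reported CQ and notify the guest through the callfd eventfd. Unless
 * running in interrupt-only mode, the CQ is not re-armed; it is handed
 * over to the polling thread, which is woken up if needed.
 */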
306 static void
307 mlx5_vdpa_interrupt_handler(void *cb_arg)
308 {
309 	struct mlx5_vdpa_priv *priv = cb_arg;
310 #ifdef HAVE_IBV_DEVX_EVENT
311 	union {
312 		struct mlx5dv_devx_async_event_hdr event_resp;
313 		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
314 	} out;
315 
316 	pthread_mutex_lock(&priv->vq_config_lock);
317 	while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp,
318 					 sizeof(out.buf)) >=
319 				       (ssize_t)sizeof(out.event_resp.cookie)) {
320 		struct mlx5_vdpa_cq *cq = (struct mlx5_vdpa_cq *)
321 					       (uintptr_t)out.event_resp.cookie;
322 		struct mlx5_vdpa_event_qp *eqp = container_of(cq,
323 						 struct mlx5_vdpa_event_qp, cq);
324 		struct mlx5_vdpa_virtq *virtq = container_of(eqp,
325 						   struct mlx5_vdpa_virtq, eqp);
326 
327 		if (!virtq->enable)
328 			continue;
329 		mlx5_vdpa_cq_poll(cq);
330 		/* Notify the guest of consumed descriptors. */
331 		if (cq->callfd != -1)
332 			eventfd_write(cq->callfd, (eventfd_t)1);
333 		if (priv->event_mode == MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT) {
334 			mlx5_vdpa_cq_arm(priv, cq);
335 			pthread_mutex_unlock(&priv->vq_config_lock);
336 			return;
337 		}
338 		/* Don't arm again - timer will take control. */
339 		DRV_LOG(DEBUG, "Device %s virtq %d cq %d event was captured."
340 			" Timer is %s, cq ci is %u.",
341 			priv->vdev->device->name,
342 			(int)virtq->index, cq->cq_obj.cq->id,
343 			priv->timer_on ? "on" : "off", cq->cq_ci);
344 		cq->armed = 0;
345 	}
346 #endif
347 
348 	/* Traffic detected: make sure timer is on. */
349 	priv->last_traffic_tic = rte_rdtsc();
350 	pthread_mutex_lock(&priv->timer_lock);
351 	if (!priv->timer_on) {
352 		priv->timer_on = 1;
353 		pthread_cond_signal(&priv->timer_cond);
354 	}
355 	pthread_mutex_unlock(&priv->timer_lock);
356 	pthread_mutex_unlock(&priv->vq_config_lock);
357 }
358 
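/*
 * Error event channel interrupt handler: for each error event, validate
 * the virtq index and version, query the error information, then disable
 * and re-enable the virtq unless too many errors were seen within the last
 * MLX5_VDPA_ERROR_TIME_SEC seconds, and record the error timestamp.
 */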
359 static void
360 mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
361 {
362 #ifdef HAVE_IBV_DEVX_EVENT
363 	struct mlx5_vdpa_priv *priv = cb_arg;
364 	union {
365 		struct mlx5dv_devx_async_event_hdr event_resp;
366 		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
367 	} out;
368 	uint32_t vq_index, i, version;
369 	struct mlx5_vdpa_virtq *virtq;
370 	uint64_t sec;
371 
372 	pthread_mutex_lock(&priv->vq_config_lock);
373 	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
374 					 sizeof(out.buf)) >=
375 				       (ssize_t)sizeof(out.event_resp.cookie)) {
376 		vq_index = out.event_resp.cookie & UINT32_MAX;
377 		version = out.event_resp.cookie >> 32;
378 		if (vq_index >= priv->nr_virtqs) {
379 			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
380 				priv->vdev->device->name, vq_index);
381 			continue;
382 		}
383 		virtq = &priv->virtqs[vq_index];
384 		if (!virtq->enable || virtq->version != version)
385 			continue;
386 		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
387 			continue;
388 		virtq->stopped = true;
389 		/* Query error info. */
390 		if (mlx5_vdpa_virtq_query(priv, vq_index))
391 			goto log;
392 		/* Disable vq. */
393 		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
394 			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
395 			goto log;
396 		}
397 		/* Retry if the error happened fewer than N times within 3 seconds. */
398 		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
399 		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
400 			/* Retry. */
401 			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
402 				DRV_LOG(ERR, "Failed to enable virtq %d.",
403 					vq_index);
404 			else
405 				DRV_LOG(WARNING, "Recover virtq %d: %u.",
406 					vq_index, ++virtq->n_retry);
407 		} else {
408 			/* Too many recent errors, give up on recovery. */
409 			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
410 				priv->vdev->device->name, vq_index);
411 		}
412 log:
413 		/* Shift the error time log and append the current time at its end. */
414 		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
415 			virtq->err_time[i - 1] = virtq->err_time[i];
416 		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
417 	}
418 	pthread_mutex_unlock(&priv->vq_config_lock);
419 #endif
420 }
421 
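/*
 * Create the device error event channel, make its FD non-blocking and
 * register mlx5_vdpa_err_interrupt_handler on it through the EAL
 * interrupt API.
 */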
422 int
423 mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
424 {
425 	int ret;
426 	int flags;
427 
428 	/* Setup device event channel. */
429 	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
430 	if (!priv->err_chnl) {
431 		rte_errno = errno;
432 		DRV_LOG(ERR, "Failed to create device event channel %d.",
433 			rte_errno);
434 		goto error;
435 	}
436 	flags = fcntl(priv->err_chnl->fd, F_GETFL);
437 	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
438 	if (ret) {
439 		DRV_LOG(ERR, "Failed to set device event channel FD to non-blocking.");
440 		goto error;
441 	}
442 	priv->err_intr_handle.fd = priv->err_chnl->fd;
443 	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
444 	if (rte_intr_callback_register(&priv->err_intr_handle,
445 				       mlx5_vdpa_err_interrupt_handler,
446 				       priv)) {
447 		priv->err_intr_handle.fd = 0;
448 		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
449 			priv->vid);
450 		goto error;
451 	} else {
452 		DRV_LOG(DEBUG, "Registered error interrupt for device %d.",
453 			priv->vid);
454 	}
455 	return 0;
456 error:
457 	mlx5_vdpa_err_event_unset(priv);
458 	return -1;
459 }
460 
461 void
462 mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
463 {
464 	int retries = MLX5_VDPA_INTR_RETRIES;
465 	int ret = -EAGAIN;
466 
467 	if (!priv->err_intr_handle.fd)
468 		return;
469 	while (retries-- && ret == -EAGAIN) {
470 		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
471 					    mlx5_vdpa_err_interrupt_handler,
472 					    priv);
473 		if (ret == -EAGAIN) {
474 			DRV_LOG(DEBUG, "Try again to unregister fd %d "
475 				"of error interrupt, retries = %d.",
476 				priv->err_intr_handle.fd, retries);
477 			rte_pause();
478 		}
479 	}
480 	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
481 	if (priv->err_chnl) {
482 #ifdef HAVE_IBV_DEVX_EVENT
483 		union {
484 			struct mlx5dv_devx_async_event_hdr event_resp;
485 			uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) +
486 				    128];
487 		} out;
488 
489 		/* Clean all pending events. */
490 		while (mlx5_glue->devx_get_event(priv->err_chnl,
491 		       &out.event_resp, sizeof(out.buf)) >=
492 		       (ssize_t)sizeof(out.event_resp.cookie))
493 			;
494 #endif
495 		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
496 		priv->err_chnl = NULL;
497 	}
498 }
499 
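/*
 * Set up completion event handling: unless running in interrupt-only mode,
 * spawn the dedicated polling thread with SCHED_RR priority and pin it to
 * the configured event core (or to the main lcore CPU set), then register
 * the CQ event channel FD in the EAL interrupt handler.
 */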
500 int
501 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
502 {
503 	int ret;
504 	rte_cpuset_t cpuset;
505 	pthread_attr_t attr;
506 	char name[16];
507 	const struct sched_param sp = {
508 		.sched_priority = sched_get_priority_max(SCHED_RR),
509 	};
510 
511 	if (!priv->eventc)
512 		/* All virtqs are in poll mode. */
513 		return 0;
514 	if (priv->event_mode != MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT) {
515 		pthread_mutex_init(&priv->timer_lock, NULL);
516 		pthread_cond_init(&priv->timer_cond, NULL);
517 		priv->timer_on = 0;
518 		pthread_attr_init(&attr);
519 		ret = pthread_attr_setschedpolicy(&attr, SCHED_RR);
520 		if (ret) {
521 			DRV_LOG(ERR, "Failed to set thread sched policy = RR.");
522 			return -1;
523 		}
524 		ret = pthread_attr_setschedparam(&attr, &sp);
525 		if (ret) {
526 			DRV_LOG(ERR, "Failed to set thread priority.");
527 			return -1;
528 		}
529 		ret = pthread_create(&priv->timer_tid, &attr,
530 				     mlx5_vdpa_poll_handle, (void *)priv);
531 		if (ret) {
532 			DRV_LOG(ERR, "Failed to create timer thread.");
533 			return -1;
534 		}
535 		CPU_ZERO(&cpuset);
536 		if (priv->event_core != -1)
537 			CPU_SET(priv->event_core, &cpuset);
538 		else
539 			cpuset = rte_lcore_cpuset(rte_get_main_lcore());
540 		ret = pthread_setaffinity_np(priv->timer_tid,
541 					     sizeof(cpuset), &cpuset);
542 		if (ret) {
543 			DRV_LOG(ERR, "Failed to set thread affinity.");
544 			goto error;
545 		}
546 		snprintf(name, sizeof(name), "vDPA-mlx5-%d", priv->vid);
547 		ret = pthread_setname_np(priv->timer_tid, name);
548 		if (ret) {
549 			DRV_LOG(ERR, "Failed to set timer thread name.");
550 			return -1;
551 		}
552 	}
553 	priv->intr_handle.fd = priv->eventc->fd;
554 	priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
555 	if (rte_intr_callback_register(&priv->intr_handle,
556 				       mlx5_vdpa_interrupt_handler, priv)) {
557 		priv->intr_handle.fd = 0;
558 		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
559 		goto error;
560 	}
561 	return 0;
562 error:
563 	mlx5_vdpa_cqe_event_unset(priv);
564 	return -1;
565 }
566 
567 void
568 mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv)
569 {
570 	int retries = MLX5_VDPA_INTR_RETRIES;
571 	int ret = -EAGAIN;
572 	void *status;
573 
574 	if (priv->intr_handle.fd) {
575 		while (retries-- && ret == -EAGAIN) {
576 			ret = rte_intr_callback_unregister(&priv->intr_handle,
577 						    mlx5_vdpa_interrupt_handler,
578 						    priv);
579 			if (ret == -EAGAIN) {
580 				DRV_LOG(DEBUG, "Try again to unregister fd %d "
581 					"of CQ interrupt, retries = %d.",
582 					priv->intr_handle.fd, retries);
583 				rte_pause();
584 			}
585 		}
586 		memset(&priv->intr_handle, 0, sizeof(priv->intr_handle));
587 	}
588 	if (priv->timer_tid) {
589 		pthread_cancel(priv->timer_tid);
590 		pthread_join(priv->timer_tid, &status);
591 	}
592 	priv->timer_tid = 0;
593 }
594 
595 void
596 mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp)
597 {
598 	if (eqp->sw_qp)
599 		claim_zero(mlx5_devx_cmd_destroy(eqp->sw_qp));
600 	if (eqp->umem_obj)
601 		claim_zero(mlx5_glue->devx_umem_dereg(eqp->umem_obj));
602 	if (eqp->umem_buf)
603 		rte_free(eqp->umem_buf);
604 	if (eqp->fw_qp)
605 		claim_zero(mlx5_devx_cmd_destroy(eqp->fw_qp));
606 	mlx5_vdpa_cq_destroy(&eqp->cq);
607 	memset(eqp, 0, sizeof(*eqp));
608 }
609 
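/*
 * Move the FW QP and the SW QP, which are connected back to back, through
 * the RST->INIT->RTR->RTS state transitions so the pair can start
 * processing WQEs.
 */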
610 static int
611 mlx5_vdpa_qps2rts(struct mlx5_vdpa_event_qp *eqp)
612 {
613 	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RST2INIT_QP,
614 					  eqp->sw_qp->id)) {
615 		DRV_LOG(ERR, "Failed to modify FW QP to INIT state(%u).",
616 			rte_errno);
617 		return -1;
618 	}
619 	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RST2INIT_QP,
620 					  eqp->fw_qp->id)) {
621 		DRV_LOG(ERR, "Failed to modify SW QP to INIT state(%u).",
622 			rte_errno);
623 		return -1;
624 	}
625 	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_INIT2RTR_QP,
626 					  eqp->sw_qp->id)) {
627 		DRV_LOG(ERR, "Failed to modify FW QP to RTR state(%u).",
628 			rte_errno);
629 		return -1;
630 	}
631 	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_INIT2RTR_QP,
632 					  eqp->fw_qp->id)) {
633 		DRV_LOG(ERR, "Failed to modify SW QP to RTR state(%u).",
634 			rte_errno);
635 		return -1;
636 	}
637 	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RTR2RTS_QP,
638 					  eqp->sw_qp->id)) {
639 		DRV_LOG(ERR, "Failed to modify FW QP to RTS state(%u).",
640 			rte_errno);
641 		return -1;
642 	}
643 	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RTR2RTS_QP,
644 					  eqp->fw_qp->id)) {
645 		DRV_LOG(ERR, "Failed to modify SW QP to RTS state(%u).",
646 			rte_errno);
647 		return -1;
648 	}
649 	return 0;
650 }
651 
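/*
 * Create an event QP pair for a virtq: a CQ, a FW QP and a SW QP whose RQ
 * and doorbell record live in a registered umem buffer. The QPs are moved
 * to RTS and the RQ doorbell is rung once so all its WQEs are owned by HW.
 */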
652 int
653 mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
654 			  int callfd, struct mlx5_vdpa_event_qp *eqp)
655 {
656 	struct mlx5_devx_qp_attr attr = {0};
657 	uint16_t log_desc_n = rte_log2_u32(desc_n);
658 	uint32_t umem_size = (1 << log_desc_n) * MLX5_WSEG_SIZE +
659 						       sizeof(*eqp->db_rec) * 2;
660 
661 	if (mlx5_vdpa_event_qp_global_prepare(priv))
662 		return -1;
663 	if (mlx5_vdpa_cq_create(priv, log_desc_n, callfd, &eqp->cq))
664 		return -1;
665 	attr.pd = priv->pdn;
666 	attr.ts_format = mlx5_ts_format_conv(priv->qp_ts_format);
667 	eqp->fw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
668 	if (!eqp->fw_qp) {
669 		DRV_LOG(ERR, "Failed to create FW QP(%u).", rte_errno);
670 		goto error;
671 	}
672 	eqp->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
673 	if (!eqp->umem_buf) {
674 		DRV_LOG(ERR, "Failed to allocate memory for SW QP.");
675 		rte_errno = ENOMEM;
676 		goto error;
677 	}
678 	eqp->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
679 					       (void *)(uintptr_t)eqp->umem_buf,
680 					       umem_size,
681 					       IBV_ACCESS_LOCAL_WRITE);
682 	if (!eqp->umem_obj) {
683 		DRV_LOG(ERR, "Failed to register umem for SW QP.");
684 		goto error;
685 	}
686 	attr.uar_index = priv->uar->page_id;
687 	attr.cqn = eqp->cq.cq_obj.cq->id;
688 	attr.log_page_size = rte_log2_u32(sysconf(_SC_PAGESIZE));
689 	attr.rq_size = 1 << log_desc_n;
690 	attr.log_rq_stride = rte_log2_u32(MLX5_WSEG_SIZE);
691 	attr.sq_size = 0; /* No SQ is needed. */
692 	attr.dbr_umem_valid = 1;
693 	attr.wq_umem_id = eqp->umem_obj->umem_id;
694 	attr.wq_umem_offset = 0;
695 	attr.dbr_umem_id = eqp->umem_obj->umem_id;
696 	attr.dbr_address = (1 << log_desc_n) * MLX5_WSEG_SIZE;
697 	attr.ts_format = mlx5_ts_format_conv(priv->qp_ts_format);
698 	eqp->sw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
699 	if (!eqp->sw_qp) {
700 		DRV_LOG(ERR, "Failed to create SW QP(%u).", rte_errno);
701 		goto error;
702 	}
703 	eqp->db_rec = RTE_PTR_ADD(eqp->umem_buf, (uintptr_t)attr.dbr_address);
704 	if (mlx5_vdpa_qps2rts(eqp))
705 		goto error;
706 	/* First doorbell ring: make all the RQ WQEs available to HW. */
707 	rte_write32(rte_cpu_to_be_32(1 << log_desc_n), &eqp->db_rec[0]);
708 	return 0;
709 error:
710 	mlx5_vdpa_event_qp_destroy(eqp);
711 	return -1;
712 }
713