xref: /dpdk/lib/eal/linux/eal_interrupts.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21 
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37 
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41 
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44 
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46 
47 /**
48  * union for pipe fds.
49  */
50 union intr_pipefds{
51 	struct {
52 		int pipefd[2];
53 	};
54 	struct {
55 		int readfd;
56 		int writefd;
57 	};
58 };
59 
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64 	int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66 	uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68 	uint64_t timerfd_num;            /* for timerfd */
69 	char charbuf[16];                /* for others */
70 };
71 
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74 
75 struct rte_intr_callback {
76 	TAILQ_ENTRY(rte_intr_callback) next;
77 	rte_intr_callback_fn cb_fn;  /**< callback address */
78 	void *cb_arg;                /**< parameter for callback */
79 	uint8_t pending_delete;      /**< delete after callback is called */
80 	rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82 
83 struct rte_intr_source {
84 	TAILQ_ENTRY(rte_intr_source) next;
85 	struct rte_intr_handle intr_handle; /**< interrupt handle */
86 	struct rte_intr_cb_list callbacks;  /**< user callbacks */
87 	uint32_t active;                    /**< non-zero while callbacks are being executed */
88 };
89 
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92 
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95 
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98 
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101 
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104 
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108 			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
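
/*
 * Each irq_set_buf below holds a struct vfio_irq_set header followed by the
 * eventfd(s) placed in its flexible data[] member; that is the layout the
 * VFIO_DEVICE_SET_IRQS ioctl expects when VFIO_IRQ_SET_DATA_EVENTFD is used.
 */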
109 
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113 	struct vfio_irq_set *irq_set;
114 	char irq_set_buf[IRQ_SET_BUF_LEN];
115 	int len, ret;
116 	int *fd_ptr;
117 
118 	len = sizeof(irq_set_buf);
119 
120 	/* enable INTx */
121 	irq_set = (struct vfio_irq_set *) irq_set_buf;
122 	irq_set->argsz = len;
123 	irq_set->count = 1;
124 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126 	irq_set->start = 0;
127 	fd_ptr = (int *) &irq_set->data;
128 	*fd_ptr = intr_handle->fd;
129 
130 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
131 
132 	if (ret) {
133 		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
134 						intr_handle->fd);
135 		return -1;
136 	}
137 
138 	/* unmask INTx after enabling */
139 	memset(irq_set, 0, len);
140 	len = sizeof(struct vfio_irq_set);
141 	irq_set->argsz = len;
142 	irq_set->count = 1;
143 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
144 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
145 	irq_set->start = 0;
146 
147 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
148 
149 	if (ret) {
150 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
151 						intr_handle->fd);
152 		return -1;
153 	}
154 	return 0;
155 }
156 
157 /* disable legacy (INTx) interrupts */
158 static int
159 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
160 	struct vfio_irq_set *irq_set;
161 	char irq_set_buf[IRQ_SET_BUF_LEN];
162 	int len, ret;
163 
164 	len = sizeof(struct vfio_irq_set);
165 
166 	/* mask interrupts before disabling */
167 	irq_set = (struct vfio_irq_set *) irq_set_buf;
168 	irq_set->argsz = len;
169 	irq_set->count = 1;
170 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
171 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
172 	irq_set->start = 0;
173 
174 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
175 
176 	if (ret) {
177 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
178 						intr_handle->fd);
179 		return -1;
180 	}
181 
182 	/* disable INTx */
183 	memset(irq_set, 0, len);
184 	irq_set->argsz = len;
185 	irq_set->count = 0;
186 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
187 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
188 	irq_set->start = 0;
189 
190 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
191 
192 	if (ret) {
193 		RTE_LOG(ERR, EAL,
194 			"Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
195 		return -1;
196 	}
197 	return 0;
198 }
199 
200 /* unmask/ack legacy (INTx) interrupts */
201 static int
202 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
203 {
204 	struct vfio_irq_set irq_set;
205 
206 	/* unmask INTx */
207 	memset(&irq_set, 0, sizeof(irq_set));
208 	irq_set.argsz = sizeof(irq_set);
209 	irq_set.count = 1;
210 	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
211 	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
212 	irq_set.start = 0;
213 
214 	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
215 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
216 			intr_handle->fd);
217 		return -1;
218 	}
219 	return 0;
220 }
221 
222 /* enable MSI interrupts */
223 static int
224 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
225 	int len, ret;
226 	char irq_set_buf[IRQ_SET_BUF_LEN];
227 	struct vfio_irq_set *irq_set;
228 	int *fd_ptr;
229 
230 	len = sizeof(irq_set_buf);
231 
232 	irq_set = (struct vfio_irq_set *) irq_set_buf;
233 	irq_set->argsz = len;
234 	irq_set->count = 1;
235 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
236 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
237 	irq_set->start = 0;
238 	fd_ptr = (int *) &irq_set->data;
239 	*fd_ptr = intr_handle->fd;
240 
241 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
242 
243 	if (ret) {
244 		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
245 						intr_handle->fd);
246 		return -1;
247 	}
248 	return 0;
249 }
250 
251 /* disable MSI interrupts */
252 static int
253 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
254 	struct vfio_irq_set *irq_set;
255 	char irq_set_buf[IRQ_SET_BUF_LEN];
256 	int len, ret;
257 
258 	len = sizeof(struct vfio_irq_set);
259 
260 	irq_set = (struct vfio_irq_set *) irq_set_buf;
261 	irq_set->argsz = len;
262 	irq_set->count = 0;
263 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
264 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
265 	irq_set->start = 0;
266 
267 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
268 
269 	if (ret)
270 		RTE_LOG(ERR, EAL,
271 			"Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
272 
273 	return ret;
274 }
275 
276 /* enable MSI-X interrupts */
277 static int
278 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
279 	int len, ret;
280 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
281 	struct vfio_irq_set *irq_set;
282 	int *fd_ptr;
283 
284 	len = sizeof(irq_set_buf);
285 
286 	irq_set = (struct vfio_irq_set *) irq_set_buf;
287 	irq_set->argsz = len;
288 	/* 0 < irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
289 	irq_set->count = intr_handle->max_intr ?
290 		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
291 		RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
292 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
293 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
294 	irq_set->start = 0;
295 	fd_ptr = (int *) &irq_set->data;
296 	/* INTR vector offset 0 is reserved for non-efd mapping */
297 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
298 	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
299 		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
300 
301 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
302 
303 	if (ret) {
304 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
305 						intr_handle->fd);
306 		return -1;
307 	}
308 
309 	return 0;
310 }
311 
312 /* disable MSI-X interrupts */
313 static int
314 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
315 	struct vfio_irq_set *irq_set;
316 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
317 	int len, ret;
318 
319 	len = sizeof(struct vfio_irq_set);
320 
321 	irq_set = (struct vfio_irq_set *) irq_set_buf;
322 	irq_set->argsz = len;
323 	irq_set->count = 0;
324 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
325 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
326 	irq_set->start = 0;
327 
328 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
329 
330 	if (ret)
331 		RTE_LOG(ERR, EAL,
332 			"Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
333 
334 	return ret;
335 }
336 
337 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
338 /* enable req notifier */
339 static int
340 vfio_enable_req(const struct rte_intr_handle *intr_handle)
341 {
342 	int len, ret;
343 	char irq_set_buf[IRQ_SET_BUF_LEN];
344 	struct vfio_irq_set *irq_set;
345 	int *fd_ptr;
346 
347 	len = sizeof(irq_set_buf);
348 
349 	irq_set = (struct vfio_irq_set *) irq_set_buf;
350 	irq_set->argsz = len;
351 	irq_set->count = 1;
352 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
353 			 VFIO_IRQ_SET_ACTION_TRIGGER;
354 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
355 	irq_set->start = 0;
356 	fd_ptr = (int *) &irq_set->data;
357 	*fd_ptr = intr_handle->fd;
358 
359 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
360 
361 	if (ret) {
362 		RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
363 						intr_handle->fd);
364 		return -1;
365 	}
366 
367 	return 0;
368 }
369 
370 /* disable req notifier */
371 static int
372 vfio_disable_req(const struct rte_intr_handle *intr_handle)
373 {
374 	struct vfio_irq_set *irq_set;
375 	char irq_set_buf[IRQ_SET_BUF_LEN];
376 	int len, ret;
377 
378 	len = sizeof(struct vfio_irq_set);
379 
380 	irq_set = (struct vfio_irq_set *) irq_set_buf;
381 	irq_set->argsz = len;
382 	irq_set->count = 0;
383 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
384 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
385 	irq_set->start = 0;
386 
387 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
388 
389 	if (ret)
390 		RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
391 			intr_handle->fd);
392 
393 	return ret;
394 }
395 #endif
396 #endif
397 
398 static int
399 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
400 {
401 	unsigned char command_high;
402 
403 	/* use UIO config file descriptor for uio_pci_generic */
404 	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
405 		RTE_LOG(ERR, EAL,
406 			"Error reading interrupts status for fd %d\n",
407 			intr_handle->uio_cfg_fd);
408 		return -1;
409 	}
410 	/* disable interrupts */
411 	command_high |= 0x4;
412 	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
413 		RTE_LOG(ERR, EAL,
414 			"Error disabling interrupts for fd %d\n",
415 			intr_handle->uio_cfg_fd);
416 		return -1;
417 	}
418 
419 	return 0;
420 }
421 
422 static int
423 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
424 {
425 	unsigned char command_high;
426 
427 	/* use UIO config file descriptor for uio_pci_generic */
428 	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
429 		RTE_LOG(ERR, EAL,
430 			"Error reading interrupts status for fd %d\n",
431 			intr_handle->uio_cfg_fd);
432 		return -1;
433 	}
434 	/* enable interrupts */
435 	command_high &= ~0x4;
436 	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
437 		RTE_LOG(ERR, EAL,
438 			"Error enabling interrupts for fd %d\n",
439 			intr_handle->uio_cfg_fd);
440 		return -1;
441 	}
442 
443 	return 0;
444 }
445 
446 static int
447 uio_intr_disable(const struct rte_intr_handle *intr_handle)
448 {
449 	const int value = 0;
450 
451 	if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
452 		RTE_LOG(ERR, EAL,
453 			"Error disabling interrupts for fd %d (%s)\n",
454 			intr_handle->fd, strerror(errno));
455 		return -1;
456 	}
457 	return 0;
458 }
459 
460 static int
461 uio_intr_enable(const struct rte_intr_handle *intr_handle)
462 {
463 	const int value = 1;
464 
465 	if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
466 		RTE_LOG(ERR, EAL,
467 			"Error enabling interrupts for fd %d (%s)\n",
468 			intr_handle->fd, strerror(errno));
469 		return -1;
470 	}
471 	return 0;
472 }
473 
474 int
475 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
476 			rte_intr_callback_fn cb, void *cb_arg)
477 {
478 	int ret, wake_thread;
479 	struct rte_intr_source *src;
480 	struct rte_intr_callback *callback;
481 
482 	wake_thread = 0;
483 
484 	/* first do parameter checking */
485 	if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
486 		RTE_LOG(ERR, EAL,
487 			"Registering with invalid input parameter\n");
488 		return -EINVAL;
489 	}
490 
491 	/* allocate a new interrupt callback entity */
492 	callback = calloc(1, sizeof(*callback));
493 	if (callback == NULL) {
494 		RTE_LOG(ERR, EAL, "Can not allocate memory\n");
495 		return -ENOMEM;
496 	}
497 	callback->cb_fn = cb;
498 	callback->cb_arg = cb_arg;
499 	callback->pending_delete = 0;
500 	callback->ucb_fn = NULL;
501 
502 	rte_spinlock_lock(&intr_lock);
503 
504 	/* check if there is at least one callback registered for the fd */
505 	TAILQ_FOREACH(src, &intr_sources, next) {
506 		if (src->intr_handle.fd == intr_handle->fd) {
507 			/* there were no callbacks for this fd before, wake the thread */
508 			if (TAILQ_EMPTY(&src->callbacks))
509 				wake_thread = 1;
510 
511 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
512 			ret = 0;
513 			break;
514 		}
515 	}
516 
517 	/* no existing callbacks for this - add new source */
518 	if (src == NULL) {
519 		src = calloc(1, sizeof(*src));
520 		if (src == NULL) {
521 			RTE_LOG(ERR, EAL, "Can not allocate memory\n");
522 			free(callback);
523 			ret = -ENOMEM;
524 		} else {
525 			src->intr_handle = *intr_handle;
526 			TAILQ_INIT(&src->callbacks);
527 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528 			TAILQ_INSERT_TAIL(&intr_sources, src, next);
529 			wake_thread = 1;
530 			ret = 0;
531 		}
532 	}
533 
534 	rte_spinlock_unlock(&intr_lock);
535 
536 	/**
537 	 * check whether we need to notify the pipe fd that epoll_wait is
538 	 * waiting on, so that it rebuilds the wait list.
539 	 */
540 	if (wake_thread)
541 		if (write(intr_pipe.writefd, "1", 1) < 0)
542 			ret = -EPIPE;
543 
544 	rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
545 	return ret;
546 }
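
/*
 * Illustrative usage sketch (not part of this file): a driver typically
 * registers its interrupt service routine once at setup time and removes it
 * on close.  The names my_dev and my_dev_interrupt_handler are hypothetical.
 *
 *   static void my_dev_interrupt_handler(void *param)
 *   {
 *           struct my_dev *dev = param;
 *           // handle the event, then re-arm/ack as needed
 *   }
 *
 *   // at init:
 *   rte_intr_callback_register(intr_handle, my_dev_interrupt_handler, dev);
 *   rte_intr_enable(intr_handle);
 *
 *   // at teardown:
 *   rte_intr_disable(intr_handle);
 *   rte_intr_callback_unregister(intr_handle, my_dev_interrupt_handler, dev);
 */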
547 
548 int
549 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
550 				rte_intr_callback_fn cb_fn, void *cb_arg,
551 				rte_intr_unregister_callback_fn ucb_fn)
552 {
553 	int ret;
554 	struct rte_intr_source *src;
555 	struct rte_intr_callback *cb, *next;
556 
557 	/* do parameter checking first */
558 	if (intr_handle == NULL || intr_handle->fd < 0) {
559 		RTE_LOG(ERR, EAL,
560 		"Unregistering with invalid input parameter\n");
561 		return -EINVAL;
562 	}
563 
564 	rte_spinlock_lock(&intr_lock);
565 
566 	/* check if an interrupt source for the fd exists */
567 	TAILQ_FOREACH(src, &intr_sources, next)
568 		if (src->intr_handle.fd == intr_handle->fd)
569 			break;
570 
571 	/* No interrupt source registered for the fd */
572 	if (src == NULL) {
573 		ret = -ENOENT;
574 
575 	/* only usable if the source is active */
576 	} else if (src->active == 0) {
577 		ret = -EAGAIN;
578 
579 	} else {
580 		ret = 0;
581 
582 		/* walk through the callbacks and mark all that match. */
583 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
584 			next = TAILQ_NEXT(cb, next);
585 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
586 					cb->cb_arg == cb_arg)) {
587 				cb->pending_delete = 1;
588 				cb->ucb_fn = ucb_fn;
589 				ret++;
590 			}
591 		}
592 	}
593 
594 	rte_spinlock_unlock(&intr_lock);
595 
596 	return ret;
597 }
598 
599 int
600 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
601 			rte_intr_callback_fn cb_fn, void *cb_arg)
602 {
603 	int ret;
604 	struct rte_intr_source *src;
605 	struct rte_intr_callback *cb, *next;
606 
607 	/* do parameter checking first */
608 	if (intr_handle == NULL || intr_handle->fd < 0) {
609 		RTE_LOG(ERR, EAL,
610 		"Unregistering with invalid input parameter\n");
611 		return -EINVAL;
612 	}
613 
614 	rte_spinlock_lock(&intr_lock);
615 
616 	/* check if an interrupt source for the fd exists */
617 	TAILQ_FOREACH(src, &intr_sources, next)
618 		if (src->intr_handle.fd == intr_handle->fd)
619 			break;
620 
621 	/* No interrupt source registered for the fd */
622 	if (src == NULL) {
623 		ret = -ENOENT;
624 
625 	/* interrupt source has some active callbacks right now. */
626 	} else if (src->active != 0) {
627 		ret = -EAGAIN;
628 
629 	/* ok to remove. */
630 	} else {
631 		ret = 0;
632 
633 		/* walk through the callbacks and remove all that match. */
634 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
635 
636 			next = TAILQ_NEXT(cb, next);
637 
638 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
639 					cb->cb_arg == cb_arg)) {
640 				TAILQ_REMOVE(&src->callbacks, cb, next);
641 				free(cb);
642 				ret++;
643 			}
644 		}
645 
646 		/* all callbacks for that source are removed. */
647 		if (TAILQ_EMPTY(&src->callbacks)) {
648 			TAILQ_REMOVE(&intr_sources, src, next);
649 			free(src);
650 		}
651 	}
652 
653 	rte_spinlock_unlock(&intr_lock);
654 
655 	/* notify the pipe fd that epoll_wait waits on, so the wait list is rebuilt */
656 	if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
657 		ret = -EPIPE;
658 	}
659 
660 	rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
661 		ret);
662 	return ret;
663 }
664 
665 int
666 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
667 			rte_intr_callback_fn cb_fn, void *cb_arg)
668 {
669 	int ret = 0;
670 
671 	while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
672 		rte_pause();
673 
674 	return ret;
675 }
676 
677 int
678 rte_intr_enable(const struct rte_intr_handle *intr_handle)
679 {
680 	int rc = 0;
681 
682 	if (intr_handle == NULL)
683 		return -1;
684 
685 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
686 		rc = 0;
687 		goto out;
688 	}
689 
690 	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
691 		rc = -1;
692 		goto out;
693 	}
694 
695 	switch (intr_handle->type){
696 	/* write to the uio fd to enable the interrupt */
697 	case RTE_INTR_HANDLE_UIO:
698 		if (uio_intr_enable(intr_handle))
699 			rc = -1;
700 		break;
701 	case RTE_INTR_HANDLE_UIO_INTX:
702 		if (uio_intx_intr_enable(intr_handle))
703 			rc = -1;
704 		break;
705 	/* not used at this moment */
706 	case RTE_INTR_HANDLE_ALARM:
707 		rc = -1;
708 		break;
709 #ifdef VFIO_PRESENT
710 	case RTE_INTR_HANDLE_VFIO_MSIX:
711 		if (vfio_enable_msix(intr_handle))
712 			rc = -1;
713 		break;
714 	case RTE_INTR_HANDLE_VFIO_MSI:
715 		if (vfio_enable_msi(intr_handle))
716 			rc = -1;
717 		break;
718 	case RTE_INTR_HANDLE_VFIO_LEGACY:
719 		if (vfio_enable_intx(intr_handle))
720 			rc = -1;
721 		break;
722 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
723 	case RTE_INTR_HANDLE_VFIO_REQ:
724 		if (vfio_enable_req(intr_handle))
725 			rc = -1;
726 		break;
727 #endif
728 #endif
729 	/* not used at this moment */
730 	case RTE_INTR_HANDLE_DEV_EVENT:
731 		rc = -1;
732 		break;
733 	/* unknown handle type */
734 	default:
735 		RTE_LOG(ERR, EAL,
736 			"Unknown handle type of fd %d\n",
737 					intr_handle->fd);
738 		rc = -1;
739 		break;
740 	}
741 out:
742 	rte_eal_trace_intr_enable(intr_handle, rc);
743 	return rc;
744 }
745 
746 /**
747  * PMD generally calls this function at the end of its IRQ callback.
748  * Internally, it unmasks the interrupt if possible.
749  *
750  * For INTx, unmasking is required as the interrupt is auto-masked prior to
751  * invoking callback.
752  *
753  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
754  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
755  * this function is a no-op.
756  */
757 int
758 rte_intr_ack(const struct rte_intr_handle *intr_handle)
759 {
760 	if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
761 		return 0;
762 
763 	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
764 		return -1;
765 
766 	switch (intr_handle->type) {
767 	/* Both acking and enabling are the same for UIO */
768 	case RTE_INTR_HANDLE_UIO:
769 		if (uio_intr_enable(intr_handle))
770 			return -1;
771 		break;
772 	case RTE_INTR_HANDLE_UIO_INTX:
773 		if (uio_intx_intr_enable(intr_handle))
774 			return -1;
775 		break;
776 	/* not used at this moment */
777 	case RTE_INTR_HANDLE_ALARM:
778 		return -1;
779 #ifdef VFIO_PRESENT
780 	/* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
781 	case RTE_INTR_HANDLE_VFIO_MSIX:
782 	case RTE_INTR_HANDLE_VFIO_MSI:
783 		return 0;
784 	case RTE_INTR_HANDLE_VFIO_LEGACY:
785 		if (vfio_ack_intx(intr_handle))
786 			return -1;
787 		break;
788 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
789 	case RTE_INTR_HANDLE_VFIO_REQ:
790 		return -1;
791 #endif
792 #endif
793 	/* not used at this moment */
794 	case RTE_INTR_HANDLE_DEV_EVENT:
795 		return -1;
796 	/* unknown handle type */
797 	default:
798 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
799 			intr_handle->fd);
800 		return -1;
801 	}
802 
803 	return 0;
804 }
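
/*
 * Illustrative sketch (not part of this file): a PMD's interrupt callback
 * normally services the event and then calls rte_intr_ack() so a subsequent
 * INTx interrupt can be delivered.  Names below are hypothetical.
 *
 *   static void my_pmd_intr_cb(void *param)
 *   {
 *           struct my_adapter *ad = param;
 *
 *           my_pmd_handle_events(ad);          // driver-specific work
 *           rte_intr_ack(ad->intr_handle);     // unmask INTx; no-op for MSI/MSI-X
 *   }
 */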
805 
806 int
807 rte_intr_disable(const struct rte_intr_handle *intr_handle)
808 {
809 	int rc = 0;
810 
811 	if (intr_handle == NULL)
812 		return -1;
813 
814 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
815 		rc = 0;
816 		goto out;
817 	}
818 
819 	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
820 		rc = -1;
821 		goto out;
822 	}
823 
824 	switch (intr_handle->type){
825 	/* write to the uio fd to disable the interrupt */
826 	case RTE_INTR_HANDLE_UIO:
827 		if (uio_intr_disable(intr_handle))
828 			rc = -1;
829 		break;
830 	case RTE_INTR_HANDLE_UIO_INTX:
831 		if (uio_intx_intr_disable(intr_handle))
832 			rc = -1;
833 		break;
834 	/* not used at this moment */
835 	case RTE_INTR_HANDLE_ALARM:
836 		rc = -1;
837 		break;
838 #ifdef VFIO_PRESENT
839 	case RTE_INTR_HANDLE_VFIO_MSIX:
840 		if (vfio_disable_msix(intr_handle))
841 			rc = -1;
842 		break;
843 	case RTE_INTR_HANDLE_VFIO_MSI:
844 		if (vfio_disable_msi(intr_handle))
845 			rc = -1;
846 		break;
847 	case RTE_INTR_HANDLE_VFIO_LEGACY:
848 		if (vfio_disable_intx(intr_handle))
849 			rc = -1;
850 		break;
851 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
852 	case RTE_INTR_HANDLE_VFIO_REQ:
853 		if (vfio_disable_req(intr_handle))
854 			rc = -1;
855 		break;
856 #endif
857 #endif
858 	/* not used at this moment */
859 	case RTE_INTR_HANDLE_DEV_EVENT:
860 		rc = -1;
861 		break;
862 	/* unknown handle type */
863 	default:
864 		RTE_LOG(ERR, EAL,
865 			"Unknown handle type of fd %d\n",
866 					intr_handle->fd);
867 		rc = -1;
868 		break;
869 	}
870 out:
871 	rte_eal_trace_intr_disable(intr_handle, rc);
872 	return rc;
873 }
874 
875 static int
876 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
877 {
878 	bool call = false;
879 	int n, bytes_read, rv;
880 	struct rte_intr_source *src;
881 	struct rte_intr_callback *cb, *next;
882 	union rte_intr_read_buffer buf;
883 	struct rte_intr_callback active_cb;
884 
885 	for (n = 0; n < nfds; n++) {
886 
887 		/**
888 		 * if the pipe fd is ready to read, return out to
889 		 * rebuild the wait list.
890 		 */
891 		if (events[n].data.fd == intr_pipe.readfd){
892 			int r = read(intr_pipe.readfd, buf.charbuf,
893 					sizeof(buf.charbuf));
894 			RTE_SET_USED(r);
895 			return -1;
896 		}
897 		rte_spinlock_lock(&intr_lock);
898 		TAILQ_FOREACH(src, &intr_sources, next)
899 			if (src->intr_handle.fd ==
900 					events[n].data.fd)
901 				break;
902 		if (src == NULL){
903 			rte_spinlock_unlock(&intr_lock);
904 			continue;
905 		}
906 
907 		/* mark this interrupt source as active and release the lock. */
908 		src->active = 1;
909 		rte_spinlock_unlock(&intr_lock);
910 
911 		/* set the length to be read for the different handle types */
912 		switch (src->intr_handle.type) {
913 		case RTE_INTR_HANDLE_UIO:
914 		case RTE_INTR_HANDLE_UIO_INTX:
915 			bytes_read = sizeof(buf.uio_intr_count);
916 			break;
917 		case RTE_INTR_HANDLE_ALARM:
918 			bytes_read = sizeof(buf.timerfd_num);
919 			break;
920 #ifdef VFIO_PRESENT
921 		case RTE_INTR_HANDLE_VFIO_MSIX:
922 		case RTE_INTR_HANDLE_VFIO_MSI:
923 		case RTE_INTR_HANDLE_VFIO_LEGACY:
924 			bytes_read = sizeof(buf.vfio_intr_count);
925 			break;
926 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
927 		case RTE_INTR_HANDLE_VFIO_REQ:
928 			bytes_read = 0;
929 			call = true;
930 			break;
931 #endif
932 #endif
933 		case RTE_INTR_HANDLE_VDEV:
934 		case RTE_INTR_HANDLE_EXT:
935 			bytes_read = 0;
936 			call = true;
937 			break;
938 		case RTE_INTR_HANDLE_DEV_EVENT:
939 			bytes_read = 0;
940 			call = true;
941 			break;
942 		default:
943 			bytes_read = 1;
944 			break;
945 		}
946 
947 		if (bytes_read > 0) {
948 			/**
949 			 * read out to clear the ready-to-be-read flag
950 			 * for epoll_wait.
951 			 */
952 			bytes_read = read(events[n].data.fd, &buf, bytes_read);
953 			if (bytes_read < 0) {
954 				if (errno == EINTR || errno == EWOULDBLOCK)
955 					continue;
956 
957 				RTE_LOG(ERR, EAL, "Error reading from file "
958 					"descriptor %d: %s\n",
959 					events[n].data.fd,
960 					strerror(errno));
961 				/*
962 				 * The device is unplugged or buggy, remove
963 				 * it as an interrupt source and return to
964 				 * force the wait list to be rebuilt.
965 				 */
966 				rte_spinlock_lock(&intr_lock);
967 				TAILQ_REMOVE(&intr_sources, src, next);
968 				rte_spinlock_unlock(&intr_lock);
969 
970 				for (cb = TAILQ_FIRST(&src->callbacks); cb;
971 							cb = next) {
972 					next = TAILQ_NEXT(cb, next);
973 					TAILQ_REMOVE(&src->callbacks, cb, next);
974 					free(cb);
975 				}
976 				free(src);
977 				return -1;
978 			} else if (bytes_read == 0)
979 				RTE_LOG(ERR, EAL, "Read nothing from file "
980 					"descriptor %d\n", events[n].data.fd);
981 			else
982 				call = true;
983 		}
984 
985 		/* grab the lock again to call callbacks and update status. */
986 		rte_spinlock_lock(&intr_lock);
987 
988 		if (call) {
989 
990 			/* Finally, call all callbacks. */
991 			TAILQ_FOREACH(cb, &src->callbacks, next) {
992 
993 				/* make a copy and unlock. */
994 				active_cb = *cb;
995 				rte_spinlock_unlock(&intr_lock);
996 
997 				/* call the actual callback */
998 				active_cb.cb_fn(active_cb.cb_arg);
999 
1000 				/* get the lock back. */
1001 				rte_spinlock_lock(&intr_lock);
1002 			}
1003 		}
1004 		/* we are done with that interrupt source, release it. */
1005 		src->active = 0;
1006 
1007 		rv = 0;
1008 
1009 		/* check if any callbacks are supposed to be removed */
1010 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1011 			next = TAILQ_NEXT(cb, next);
1012 			if (cb->pending_delete) {
1013 				TAILQ_REMOVE(&src->callbacks, cb, next);
1014 				if (cb->ucb_fn)
1015 					cb->ucb_fn(&src->intr_handle, cb->cb_arg);
1016 				free(cb);
1017 				rv++;
1018 			}
1019 		}
1020 
1021 		/* all callbacks for that source are removed. */
1022 		if (TAILQ_EMPTY(&src->callbacks)) {
1023 			TAILQ_REMOVE(&intr_sources, src, next);
1024 			free(src);
1025 		}
1026 
1027 		/* notify the pipe fd that epoll_wait waits on, so the wait list is rebuilt */
1028 		if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1029 			rte_spinlock_unlock(&intr_lock);
1030 			return -EPIPE;
1031 		}
1032 
1033 		rte_spinlock_unlock(&intr_lock);
1034 	}
1035 
1036 	return 0;
1037 }
1038 
1039 /**
1040  * It handles all the interrupts.
1041  *
1042  * @param pfd
1043  *  epoll file descriptor.
1044  * @param totalfds
1045  *  The number of file descriptors added in epoll.
1046  *
1047  * @return
1048  *  void
1049  */
1050 static void
1051 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1052 {
1053 	struct epoll_event events[totalfds];
1054 	int nfds = 0;
1055 
1056 	for(;;) {
1057 		nfds = epoll_wait(pfd, events, totalfds,
1058 			EAL_INTR_EPOLL_WAIT_FOREVER);
1059 		/* epoll_wait fail */
1060 		if (nfds < 0) {
1061 			if (errno == EINTR)
1062 				continue;
1063 			RTE_LOG(ERR, EAL,
1064 				"epoll_wait returns with fail\n");
1065 			return;
1066 		}
1067 		/* epoll_wait timeout, will never happen here */
1068 		else if (nfds == 0)
1069 			continue;
1070 		/* epoll_wait has at least one fd ready to read */
1071 		if (eal_intr_process_interrupts(events, nfds) < 0)
1072 			return;
1073 	}
1074 }
1075 
1076 /**
1077  * It builds/rebuilds up the epoll file descriptor with all the
1078  * file descriptors being waited on. Then handles the interrupts.
1079  *
1080  * @param arg
1081  *  pointer. (unused)
1082  *
1083  * @return
1084  *  never returns
1085  */
1086 static __rte_noreturn void *
1087 eal_intr_thread_main(__rte_unused void *arg)
1088 {
1089 	/* host thread, never break out */
1090 	for (;;) {
1091 		/* build up the epoll fd with all descriptors we are to
1092 		 * wait on then pass it to the handle_interrupts function
1093 		 */
1094 		static struct epoll_event pipe_event = {
1095 			.events = EPOLLIN | EPOLLPRI,
1096 		};
1097 		struct rte_intr_source *src;
1098 		unsigned numfds = 0;
1099 
1100 		/* create epoll fd */
1101 		int pfd = epoll_create(1);
1102 		if (pfd < 0)
1103 			rte_panic("Cannot create epoll instance\n");
1104 
1105 		pipe_event.data.fd = intr_pipe.readfd;
1106 		/**
1107 		 * add the pipe fd into the wait list; this pipe is used
1108 		 * to trigger a rebuild of the wait list.
1109 		 */
1110 		if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1111 						&pipe_event) < 0) {
1112 			rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1113 					intr_pipe.readfd, strerror(errno));
1114 		}
1115 		numfds++;
1116 
1117 		rte_spinlock_lock(&intr_lock);
1118 
1119 		TAILQ_FOREACH(src, &intr_sources, next) {
1120 			struct epoll_event ev;
1121 
1122 			if (src->callbacks.tqh_first == NULL)
1123 				continue; /* skip those with no callbacks */
1124 			memset(&ev, 0, sizeof(ev));
1125 			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1126 			ev.data.fd = src->intr_handle.fd;
1127 
1128 			/**
1129 			 * add all the device file descriptors
1130 			 * into the wait list.
1131 			 */
1132 			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1133 					src->intr_handle.fd, &ev) < 0){
1134 				rte_panic("Error adding fd %d epoll_ctl, %s\n",
1135 					src->intr_handle.fd, strerror(errno));
1136 			}
1137 			else
1138 				numfds++;
1139 		}
1140 		rte_spinlock_unlock(&intr_lock);
1141 		/* serve the interrupt */
1142 		eal_intr_handle_interrupts(pfd, numfds);
1143 
1144 		/**
1145 		 * when we return, we need to rebuild the
1146 		 * list of fds to monitor.
1147 		 */
1148 		close(pfd);
1149 	}
1150 }
1151 
1152 int
1153 rte_eal_intr_init(void)
1154 {
1155 	int ret = 0;
1156 
1157 	/* init the global interrupt source head */
1158 	TAILQ_INIT(&intr_sources);
1159 
1160 	/**
1161 	 * create a pipe that will be waited on by epoll and written to
1162 	 * whenever the epoll wait list needs to be rebuilt.
1163 	 */
1164 	if (pipe(intr_pipe.pipefd) < 0) {
1165 		rte_errno = errno;
1166 		return -1;
1167 	}
1168 
1169 	/* create the host thread to wait/handle the interrupt */
1170 	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1171 			eal_intr_thread_main, NULL);
1172 	if (ret != 0) {
1173 		rte_errno = -ret;
1174 		RTE_LOG(ERR, EAL,
1175 			"Failed to create thread for interrupt handling\n");
1176 	}
1177 
1178 	return ret;
1179 }
1180 
1181 static void
1182 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1183 {
1184 	union rte_intr_read_buffer buf;
1185 	int bytes_read = 0;
1186 	int nbytes;
1187 
1188 	switch (intr_handle->type) {
1189 	case RTE_INTR_HANDLE_UIO:
1190 	case RTE_INTR_HANDLE_UIO_INTX:
1191 		bytes_read = sizeof(buf.uio_intr_count);
1192 		break;
1193 #ifdef VFIO_PRESENT
1194 	case RTE_INTR_HANDLE_VFIO_MSIX:
1195 	case RTE_INTR_HANDLE_VFIO_MSI:
1196 	case RTE_INTR_HANDLE_VFIO_LEGACY:
1197 		bytes_read = sizeof(buf.vfio_intr_count);
1198 		break;
1199 #endif
1200 	case RTE_INTR_HANDLE_VDEV:
1201 		bytes_read = intr_handle->efd_counter_size;
1202 		/* For vdev, number of bytes to read is set by driver */
1203 		break;
1204 	case RTE_INTR_HANDLE_EXT:
1205 		return;
1206 	default:
1207 		bytes_read = 1;
1208 		RTE_LOG(INFO, EAL, "unexpected intr type\n");
1209 		break;
1210 	}
1211 
1212 	/**
1213 	 * read out to clear the ready-to-be-read flag
1214 	 * for epoll_wait.
1215 	 */
1216 	if (bytes_read == 0)
1217 		return;
1218 	do {
1219 		nbytes = read(fd, &buf, bytes_read);
1220 		if (nbytes < 0) {
1221 			if (errno == EINTR || errno == EWOULDBLOCK ||
1222 			    errno == EAGAIN)
1223 				continue;
1224 			RTE_LOG(ERR, EAL,
1225 				"Error reading from fd %d: %s\n",
1226 				fd, strerror(errno));
1227 		} else if (nbytes == 0)
1228 			RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1229 		return;
1230 	} while (1);
1231 }
1232 
1233 static int
1234 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1235 			struct rte_epoll_event *events)
1236 {
1237 	unsigned int i, count = 0;
1238 	struct rte_epoll_event *rev;
1239 	uint32_t valid_status;
1240 
1241 	for (i = 0; i < n; i++) {
1242 		rev = evs[i].data.ptr;
1243 		valid_status =  RTE_EPOLL_VALID;
1244 		/* ACQUIRE memory ordering here pairs with RELEASE
1245 		 * ordering below acting as a lock to synchronize
1246 		 * the event data updating.
1247 		 */
1248 		if (!rev || !__atomic_compare_exchange_n(&rev->status,
1249 				    &valid_status, RTE_EPOLL_EXEC, 0,
1250 				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1251 			continue;
1252 
1253 		events[count].status        = RTE_EPOLL_VALID;
1254 		events[count].fd            = rev->fd;
1255 		events[count].epfd          = rev->epfd;
1256 		events[count].epdata.event  = evs[i].events;
1257 		events[count].epdata.data   = rev->epdata.data;
1258 		if (rev->epdata.cb_fun)
1259 			rev->epdata.cb_fun(rev->fd,
1260 					   rev->epdata.cb_arg);
1261 
1262 		/* the status update should be observed after
1263 		 * the other fields change.
1264 		 */
1265 		__atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1266 				__ATOMIC_RELEASE);
1267 		count++;
1268 	}
1269 	return count;
1270 }
1271 
1272 static inline int
1273 eal_init_tls_epfd(void)
1274 {
1275 	int pfd = epoll_create(255);
1276 
1277 	if (pfd < 0) {
1278 		RTE_LOG(ERR, EAL,
1279 			"Cannot create epoll instance\n");
1280 		return -1;
1281 	}
1282 	return pfd;
1283 }
1284 
1285 int
1286 rte_intr_tls_epfd(void)
1287 {
1288 	if (RTE_PER_LCORE(_epfd) == -1)
1289 		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1290 
1291 	return RTE_PER_LCORE(_epfd);
1292 }
1293 
1294 static int
1295 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1296 	       int maxevents, int timeout, bool interruptible)
1297 {
1298 	struct epoll_event evs[maxevents];
1299 	int rc;
1300 
1301 	if (!events) {
1302 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1303 		return -1;
1304 	}
1305 
1306 	/* using per thread epoll fd */
1307 	if (epfd == RTE_EPOLL_PER_THREAD)
1308 		epfd = rte_intr_tls_epfd();
1309 
1310 	while (1) {
1311 		rc = epoll_wait(epfd, evs, maxevents, timeout);
1312 		if (likely(rc > 0)) {
1313 			/* epoll_wait has at least one fd ready to read */
1314 			rc = eal_epoll_process_event(evs, rc, events);
1315 			break;
1316 		} else if (rc < 0) {
1317 			if (errno == EINTR) {
1318 				if (interruptible)
1319 					return -1;
1320 				else
1321 					continue;
1322 			}
1323 			/* epoll_wait fail */
1324 			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1325 				strerror(errno));
1326 			rc = -1;
1327 			break;
1328 		} else {
1329 			/* rc == 0, epoll_wait timed out */
1330 			break;
1331 		}
1332 	}
1333 
1334 	return rc;
1335 }
1336 
1337 int
1338 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1339 	       int maxevents, int timeout)
1340 {
1341 	return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1342 }
1343 
1344 int
1345 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1346 			     int maxevents, int timeout)
1347 {
1348 	return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1349 }
1350 
1351 static inline void
1352 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1353 {
1354 	uint32_t valid_status = RTE_EPOLL_VALID;
1355 
1356 	while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1357 		    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1358 		while (__atomic_load_n(&ev->status,
1359 				__ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1360 			rte_pause();
1361 		valid_status = RTE_EPOLL_VALID;
1362 	}
1363 	memset(&ev->epdata, 0, sizeof(ev->epdata));
1364 	ev->fd = -1;
1365 	ev->epfd = -1;
1366 }
1367 
1368 int
1369 rte_epoll_ctl(int epfd, int op, int fd,
1370 	      struct rte_epoll_event *event)
1371 {
1372 	struct epoll_event ev;
1373 
1374 	if (!event) {
1375 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1376 		return -1;
1377 	}
1378 
1379 	/* using per thread epoll fd */
1380 	if (epfd == RTE_EPOLL_PER_THREAD)
1381 		epfd = rte_intr_tls_epfd();
1382 
1383 	if (op == EPOLL_CTL_ADD) {
1384 		__atomic_store_n(&event->status, RTE_EPOLL_VALID,
1385 				__ATOMIC_RELAXED);
1386 		event->fd = fd;  /* ignore fd in event */
1387 		event->epfd = epfd;
1388 		ev.data.ptr = (void *)event;
1389 	}
1390 
1391 	ev.events = event->epdata.event;
1392 	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1393 		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1394 			op, fd, strerror(errno));
1395 		if (op == EPOLL_CTL_ADD)
1396 			/* roll back status when CTL_ADD fails */
1397 			__atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1398 					__ATOMIC_RELAXED);
1399 		return -1;
1400 	}
1401 
1402 	if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1403 			__ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1404 		eal_epoll_data_safe_free(event);
1405 
1406 	return 0;
1407 }
1408 
1409 int
1410 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1411 		int op, unsigned int vec, void *data)
1412 {
1413 	struct rte_epoll_event *rev;
1414 	struct rte_epoll_data *epdata;
1415 	int epfd_op;
1416 	unsigned int efd_idx;
1417 	int rc = 0;
1418 
1419 	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1420 		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1421 
1422 	if (!intr_handle || intr_handle->nb_efd == 0 ||
1423 	    efd_idx >= intr_handle->nb_efd) {
1424 		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1425 		return -EPERM;
1426 	}
1427 
1428 	switch (op) {
1429 	case RTE_INTR_EVENT_ADD:
1430 		epfd_op = EPOLL_CTL_ADD;
1431 		rev = &intr_handle->elist[efd_idx];
1432 		if (__atomic_load_n(&rev->status,
1433 				__ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1434 			RTE_LOG(INFO, EAL, "Event already been added.\n");
1435 			return -EEXIST;
1436 		}
1437 
1438 		/* attach to intr vector fd */
1439 		epdata = &rev->epdata;
1440 		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1441 		epdata->data   = data;
1442 		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1443 		epdata->cb_arg = (void *)intr_handle;
1444 		rc = rte_epoll_ctl(epfd, epfd_op,
1445 				   intr_handle->efds[efd_idx], rev);
1446 		if (!rc)
1447 			RTE_LOG(DEBUG, EAL,
1448 				"efd %d associated with vec %d added on epfd %d"
1449 				"\n", rev->fd, vec, epfd);
1450 		else
1451 			rc = -EPERM;
1452 		break;
1453 	case RTE_INTR_EVENT_DEL:
1454 		epfd_op = EPOLL_CTL_DEL;
1455 		rev = &intr_handle->elist[efd_idx];
1456 		if (__atomic_load_n(&rev->status,
1457 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1458 			RTE_LOG(INFO, EAL, "Event does not exist.\n");
1459 			return -EPERM;
1460 		}
1461 
1462 		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1463 		if (rc)
1464 			rc = -EPERM;
1465 		break;
1466 	default:
1467 		RTE_LOG(ERR, EAL, "event op type mismatch\n");
1468 		rc = -EPERM;
1469 	}
1470 
1471 	return rc;
1472 }
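
/*
 * Illustrative sketch of the Rx-interrupt data-path usage (not part of this
 * file): after rte_intr_efd_enable() has set up the per-queue eventfds, an
 * lcore adds the vector it cares about to its per-thread epoll fd and then
 * blocks in rte_epoll_wait() while the queue is idle.  "handle", "qid" and
 * "timeout_ms" are hypothetical variables owned by the caller.
 *
 *   struct rte_epoll_event ev;
 *
 *   rte_intr_rx_ctl(handle, RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD,
 *                   RTE_INTR_VEC_RXTX_OFFSET + qid, NULL);
 *   ...
 *   // sleep until the queue interrupt fires or the timeout expires
 *   rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, timeout_ms);
 */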
1473 
1474 void
1475 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1476 {
1477 	uint32_t i;
1478 	struct rte_epoll_event *rev;
1479 
1480 	for (i = 0; i < intr_handle->nb_efd; i++) {
1481 		rev = &intr_handle->elist[i];
1482 		if (__atomic_load_n(&rev->status,
1483 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1484 			continue;
1485 		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1486 			/* force free if the entry is valid */
1487 			eal_epoll_data_safe_free(rev);
1488 		}
1489 	}
1490 }
1491 
1492 int
1493 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1494 {
1495 	uint32_t i;
1496 	int fd;
1497 	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1498 
1499 	assert(nb_efd != 0);
1500 
1501 	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
1502 		for (i = 0; i < n; i++) {
1503 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1504 			if (fd < 0) {
1505 				RTE_LOG(ERR, EAL,
1506 					"can't setup eventfd, error %i (%s)\n",
1507 					errno, strerror(errno));
1508 				return -errno;
1509 			}
1510 			intr_handle->efds[i] = fd;
1511 		}
1512 		intr_handle->nb_efd   = n;
1513 		intr_handle->max_intr = NB_OTHER_INTR + n;
1514 	} else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
1515 		/* only check; initialization is done in the vdev driver. */
1516 		if (intr_handle->efd_counter_size >
1517 		    sizeof(union rte_intr_read_buffer)) {
1518 			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1519 			return -EINVAL;
1520 		}
1521 	} else {
1522 		intr_handle->efds[0]  = intr_handle->fd;
1523 		intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
1524 		intr_handle->max_intr = NB_OTHER_INTR;
1525 	}
1526 
1527 	return 0;
1528 }
1529 
1530 void
1531 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1532 {
1533 	uint32_t i;
1534 
1535 	rte_intr_free_epoll_fd(intr_handle);
1536 	if (intr_handle->max_intr > intr_handle->nb_efd) {
1537 		for (i = 0; i < intr_handle->nb_efd; i++)
1538 			close(intr_handle->efds[i]);
1539 	}
1540 	intr_handle->nb_efd = 0;
1541 	intr_handle->max_intr = 0;
1542 }
1543 
1544 int
1545 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1546 {
1547 	return !(!intr_handle->nb_efd);
1548 }
1549 
1550 int
1551 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1552 {
1553 	if (!rte_intr_dp_is_en(intr_handle))
1554 		return 1;
1555 	else
1556 		return !!(intr_handle->max_intr - intr_handle->nb_efd);
1557 }
1558 
1559 int
1560 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1561 {
1562 	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
1563 		return 1;
1564 
1565 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
1566 		return 1;
1567 
1568 	return 0;
1569 }
1570 
1571 int rte_thread_is_intr(void)
1572 {
1573 	return pthread_equal(intr_thread, pthread_self());
1574 }
1575