xref: /dpdk/lib/eal/linux/eal_interrupts.c (revision 8b8036a66e3d59ffa58afb8d96fa2c73262155a7)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21 
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37 
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41 
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44 
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46 
47 /**
48  * union for pipe fds.
49  */
50 union intr_pipefds {
51 	struct {
52 		int pipefd[2];
53 	};
54 	struct {
55 		int readfd;
56 		int writefd;
57 	};
58 };
59 
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64 	int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66 	uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68 	uint64_t timerfd_num;            /* for timerfd */
69 	char charbuf[16];                /* for others */
70 };
71 
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74 
75 struct rte_intr_callback {
76 	TAILQ_ENTRY(rte_intr_callback) next;
77 	rte_intr_callback_fn cb_fn;  /**< callback address */
78 	void *cb_arg;                /**< parameter for callback */
79 	uint8_t pending_delete;      /**< delete after callback is called */
80 	rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82 
83 struct rte_intr_source {
84 	TAILQ_ENTRY(rte_intr_source) next;
85 	struct rte_intr_handle *intr_handle; /**< interrupt handle */
86 	struct rte_intr_cb_list callbacks;  /**< user callbacks */
87 	uint32_t active;
88 };
89 
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92 
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95 
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98 
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101 
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104 
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108 			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
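
/*
 * Layout note (added for clarity): with VFIO_IRQ_SET_DATA_EVENTFD, the
 * VFIO_DEVICE_SET_IRQS payload is a struct vfio_irq_set header immediately
 * followed by irq_set->count eventfd descriptors, hence the extra
 * sizeof(int) slots reserved in the buffer sizes above.
 */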
109 
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113 	struct vfio_irq_set *irq_set;
114 	char irq_set_buf[IRQ_SET_BUF_LEN];
115 	int len, ret, vfio_dev_fd;
116 	int *fd_ptr;
117 
118 	len = sizeof(irq_set_buf);
119 
120 	/* enable INTx */
121 	irq_set = (struct vfio_irq_set *) irq_set_buf;
122 	irq_set->argsz = len;
123 	irq_set->count = 1;
124 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126 	irq_set->start = 0;
127 	fd_ptr = (int *) &irq_set->data;
128 	*fd_ptr = rte_intr_fd_get(intr_handle);
129 
130 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
131 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
132 
133 	if (ret) {
134 		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
135 			rte_intr_fd_get(intr_handle));
136 		return -1;
137 	}
138 
139 	/* unmask INTx after enabling */
140 	memset(irq_set, 0, len);
141 	len = sizeof(struct vfio_irq_set);
142 	irq_set->argsz = len;
143 	irq_set->count = 1;
144 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
145 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
146 	irq_set->start = 0;
147 
148 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
149 
150 	if (ret) {
151 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
152 			rte_intr_fd_get(intr_handle));
153 		return -1;
154 	}
155 	return 0;
156 }
157 
158 /* disable legacy (INTx) interrupts */
159 static int
160 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
161 	struct vfio_irq_set *irq_set;
162 	char irq_set_buf[IRQ_SET_BUF_LEN];
163 	int len, ret, vfio_dev_fd;
164 
165 	len = sizeof(struct vfio_irq_set);
166 
167 	/* mask interrupts before disabling */
168 	irq_set = (struct vfio_irq_set *) irq_set_buf;
169 	irq_set->argsz = len;
170 	irq_set->count = 1;
171 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
172 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
173 	irq_set->start = 0;
174 
175 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
176 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
177 
178 	if (ret) {
179 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
180 			rte_intr_fd_get(intr_handle));
181 		return -1;
182 	}
183 
184 	/* disable INTx */
185 	memset(irq_set, 0, len);
186 	irq_set->argsz = len;
187 	irq_set->count = 0;
188 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
189 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
190 	irq_set->start = 0;
191 
192 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
193 
194 	if (ret) {
195 		RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
196 			rte_intr_fd_get(intr_handle));
197 		return -1;
198 	}
199 	return 0;
200 }
201 
202 /* unmask/ack legacy (INTx) interrupts */
203 static int
204 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
205 {
206 	struct vfio_irq_set irq_set;
207 	int vfio_dev_fd;
208 
209 	/* unmask INTx */
210 	memset(&irq_set, 0, sizeof(irq_set));
211 	irq_set.argsz = sizeof(irq_set);
212 	irq_set.count = 1;
213 	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
214 	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
215 	irq_set.start = 0;
216 
217 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
218 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
219 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
220 			rte_intr_fd_get(intr_handle));
221 		return -1;
222 	}
223 	return 0;
224 }
225 
226 /* enable MSI interrupts */
227 static int
228 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
229 	int len, ret;
230 	char irq_set_buf[IRQ_SET_BUF_LEN];
231 	struct vfio_irq_set *irq_set;
232 	int *fd_ptr, vfio_dev_fd;
233 
234 	len = sizeof(irq_set_buf);
235 
236 	irq_set = (struct vfio_irq_set *) irq_set_buf;
237 	irq_set->argsz = len;
238 	irq_set->count = 1;
239 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
240 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
241 	irq_set->start = 0;
242 	fd_ptr = (int *) &irq_set->data;
243 	*fd_ptr = rte_intr_fd_get(intr_handle);
244 
245 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
246 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
247 
248 	if (ret) {
249 		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
250 			rte_intr_fd_get(intr_handle));
251 		return -1;
252 	}
253 	return 0;
254 }
255 
256 /* disable MSI interrupts */
257 static int
258 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
259 	struct vfio_irq_set *irq_set;
260 	char irq_set_buf[IRQ_SET_BUF_LEN];
261 	int len, ret, vfio_dev_fd;
262 
263 	len = sizeof(struct vfio_irq_set);
264 
265 	irq_set = (struct vfio_irq_set *) irq_set_buf;
266 	irq_set->argsz = len;
267 	irq_set->count = 0;
268 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
269 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
270 	irq_set->start = 0;
271 
272 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
273 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
274 	if (ret)
275 		RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
276 			rte_intr_fd_get(intr_handle));
277 
278 	return ret;
279 }
280 
281 /* enable MSI-X interrupts */
282 static int
283 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
284 	int len, ret;
285 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
286 	struct vfio_irq_set *irq_set;
287 	int *fd_ptr, vfio_dev_fd, i;
288 
289 	len = sizeof(irq_set_buf);
290 
291 	irq_set = (struct vfio_irq_set *) irq_set_buf;
292 	irq_set->argsz = len;
293 	/* 0 < irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
294 	irq_set->count = rte_intr_max_intr_get(intr_handle) ?
295 		(rte_intr_max_intr_get(intr_handle) >
296 		 RTE_MAX_RXTX_INTR_VEC_ID + 1 ?	RTE_MAX_RXTX_INTR_VEC_ID + 1 :
297 		 rte_intr_max_intr_get(intr_handle)) : 1;
298 
299 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
300 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
301 	irq_set->start = 0;
302 	fd_ptr = (int *) &irq_set->data;
303 	/* INTR vector offset 0 is reserved for non-efd mappings */
304 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
305 	for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
306 		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
307 			rte_intr_efds_index_get(intr_handle, i);
308 	}
309 
310 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
311 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
312 
313 	if (ret) {
314 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
315 			rte_intr_fd_get(intr_handle));
316 		return -1;
317 	}
318 
319 	return 0;
320 }
321 
322 /* disable MSI-X interrupts */
323 static int
324 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
325 	struct vfio_irq_set *irq_set;
326 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
327 	int len, ret, vfio_dev_fd;
328 
329 	len = sizeof(struct vfio_irq_set);
330 
331 	irq_set = (struct vfio_irq_set *) irq_set_buf;
332 	irq_set->argsz = len;
333 	irq_set->count = 0;
334 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
335 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
336 	irq_set->start = 0;
337 
338 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
339 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
340 
341 	if (ret)
342 		RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
343 			rte_intr_fd_get(intr_handle));
344 
345 	return ret;
346 }
347 
348 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
349 /* enable req notifier */
350 static int
351 vfio_enable_req(const struct rte_intr_handle *intr_handle)
352 {
353 	int len, ret;
354 	char irq_set_buf[IRQ_SET_BUF_LEN];
355 	struct vfio_irq_set *irq_set;
356 	int *fd_ptr, vfio_dev_fd;
357 
358 	len = sizeof(irq_set_buf);
359 
360 	irq_set = (struct vfio_irq_set *) irq_set_buf;
361 	irq_set->argsz = len;
362 	irq_set->count = 1;
363 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364 			 VFIO_IRQ_SET_ACTION_TRIGGER;
365 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
366 	irq_set->start = 0;
367 	fd_ptr = (int *) &irq_set->data;
368 	*fd_ptr = rte_intr_fd_get(intr_handle);
369 
370 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
371 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
372 
373 	if (ret) {
374 		RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
375 			rte_intr_fd_get(intr_handle));
376 		return -1;
377 	}
378 
379 	return 0;
380 }
381 
382 /* disable req notifier */
383 static int
384 vfio_disable_req(const struct rte_intr_handle *intr_handle)
385 {
386 	struct vfio_irq_set *irq_set;
387 	char irq_set_buf[IRQ_SET_BUF_LEN];
388 	int len, ret, vfio_dev_fd;
389 
390 	len = sizeof(struct vfio_irq_set);
391 
392 	irq_set = (struct vfio_irq_set *) irq_set_buf;
393 	irq_set->argsz = len;
394 	irq_set->count = 0;
395 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
396 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
397 	irq_set->start = 0;
398 
399 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
400 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
401 
402 	if (ret)
403 		RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
404 			rte_intr_fd_get(intr_handle));
405 
406 	return ret;
407 }
408 #endif
409 #endif
410 
411 static int
412 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
413 {
414 	unsigned char command_high;
415 	int uio_cfg_fd;
416 
417 	/* use UIO config file descriptor for uio_pci_generic */
418 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
419 	if (pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
420 		RTE_LOG(ERR, EAL,
421 			"Error reading interrupts status for fd %d\n",
422 			uio_cfg_fd);
423 		return -1;
424 	}
425 	/* disable interrupts */
426 	command_high |= 0x4;
427 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
428 		RTE_LOG(ERR, EAL,
429 			"Error disabling interrupts for fd %d\n",
430 			uio_cfg_fd);
431 		return -1;
432 	}
433 
434 	return 0;
435 }
436 
437 static int
438 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
439 {
440 	unsigned char command_high;
441 	int uio_cfg_fd;
442 
443 	/* use UIO config file descriptor for uio_pci_generic */
444 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
445 	if (pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
446 		RTE_LOG(ERR, EAL,
447 			"Error reading interrupts status for fd %d\n",
448 			uio_cfg_fd);
449 		return -1;
450 	}
451 	/* enable interrupts */
452 	command_high &= ~0x4;
453 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
454 		RTE_LOG(ERR, EAL,
455 			"Error enabling interrupts for fd %d\n",
456 			uio_cfg_fd);
457 		return -1;
458 	}
459 
460 	return 0;
461 }
462 
463 static int
464 uio_intr_disable(const struct rte_intr_handle *intr_handle)
465 {
466 	const int value = 0;
467 
468 	if (write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
469 		RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
470 			rte_intr_fd_get(intr_handle), strerror(errno));
471 		return -1;
472 	}
473 	return 0;
474 }
475 
476 static int
477 uio_intr_enable(const struct rte_intr_handle *intr_handle)
478 {
479 	const int value = 1;
480 
481 	if (write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
482 		RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
483 			rte_intr_fd_get(intr_handle), strerror(errno));
484 		return -1;
485 	}
486 	return 0;
487 }
488 
489 int
490 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
491 			rte_intr_callback_fn cb, void *cb_arg)
492 {
493 	int ret, wake_thread;
494 	struct rte_intr_source *src;
495 	struct rte_intr_callback *callback;
496 
497 	wake_thread = 0;
498 
499 	/* first do parameter checking */
500 	if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
501 		RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
502 		return -EINVAL;
503 	}
504 
505 	/* allocate a new interrupt callback entity */
506 	callback = calloc(1, sizeof(*callback));
507 	if (callback == NULL) {
508 		RTE_LOG(ERR, EAL, "Can not allocate memory\n");
509 		return -ENOMEM;
510 	}
511 	callback->cb_fn = cb;
512 	callback->cb_arg = cb_arg;
513 	callback->pending_delete = 0;
514 	callback->ucb_fn = NULL;
515 
516 	rte_spinlock_lock(&intr_lock);
517 
518 	/* check if there is at least one callback registered for the fd */
519 	TAILQ_FOREACH(src, &intr_sources, next) {
520 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
521 			/* there were previously no callbacks for this fd */
522 			if (TAILQ_EMPTY(&src->callbacks))
523 				wake_thread = 1;
524 
525 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
526 			ret = 0;
527 			break;
528 		}
529 	}
530 
531 	/* no existing callbacks for this - add new source */
532 	if (src == NULL) {
533 		src = calloc(1, sizeof(*src));
534 		if (src == NULL) {
535 			RTE_LOG(ERR, EAL, "Can not allocate memory\n");
536 			ret = -ENOMEM;
537 			free(callback);
538 			callback = NULL;
539 		} else {
540 			src->intr_handle = rte_intr_instance_dup(intr_handle);
541 			if (src->intr_handle == NULL) {
542 				RTE_LOG(ERR, EAL, "Can not create intr instance\n");
543 				ret = -ENOMEM;
544 				free(callback);
545 				callback = NULL;
546 				free(src);
547 				src = NULL;
548 			} else {
549 				TAILQ_INIT(&src->callbacks);
550 				TAILQ_INSERT_TAIL(&(src->callbacks), callback,
551 						  next);
552 				TAILQ_INSERT_TAIL(&intr_sources, src, next);
553 				wake_thread = 1;
554 				ret = 0;
555 			}
556 		}
557 	}
558 
559 	rte_spinlock_unlock(&intr_lock);
560 
561 	/**
562 	 * check whether we need to notify, via the pipe fd that epoll_wait
563 	 * waits on, that the wait list must be rebuilt.
564 	 */
565 	if (wake_thread)
566 		if (write(intr_pipe.writefd, "1", 1) < 0)
567 			ret = -EPIPE;
568 
569 	rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
570 	return ret;
571 }
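
/*
 * Illustrative usage (hypothetical driver code, not part of this file):
 *
 *	ret = rte_intr_callback_register(dev->intr_handle,
 *					 my_interrupt_handler, dev);
 *
 * where my_interrupt_handler has the rte_intr_callback_fn signature
 * (void (*)(void *)) and dev is the driver's private context.
 */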
572 
573 int
574 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
575 				rte_intr_callback_fn cb_fn, void *cb_arg,
576 				rte_intr_unregister_callback_fn ucb_fn)
577 {
578 	int ret;
579 	struct rte_intr_source *src;
580 	struct rte_intr_callback *cb, *next;
581 
582 	/* do parameter checking first */
583 	if (rte_intr_fd_get(intr_handle) < 0) {
584 		RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
585 		return -EINVAL;
586 	}
587 
588 	rte_spinlock_lock(&intr_lock);
589 
590 	/* check if an interrupt source exists for the fd */
591 	TAILQ_FOREACH(src, &intr_sources, next) {
592 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
593 			break;
594 	}
595 
596 	/* No interrupt source registered for the fd */
597 	if (src == NULL) {
598 		ret = -ENOENT;
599 
600 	/* only usable if the source is active */
601 	} else if (src->active == 0) {
602 		ret = -EAGAIN;
603 
604 	} else {
605 		ret = 0;
606 
607 		/* walk through the callbacks and mark all that match. */
608 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
609 			next = TAILQ_NEXT(cb, next);
610 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
611 					cb->cb_arg == cb_arg)) {
612 				cb->pending_delete = 1;
613 				cb->ucb_fn = ucb_fn;
614 				ret++;
615 			}
616 		}
617 	}
618 
619 	rte_spinlock_unlock(&intr_lock);
620 
621 	return ret;
622 }
623 
624 int
625 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
626 			rte_intr_callback_fn cb_fn, void *cb_arg)
627 {
628 	int ret;
629 	struct rte_intr_source *src;
630 	struct rte_intr_callback *cb, *next;
631 
632 	/* do parameter checking first */
633 	if (rte_intr_fd_get(intr_handle) < 0) {
634 		RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
635 		return -EINVAL;
636 	}
637 
638 	rte_spinlock_lock(&intr_lock);
639 
640 	/* check if an interrupt source exists for the fd */
641 	TAILQ_FOREACH(src, &intr_sources, next)
642 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
643 			break;
644 
645 	/* No interrupt source registered for the fd */
646 	if (src == NULL) {
647 		ret = -ENOENT;
648 
649 	/* interrupt source has some active callbacks right now. */
650 	} else if (src->active != 0) {
651 		ret = -EAGAIN;
652 
653 	/* ok to remove. */
654 	} else {
655 		ret = 0;
656 
657 		/* walk through the callbacks and remove all that match. */
658 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
659 
660 			next = TAILQ_NEXT(cb, next);
661 
662 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
663 					cb->cb_arg == cb_arg)) {
664 				TAILQ_REMOVE(&src->callbacks, cb, next);
665 				free(cb);
666 				ret++;
667 			}
668 		}
669 
670 		/* all callbacks for that source are removed. */
671 		if (TAILQ_EMPTY(&src->callbacks)) {
672 			TAILQ_REMOVE(&intr_sources, src, next);
673 			rte_intr_instance_free(src->intr_handle);
674 			free(src);
675 		}
676 	}
677 
678 	rte_spinlock_unlock(&intr_lock);
679 
680 	/* notify the pipe fd that epoll_wait waits on, so the wait list is rebuilt */
681 	if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
682 		ret = -EPIPE;
683 	}
684 
685 	rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
686 		ret);
687 	return ret;
688 }
689 
690 int
691 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
692 			rte_intr_callback_fn cb_fn, void *cb_arg)
693 {
694 	int ret = 0;
695 
696 	while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
697 		rte_pause();
698 
699 	return ret;
700 }
701 
702 int
703 rte_intr_enable(const struct rte_intr_handle *intr_handle)
704 {
705 	int rc = 0, uio_cfg_fd;
706 
707 	if (intr_handle == NULL)
708 		return -1;
709 
710 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
711 		rc = 0;
712 		goto out;
713 	}
714 
715 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
716 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
717 		rc = -1;
718 		goto out;
719 	}
720 
721 	switch (rte_intr_type_get(intr_handle)) {
722 	/* write to the uio fd to enable the interrupt */
723 	case RTE_INTR_HANDLE_UIO:
724 		if (uio_intr_enable(intr_handle))
725 			rc = -1;
726 		break;
727 	case RTE_INTR_HANDLE_UIO_INTX:
728 		if (uio_intx_intr_enable(intr_handle))
729 			rc = -1;
730 		break;
731 	/* not used at this moment */
732 	case RTE_INTR_HANDLE_ALARM:
733 		rc = -1;
734 		break;
735 #ifdef VFIO_PRESENT
736 	case RTE_INTR_HANDLE_VFIO_MSIX:
737 		if (vfio_enable_msix(intr_handle))
738 			rc = -1;
739 		break;
740 	case RTE_INTR_HANDLE_VFIO_MSI:
741 		if (vfio_enable_msi(intr_handle))
742 			rc = -1;
743 		break;
744 	case RTE_INTR_HANDLE_VFIO_LEGACY:
745 		if (vfio_enable_intx(intr_handle))
746 			rc = -1;
747 		break;
748 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
749 	case RTE_INTR_HANDLE_VFIO_REQ:
750 		if (vfio_enable_req(intr_handle))
751 			rc = -1;
752 		break;
753 #endif
754 #endif
755 	/* not used at this moment */
756 	case RTE_INTR_HANDLE_DEV_EVENT:
757 		rc = -1;
758 		break;
759 	/* unknown handle type */
760 	default:
761 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
762 			rte_intr_fd_get(intr_handle));
763 		rc = -1;
764 		break;
765 	}
766 out:
767 	rte_eal_trace_intr_enable(intr_handle, rc);
768 	return rc;
769 }
770 
771 /**
772  * A PMD generally calls this function at the end of its IRQ callback.
773  * Internally, it unmasks the interrupt if possible.
774  *
775  * For INTx, unmasking is required as the interrupt is auto-masked prior to
776  * invoking the callback.
777  *
778  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
779  * auto-masked. In fact, for the VFIO_MSIX and VFIO_MSI interrupt handle
780  * types, this function is a no-op.
781  */
782 int
783 rte_intr_ack(const struct rte_intr_handle *intr_handle)
784 {
785 	int uio_cfg_fd;
786 
787 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
788 		return 0;
789 
790 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
791 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
792 		return -1;
793 
794 	switch (rte_intr_type_get(intr_handle)) {
795 	/* Both acking and enabling are the same for UIO */
796 	case RTE_INTR_HANDLE_UIO:
797 		if (uio_intr_enable(intr_handle))
798 			return -1;
799 		break;
800 	case RTE_INTR_HANDLE_UIO_INTX:
801 		if (uio_intx_intr_enable(intr_handle))
802 			return -1;
803 		break;
804 	/* not used at this moment */
805 	case RTE_INTR_HANDLE_ALARM:
806 		return -1;
807 #ifdef VFIO_PRESENT
808 	/* Unlike INTx, VFIO MSI* is implicitly acked, nothing to do */
809 	case RTE_INTR_HANDLE_VFIO_MSIX:
810 	case RTE_INTR_HANDLE_VFIO_MSI:
811 		return 0;
812 	case RTE_INTR_HANDLE_VFIO_LEGACY:
813 		if (vfio_ack_intx(intr_handle))
814 			return -1;
815 		break;
816 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
817 	case RTE_INTR_HANDLE_VFIO_REQ:
818 		return -1;
819 #endif
820 #endif
821 	/* not used at this moment */
822 	case RTE_INTR_HANDLE_DEV_EVENT:
823 		return -1;
824 	/* unknown handle type */
825 	default:
826 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
827 			rte_intr_fd_get(intr_handle));
828 		return -1;
829 	}
830 
831 	return 0;
832 }
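
/*
 * Illustrative usage (hypothetical driver code): an IRQ callback typically
 * re-arms a masked INTx line by acking at the end, e.g.
 *
 *	static void my_irq_handler(void *param)
 *	{
 *		struct my_dev *dev = param;	// hypothetical driver type
 *		// ... read and handle the device event ...
 *		rte_intr_ack(dev->intr_handle);
 *	}
 */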
833 
834 int
835 rte_intr_disable(const struct rte_intr_handle *intr_handle)
836 {
837 	int rc = 0, uio_cfg_fd;
838 
839 	if (intr_handle == NULL)
840 		return -1;
841 
842 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
843 		rc = 0;
844 		goto out;
845 	}
846 
847 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
848 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
849 		rc = -1;
850 		goto out;
851 	}
852 
853 	switch (rte_intr_type_get(intr_handle)) {
854 	/* write to the uio fd to disable the interrupt */
855 	case RTE_INTR_HANDLE_UIO:
856 		if (uio_intr_disable(intr_handle))
857 			rc = -1;
858 		break;
859 	case RTE_INTR_HANDLE_UIO_INTX:
860 		if (uio_intx_intr_disable(intr_handle))
861 			rc = -1;
862 		break;
863 	/* not used at this moment */
864 	case RTE_INTR_HANDLE_ALARM:
865 		rc = -1;
866 		break;
867 #ifdef VFIO_PRESENT
868 	case RTE_INTR_HANDLE_VFIO_MSIX:
869 		if (vfio_disable_msix(intr_handle))
870 			rc = -1;
871 		break;
872 	case RTE_INTR_HANDLE_VFIO_MSI:
873 		if (vfio_disable_msi(intr_handle))
874 			rc = -1;
875 		break;
876 	case RTE_INTR_HANDLE_VFIO_LEGACY:
877 		if (vfio_disable_intx(intr_handle))
878 			rc = -1;
879 		break;
880 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
881 	case RTE_INTR_HANDLE_VFIO_REQ:
882 		if (vfio_disable_req(intr_handle))
883 			rc = -1;
884 		break;
885 #endif
886 #endif
887 	/* not used at this moment */
888 	case RTE_INTR_HANDLE_DEV_EVENT:
889 		rc = -1;
890 		break;
891 	/* unknown handle type */
892 	default:
893 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
894 			rte_intr_fd_get(intr_handle));
895 		rc = -1;
896 		break;
897 	}
898 out:
899 	rte_eal_trace_intr_disable(intr_handle, rc);
900 	return rc;
901 }
902 
903 static int
904 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
905 {
906 	bool call = false;
907 	int n, bytes_read, rv;
908 	struct rte_intr_source *src;
909 	struct rte_intr_callback *cb, *next;
910 	union rte_intr_read_buffer buf;
911 	struct rte_intr_callback active_cb;
912 
913 	for (n = 0; n < nfds; n++) {
914 
915 		/**
916 		 * if the pipe fd is ready to read, return so that
917 		 * the wait list is rebuilt.
918 		 */
919 		if (events[n].data.fd == intr_pipe.readfd) {
920 			int r = read(intr_pipe.readfd, buf.charbuf,
921 					sizeof(buf.charbuf));
922 			RTE_SET_USED(r);
923 			return -1;
924 		}
925 		rte_spinlock_lock(&intr_lock);
926 		TAILQ_FOREACH(src, &intr_sources, next)
927 			if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
928 				break;
929 		if (src == NULL) {
930 			rte_spinlock_unlock(&intr_lock);
931 			continue;
932 		}
933 
934 		/* mark this interrupt source as active and release the lock. */
935 		src->active = 1;
936 		rte_spinlock_unlock(&intr_lock);
937 
938 		/* set the length to be read for different handle types */
939 		switch (rte_intr_type_get(src->intr_handle)) {
940 		case RTE_INTR_HANDLE_UIO:
941 		case RTE_INTR_HANDLE_UIO_INTX:
942 			bytes_read = sizeof(buf.uio_intr_count);
943 			break;
944 		case RTE_INTR_HANDLE_ALARM:
945 			bytes_read = sizeof(buf.timerfd_num);
946 			break;
947 #ifdef VFIO_PRESENT
948 		case RTE_INTR_HANDLE_VFIO_MSIX:
949 		case RTE_INTR_HANDLE_VFIO_MSI:
950 		case RTE_INTR_HANDLE_VFIO_LEGACY:
951 			bytes_read = sizeof(buf.vfio_intr_count);
952 			break;
953 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
954 		case RTE_INTR_HANDLE_VFIO_REQ:
955 			bytes_read = 0;
956 			call = true;
957 			break;
958 #endif
959 #endif
960 		case RTE_INTR_HANDLE_VDEV:
961 		case RTE_INTR_HANDLE_EXT:
962 			bytes_read = 0;
963 			call = true;
964 			break;
965 		case RTE_INTR_HANDLE_DEV_EVENT:
966 			bytes_read = 0;
967 			call = true;
968 			break;
969 		default:
970 			bytes_read = 1;
971 			break;
972 		}
973 
974 		if (bytes_read > 0) {
975 			/**
976 			 * read out to clear the ready-to-be-read flag
977 			 * for epoll_wait.
978 			 */
979 			bytes_read = read(events[n].data.fd, &buf, bytes_read);
980 			if (bytes_read < 0) {
981 				if (errno == EINTR || errno == EWOULDBLOCK)
982 					continue;
983 
984 				RTE_LOG(ERR, EAL, "Error reading from file "
985 					"descriptor %d: %s\n",
986 					events[n].data.fd,
987 					strerror(errno));
988 				/*
989 				 * The device is unplugged or buggy, remove
990 				 * it as an interrupt source and return to
991 				 * force the wait list to be rebuilt.
992 				 */
993 				rte_spinlock_lock(&intr_lock);
994 				TAILQ_REMOVE(&intr_sources, src, next);
995 				rte_spinlock_unlock(&intr_lock);
996 
997 				for (cb = TAILQ_FIRST(&src->callbacks); cb;
998 							cb = next) {
999 					next = TAILQ_NEXT(cb, next);
1000 					TAILQ_REMOVE(&src->callbacks, cb, next);
1001 					free(cb);
1002 				}
1003 				rte_intr_instance_free(src->intr_handle);
1004 				free(src);
1005 				return -1;
1006 			} else if (bytes_read == 0)
1007 				RTE_LOG(ERR, EAL, "Read nothing from file "
1008 					"descriptor %d\n", events[n].data.fd);
1009 			else
1010 				call = true;
1011 		}
1012 
1013 		/* grab the lock again to call callbacks and update status. */
1014 		rte_spinlock_lock(&intr_lock);
1015 
1016 		if (call) {
1017 
1018 			/* Finally, call all callbacks. */
1019 			TAILQ_FOREACH(cb, &src->callbacks, next) {
1020 
1021 				/* make a copy and unlock. */
1022 				active_cb = *cb;
1023 				rte_spinlock_unlock(&intr_lock);
1024 
1025 				/* call the actual callback */
1026 				active_cb.cb_fn(active_cb.cb_arg);
1027 
1028 				/* get the lock back. */
1029 				rte_spinlock_lock(&intr_lock);
1030 			}
1031 		}
1032 		/* we are done with that interrupt source, release it. */
1033 		src->active = 0;
1034 
1035 		rv = 0;
1036 
1037 		/* check if any callbacks are supposed to be removed */
1038 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1039 			next = TAILQ_NEXT(cb, next);
1040 			if (cb->pending_delete) {
1041 				TAILQ_REMOVE(&src->callbacks, cb, next);
1042 				if (cb->ucb_fn)
1043 					cb->ucb_fn(src->intr_handle, cb->cb_arg);
1044 				free(cb);
1045 				rv++;
1046 			}
1047 		}
1048 
1049 		/* all callbacks for that source are removed. */
1050 		if (TAILQ_EMPTY(&src->callbacks)) {
1051 			TAILQ_REMOVE(&intr_sources, src, next);
1052 			rte_intr_instance_free(src->intr_handle);
1053 			free(src);
1054 		}
1055 
1056 		/* notify the pipe fd that epoll_wait waits on, so the wait list is rebuilt */
1057 		if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1058 			rte_spinlock_unlock(&intr_lock);
1059 			return -EPIPE;
1060 		}
1061 
1062 		rte_spinlock_unlock(&intr_lock);
1063 	}
1064 
1065 	return 0;
1066 }
1067 
1068 /**
1069  * It handles all the interrupts.
1070  *
1071  * @param pfd
1072  *  epoll file descriptor.
1073  * @param totalfds
1074  *  The number of file descriptors added in epoll.
1075  *
1076  * @return
1077  *  void
1078  */
1079 static void
1080 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1081 {
1082 	struct epoll_event events[totalfds];
1083 	int nfds = 0;
1084 
1085 	for (;;) {
1086 		nfds = epoll_wait(pfd, events, totalfds,
1087 			EAL_INTR_EPOLL_WAIT_FOREVER);
1088 		/* epoll_wait fail */
1089 		if (nfds < 0) {
1090 			if (errno == EINTR)
1091 				continue;
1092 			RTE_LOG(ERR, EAL,
1093 				"epoll_wait returns with fail\n");
1094 			return;
1095 		}
1096 		/* epoll_wait timeout, which will never happen here */
1097 		else if (nfds == 0)
1098 			continue;
1099 		/* epoll_wait has at least one fd ready to read */
1100 		if (eal_intr_process_interrupts(events, nfds) < 0)
1101 			return;
1102 	}
1103 }
1104 
1105 /**
1106  * It builds/rebuilds the epoll file descriptor with all the
1107  * file descriptors being waited on, then handles the interrupts.
1108  *
1109  * @param arg
1110  *  pointer. (unused)
1111  *
1112  * @return
1113  *  never returns
1114  */
1115 static __rte_noreturn void *
1116 eal_intr_thread_main(__rte_unused void *arg)
1117 {
1118 	/* host thread, never breaks out */
1119 	for (;;) {
1120 		/* build up the epoll fd with all descriptors we are to
1121 		 * wait on then pass it to the handle_interrupts function
1122 		 */
1123 		static struct epoll_event pipe_event = {
1124 			.events = EPOLLIN | EPOLLPRI,
1125 		};
1126 		struct rte_intr_source *src;
1127 		unsigned numfds = 0;
1128 
1129 		/* create epoll fd */
1130 		int pfd = epoll_create(1);
1131 		if (pfd < 0)
1132 			rte_panic("Cannot create epoll instance\n");
1133 
1134 		pipe_event.data.fd = intr_pipe.readfd;
1135 		/**
1136 		 * add the pipe fd to the wait list; writes to this pipe
1137 		 * signal that the wait list must be rebuilt.
1138 		 */
1139 		if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1140 						&pipe_event) < 0) {
1141 			rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1142 					intr_pipe.readfd, strerror(errno));
1143 		}
1144 		numfds++;
1145 
1146 		rte_spinlock_lock(&intr_lock);
1147 
1148 		TAILQ_FOREACH(src, &intr_sources, next) {
1149 			struct epoll_event ev;
1150 
1151 			if (src->callbacks.tqh_first == NULL)
1152 				continue; /* skip those with no callbacks */
1153 			memset(&ev, 0, sizeof(ev));
1154 			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1155 			ev.data.fd = rte_intr_fd_get(src->intr_handle);
1156 
1157 			/**
1158 			 * add each registered device file descriptor
1159 			 * to the wait list.
1160 			 */
1161 			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1162 					rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1163 				rte_panic("Error adding fd %d epoll_ctl, %s\n",
1164 					rte_intr_fd_get(src->intr_handle),
1165 					strerror(errno));
1166 			}
1167 			else
1168 				numfds++;
1169 		}
1170 		rte_spinlock_unlock(&intr_lock);
1171 		/* serve the interrupt */
1172 		eal_intr_handle_interrupts(pfd, numfds);
1173 
1174 		/**
1175 		 * when we return, we need to rebuild the
1176 		 * list of fds to monitor.
1177 		 */
1178 		close(pfd);
1179 	}
1180 }
1181 
1182 int
1183 rte_eal_intr_init(void)
1184 {
1185 	int ret = 0;
1186 
1187 	/* init the global interrupt source head */
1188 	TAILQ_INIT(&intr_sources);
1189 
1190 	/**
1191 	 * create a pipe that epoll waits on; writing to it notifies
1192 	 * the interrupt thread to rebuild the epoll wait list.
1193 	 */
1194 	if (pipe(intr_pipe.pipefd) < 0) {
1195 		rte_errno = errno;
1196 		return -1;
1197 	}
1198 
1199 	/* create the host thread to wait/handle the interrupt */
1200 	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1201 			eal_intr_thread_main, NULL);
1202 	if (ret != 0) {
1203 		rte_errno = -ret;
1204 		RTE_LOG(ERR, EAL,
1205 			"Failed to create thread for interrupt handling\n");
1206 	}
1207 
1208 	return ret;
1209 }
1210 
1211 static void
1212 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1213 {
1214 	union rte_intr_read_buffer buf;
1215 	int bytes_read = 0;
1216 	int nbytes;
1217 
1218 	switch (rte_intr_type_get(intr_handle)) {
1219 	case RTE_INTR_HANDLE_UIO:
1220 	case RTE_INTR_HANDLE_UIO_INTX:
1221 		bytes_read = sizeof(buf.uio_intr_count);
1222 		break;
1223 #ifdef VFIO_PRESENT
1224 	case RTE_INTR_HANDLE_VFIO_MSIX:
1225 	case RTE_INTR_HANDLE_VFIO_MSI:
1226 	case RTE_INTR_HANDLE_VFIO_LEGACY:
1227 		bytes_read = sizeof(buf.vfio_intr_count);
1228 		break;
1229 #endif
1230 	case RTE_INTR_HANDLE_VDEV:
1231 		bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1232 		/* For vdev, number of bytes to read is set by driver */
1233 		break;
1234 	case RTE_INTR_HANDLE_EXT:
1235 		return;
1236 	default:
1237 		bytes_read = 1;
1238 		RTE_LOG(INFO, EAL, "unexpected intr type\n");
1239 		break;
1240 	}
1241 
1242 	/**
1243 	 * read out to clear the ready-to-be-read flag
1244 	 * for epoll_wait.
1245 	 */
1246 	if (bytes_read == 0)
1247 		return;
1248 	do {
1249 		nbytes = read(fd, &buf, bytes_read);
1250 		if (nbytes < 0) {
1251 			if (errno == EINTR || errno == EWOULDBLOCK ||
1252 			    errno == EAGAIN)
1253 				continue;
1254 			RTE_LOG(ERR, EAL,
1255 				"Error reading from fd %d: %s\n",
1256 				fd, strerror(errno));
1257 		} else if (nbytes == 0)
1258 			RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1259 		return;
1260 	} while (1);
1261 }
1262 
1263 static int
1264 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1265 			struct rte_epoll_event *events)
1266 {
1267 	unsigned int i, count = 0;
1268 	struct rte_epoll_event *rev;
1269 	uint32_t valid_status;
1270 
1271 	for (i = 0; i < n; i++) {
1272 		rev = evs[i].data.ptr;
1273 		valid_status = RTE_EPOLL_VALID;
1274 		/* ACQUIRE memory ordering here pairs with RELEASE
1275 		 * ordering below acting as a lock to synchronize
1276 		 * the event data updating.
1277 		 */
1278 		if (!rev || !__atomic_compare_exchange_n(&rev->status,
1279 				    &valid_status, RTE_EPOLL_EXEC, 0,
1280 				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1281 			continue;
1282 
1283 		events[count].status        = RTE_EPOLL_VALID;
1284 		events[count].fd            = rev->fd;
1285 		events[count].epfd          = rev->epfd;
1286 		events[count].epdata.event  = evs[i].events;
1287 		events[count].epdata.data   = rev->epdata.data;
1288 		if (rev->epdata.cb_fun)
1289 			rev->epdata.cb_fun(rev->fd,
1290 					   rev->epdata.cb_arg);
1291 
1292 		/* the status update should be observed after
1293 		 * the other fields change.
1294 		 */
1295 		__atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1296 				__ATOMIC_RELEASE);
1297 		count++;
1298 	}
1299 	return count;
1300 }
1301 
1302 static inline int
1303 eal_init_tls_epfd(void)
1304 {
1305 	int pfd = epoll_create(255);
1306 
1307 	if (pfd < 0) {
1308 		RTE_LOG(ERR, EAL,
1309 			"Cannot create epoll instance\n");
1310 		return -1;
1311 	}
1312 	return pfd;
1313 }
1314 
1315 int
1316 rte_intr_tls_epfd(void)
1317 {
1318 	if (RTE_PER_LCORE(_epfd) == -1)
1319 		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1320 
1321 	return RTE_PER_LCORE(_epfd);
1322 }
1323 
1324 static int
1325 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1326 	       int maxevents, int timeout, bool interruptible)
1327 {
1328 	struct epoll_event evs[maxevents];
1329 	int rc;
1330 
1331 	if (!events) {
1332 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1333 		return -1;
1334 	}
1335 
1336 	/* using per thread epoll fd */
1337 	if (epfd == RTE_EPOLL_PER_THREAD)
1338 		epfd = rte_intr_tls_epfd();
1339 
1340 	while (1) {
1341 		rc = epoll_wait(epfd, evs, maxevents, timeout);
1342 		if (likely(rc > 0)) {
1343 			/* epoll_wait has at least one fd ready to read */
1344 			rc = eal_epoll_process_event(evs, rc, events);
1345 			break;
1346 		} else if (rc < 0) {
1347 			if (errno == EINTR) {
1348 				if (interruptible)
1349 					return -1;
1350 				else
1351 					continue;
1352 			}
1353 			/* epoll_wait fail */
1354 			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1355 				strerror(errno));
1356 			rc = -1;
1357 			break;
1358 		} else {
1359 			/* rc == 0, epoll_wait timed out */
1360 			break;
1361 		}
1362 	}
1363 
1364 	return rc;
1365 }
1366 
1367 int
1368 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1369 	       int maxevents, int timeout)
1370 {
1371 	return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1372 }
1373 
1374 int
1375 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1376 			     int maxevents, int timeout)
1377 {
1378 	return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1379 }
1380 
1381 static inline void
1382 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1383 {
1384 	uint32_t valid_status = RTE_EPOLL_VALID;
1385 
1386 	while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1387 		    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1388 		while (__atomic_load_n(&ev->status,
1389 				__ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1390 			rte_pause();
1391 		valid_status = RTE_EPOLL_VALID;
1392 	}
1393 	memset(&ev->epdata, 0, sizeof(ev->epdata));
1394 	ev->fd = -1;
1395 	ev->epfd = -1;
1396 }
1397 
1398 int
1399 rte_epoll_ctl(int epfd, int op, int fd,
1400 	      struct rte_epoll_event *event)
1401 {
1402 	struct epoll_event ev;
1403 
1404 	if (!event) {
1405 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1406 		return -1;
1407 	}
1408 
1409 	/* using per thread epoll fd */
1410 	if (epfd == RTE_EPOLL_PER_THREAD)
1411 		epfd = rte_intr_tls_epfd();
1412 
1413 	if (op == EPOLL_CTL_ADD) {
1414 		__atomic_store_n(&event->status, RTE_EPOLL_VALID,
1415 				__ATOMIC_RELAXED);
1416 		event->fd = fd;  /* ignore fd in event */
1417 		event->epfd = epfd;
1418 		ev.data.ptr = (void *)event;
1419 	}
1420 
1421 	ev.events = event->epdata.event;
1422 	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1423 		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1424 			op, fd, strerror(errno));
1425 		if (op == EPOLL_CTL_ADD)
1426 			/* rollback status when CTL_ADD fails */
1427 			__atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1428 					__ATOMIC_RELAXED);
1429 		return -1;
1430 	}
1431 
1432 	if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1433 			__ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1434 		eal_epoll_data_safe_free(event);
1435 
1436 	return 0;
1437 }
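
/*
 * Illustrative usage (hypothetical application code): watch an arbitrary fd
 * on the per-thread epoll instance, with a callback run from rte_epoll_wait():
 *
 *	static struct rte_epoll_event ev = {
 *		.epdata.event = EPOLLIN | EPOLLET,
 *		.epdata.cb_fun = my_event_cb,	// hypothetical rte_intr_event_cb_t
 *	};
 *	rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, my_fd, &ev);
 *
 * my_event_cb and my_fd are placeholders for the application's own symbols.
 */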
1438 
1439 int
1440 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1441 		int op, unsigned int vec, void *data)
1442 {
1443 	struct rte_epoll_event *rev;
1444 	struct rte_epoll_data *epdata;
1445 	int epfd_op;
1446 	unsigned int efd_idx;
1447 	int rc = 0;
1448 
1449 	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1450 		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1451 
1452 	if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1453 			efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1454 		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1455 		return -EPERM;
1456 	}
1457 
1458 	switch (op) {
1459 	case RTE_INTR_EVENT_ADD:
1460 		epfd_op = EPOLL_CTL_ADD;
1461 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1462 		if (__atomic_load_n(&rev->status,
1463 				__ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1464 			RTE_LOG(INFO, EAL, "Event already been added.\n");
1465 			return -EEXIST;
1466 		}
1467 
1468 		/* attach to intr vector fd */
1469 		epdata = &rev->epdata;
1470 		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1471 		epdata->data   = data;
1472 		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1473 		epdata->cb_arg = (void *)intr_handle;
1474 		rc = rte_epoll_ctl(epfd, epfd_op,
1475 			rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1476 		if (!rc)
1477 			RTE_LOG(DEBUG, EAL,
1478 				"efd %d associated with vec %d added on epfd %d"
1479 				"\n", rev->fd, vec, epfd);
1480 		else
1481 			rc = -EPERM;
1482 		break;
1483 	case RTE_INTR_EVENT_DEL:
1484 		epfd_op = EPOLL_CTL_DEL;
1485 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1486 		if (__atomic_load_n(&rev->status,
1487 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1488 			RTE_LOG(INFO, EAL, "Event does not exist.\n");
1489 			return -EPERM;
1490 		}
1491 
1492 		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1493 		if (rc)
1494 			rc = -EPERM;
1495 		break;
1496 	default:
1497 		RTE_LOG(ERR, EAL, "event op type mismatch\n");
1498 		rc = -EPERM;
1499 	}
1500 
1501 	return rc;
1502 }
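
/*
 * Illustrative usage (hypothetical application code): arm an Rx queue vector
 * on the per-thread epoll fd, then sleep until the queue interrupt fires:
 *
 *	rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD,
 *			RTE_INTR_VEC_RXTX_OFFSET + queue_id, NULL);
 *	...
 *	struct rte_epoll_event event;
 *	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, timeout_ms);
 *
 * queue_id and timeout_ms are placeholders; in practice ethdev wraps this
 * path via rte_eth_dev_rx_intr_ctl_q().
 */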
1503 
1504 void
1505 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1506 {
1507 	uint32_t i;
1508 	struct rte_epoll_event *rev;
1509 
1510 	for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1511 		rev = rte_intr_elist_index_get(intr_handle, i);
1512 		if (__atomic_load_n(&rev->status,
1513 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1514 			continue;
1515 		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1516 			/* force free if the entry is valid */
1517 			eal_epoll_data_safe_free(rev);
1518 		}
1519 	}
1520 }
1521 
1522 int
1523 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1524 {
1525 	uint32_t i;
1526 	int fd;
1527 	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1528 
1529 	assert(nb_efd != 0);
1530 
1531 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1532 		for (i = 0; i < n; i++) {
1533 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1534 			if (fd < 0) {
1535 				RTE_LOG(ERR, EAL,
1536 					"can't setup eventfd, error %i (%s)\n",
1537 					errno, strerror(errno));
1538 				return -errno;
1539 			}
1540 
1541 			if (rte_intr_efds_index_set(intr_handle, i, fd))
1542 				return -rte_errno;
1543 		}
1544 
1545 		if (rte_intr_nb_efd_set(intr_handle, n))
1546 			return -rte_errno;
1547 
1548 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1549 			return -rte_errno;
1550 	} else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1551 		/* only check; initialization is done in the vdev driver. */
1552 		if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1553 		    sizeof(union rte_intr_read_buffer)) {
1554 			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1555 			return -EINVAL;
1556 		}
1557 	} else {
1558 		if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1559 			return -rte_errno;
1560 		if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1561 			return -rte_errno;
1562 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1563 			return -rte_errno;
1564 	}
1565 
1566 	return 0;
1567 }
1568 
1569 void
1570 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1571 {
1572 	uint32_t i;
1573 
1574 	rte_intr_free_epoll_fd(intr_handle);
1575 	if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1576 		for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1577 			close(rte_intr_efds_index_get(intr_handle, i));
1578 	}
1579 	rte_intr_nb_efd_set(intr_handle, 0);
1580 	rte_intr_max_intr_set(intr_handle, 0);
1581 }
1582 
1583 int
1584 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1585 {
1586 	return !(!rte_intr_nb_efd_get(intr_handle));
1587 }
1588 
1589 int
1590 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1591 {
1592 	if (!rte_intr_dp_is_en(intr_handle))
1593 		return 1;
1594 	else
1595 		return !!(rte_intr_max_intr_get(intr_handle) -
1596 				rte_intr_nb_efd_get(intr_handle));
1597 }
1598 
1599 int
1600 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1601 {
1602 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1603 		return 1;
1604 
1605 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1606 		return 1;
1607 
1608 	return 0;
1609 }
1610 
1611 int rte_thread_is_intr(void)
1612 {
1613 	return pthread_equal(intr_thread, pthread_self());
1614 }
1615