xref: /dpdk/lib/eal/linux/eal_interrupts.c (revision f8dbaebbf1c9efcbb2e2354b341ed62175466a57)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21 
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37 
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41 
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44 
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46 
47 /**
48  * Union of pipe fds: readfd/writefd alias pipefd[0]/pipefd[1].
49  */
50 union intr_pipefds{
51 	struct {
52 		int pipefd[2];
53 	};
54 	struct {
55 		int readfd;
56 		int writefd;
57 	};
58 };
59 
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64 	int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66 	uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68 	uint64_t timerfd_num;            /* for timerfd */
69 	char charbuf[16];                /* for others */
70 };
71 
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74 
75 struct rte_intr_callback {
76 	TAILQ_ENTRY(rte_intr_callback) next;
77 	rte_intr_callback_fn cb_fn;  /**< callback address */
78 	void *cb_arg;                /**< parameter for callback */
79 	uint8_t pending_delete;      /**< delete after callback is called */
80 	rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82 
83 struct rte_intr_source {
84 	TAILQ_ENTRY(rte_intr_source) next;
85 	struct rte_intr_handle *intr_handle; /**< interrupt handle */
86 	struct rte_intr_cb_list callbacks;  /**< user callbacks */
87 	uint32_t active;
88 };
89 
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92 
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95 
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98 
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101 
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104 
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108 			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
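
/*
 * VFIO_DEVICE_SET_IRQS takes a variable-length struct vfio_irq_set: with
 * VFIO_IRQ_SET_DATA_EVENTFD, the trailing data[] carries one eventfd per
 * interrupt vector.  The buffers above are therefore sized as the header
 * plus one fd (INTx/MSI/req), or the header plus up to
 * RTE_MAX_RXTX_INTR_VEC_ID + 1 fds (MSI-X).
 */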
109 
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113 	struct vfio_irq_set *irq_set;
114 	char irq_set_buf[IRQ_SET_BUF_LEN];
115 	int len, ret, vfio_dev_fd;
116 	int *fd_ptr;
117 
118 	len = sizeof(irq_set_buf);
119 
120 	/* enable INTx */
121 	irq_set = (struct vfio_irq_set *) irq_set_buf;
122 	irq_set->argsz = len;
123 	irq_set->count = 1;
124 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126 	irq_set->start = 0;
127 	fd_ptr = (int *) &irq_set->data;
128 	*fd_ptr = rte_intr_fd_get(intr_handle);
129 
130 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
131 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
132 
133 	if (ret) {
134 		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
135 			rte_intr_fd_get(intr_handle));
136 		return -1;
137 	}
138 
139 	/* unmask INTx after enabling */
140 	memset(irq_set, 0, len);
141 	len = sizeof(struct vfio_irq_set);
142 	irq_set->argsz = len;
143 	irq_set->count = 1;
144 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
145 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
146 	irq_set->start = 0;
147 
148 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
149 
150 	if (ret) {
151 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
152 			rte_intr_fd_get(intr_handle));
153 		return -1;
154 	}
155 	return 0;
156 }
157 
158 /* disable legacy (INTx) interrupts */
159 static int
160 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
161 	struct vfio_irq_set *irq_set;
162 	char irq_set_buf[IRQ_SET_BUF_LEN];
163 	int len, ret, vfio_dev_fd;
164 
165 	len = sizeof(struct vfio_irq_set);
166 
167 	/* mask interrupts before disabling */
168 	irq_set = (struct vfio_irq_set *) irq_set_buf;
169 	irq_set->argsz = len;
170 	irq_set->count = 1;
171 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
172 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
173 	irq_set->start = 0;
174 
175 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
176 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
177 
178 	if (ret) {
179 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
180 			rte_intr_fd_get(intr_handle));
181 		return -1;
182 	}
183 
184 	/* disable INTx */
185 	memset(irq_set, 0, len);
186 	irq_set->argsz = len;
187 	irq_set->count = 0;
188 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
189 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
190 	irq_set->start = 0;
191 
192 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
193 
194 	if (ret) {
195 		RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
196 			rte_intr_fd_get(intr_handle));
197 		return -1;
198 	}
199 	return 0;
200 }
201 
202 /* unmask/ack legacy (INTx) interrupts */
203 static int
204 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
205 {
206 	struct vfio_irq_set irq_set;
207 	int vfio_dev_fd;
208 
209 	/* unmask INTx */
210 	memset(&irq_set, 0, sizeof(irq_set));
211 	irq_set.argsz = sizeof(irq_set);
212 	irq_set.count = 1;
213 	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
214 	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
215 	irq_set.start = 0;
216 
217 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
218 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
219 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
220 			rte_intr_fd_get(intr_handle));
221 		return -1;
222 	}
223 	return 0;
224 }
225 
226 /* enable MSI interrupts */
227 static int
228 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
229 	int len, ret;
230 	char irq_set_buf[IRQ_SET_BUF_LEN];
231 	struct vfio_irq_set *irq_set;
232 	int *fd_ptr, vfio_dev_fd;
233 
234 	len = sizeof(irq_set_buf);
235 
236 	irq_set = (struct vfio_irq_set *) irq_set_buf;
237 	irq_set->argsz = len;
238 	irq_set->count = 1;
239 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
240 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
241 	irq_set->start = 0;
242 	fd_ptr = (int *) &irq_set->data;
243 	*fd_ptr = rte_intr_fd_get(intr_handle);
244 
245 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
246 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
247 
248 	if (ret) {
249 		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
250 			rte_intr_fd_get(intr_handle));
251 		return -1;
252 	}
253 	return 0;
254 }
255 
256 /* disable MSI interrupts */
257 static int
258 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
259 	struct vfio_irq_set *irq_set;
260 	char irq_set_buf[IRQ_SET_BUF_LEN];
261 	int len, ret, vfio_dev_fd;
262 
263 	len = sizeof(struct vfio_irq_set);
264 
265 	irq_set = (struct vfio_irq_set *) irq_set_buf;
266 	irq_set->argsz = len;
267 	irq_set->count = 0;
268 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
269 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
270 	irq_set->start = 0;
271 
272 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
273 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
274 	if (ret)
275 		RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
276 			rte_intr_fd_get(intr_handle));
277 
278 	return ret;
279 }
280 
281 /* enable MSI-X interrupts */
282 static int
283 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
284 	int len, ret;
285 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
286 	struct vfio_irq_set *irq_set;
287 	int *fd_ptr, vfio_dev_fd, i;
288 
289 	len = sizeof(irq_set_buf);
290 
291 	irq_set = (struct vfio_irq_set *) irq_set_buf;
292 	irq_set->argsz = len;
293 	/* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
294 	irq_set->count = rte_intr_max_intr_get(intr_handle) ?
295 		(rte_intr_max_intr_get(intr_handle) >
296 		 RTE_MAX_RXTX_INTR_VEC_ID + 1 ?	RTE_MAX_RXTX_INTR_VEC_ID + 1 :
297 		 rte_intr_max_intr_get(intr_handle)) : 1;
298 
299 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
300 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
301 	irq_set->start = 0;
302 	fd_ptr = (int *) &irq_set->data;
303 	/* INTR vector offset 0 is reserved for the non-efd (misc) interrupt fd */
304 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
305 	for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
306 		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
307 			rte_intr_efds_index_get(intr_handle, i);
308 	}
309 
310 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
311 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
312 
313 	if (ret) {
314 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
315 			rte_intr_fd_get(intr_handle));
316 		return -1;
317 	}
318 
319 	return 0;
320 }
321 
322 /* disable MSI-X interrupts */
323 static int
324 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
325 	struct vfio_irq_set *irq_set;
326 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
327 	int len, ret, vfio_dev_fd;
328 
329 	len = sizeof(struct vfio_irq_set);
330 
331 	irq_set = (struct vfio_irq_set *) irq_set_buf;
332 	irq_set->argsz = len;
333 	irq_set->count = 0;
334 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
335 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
336 	irq_set->start = 0;
337 
338 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
339 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
340 
341 	if (ret)
342 		RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
343 			rte_intr_fd_get(intr_handle));
344 
345 	return ret;
346 }
347 
348 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
349 /* enable req notifier */
350 static int
351 vfio_enable_req(const struct rte_intr_handle *intr_handle)
352 {
353 	int len, ret;
354 	char irq_set_buf[IRQ_SET_BUF_LEN];
355 	struct vfio_irq_set *irq_set;
356 	int *fd_ptr, vfio_dev_fd;
357 
358 	len = sizeof(irq_set_buf);
359 
360 	irq_set = (struct vfio_irq_set *) irq_set_buf;
361 	irq_set->argsz = len;
362 	irq_set->count = 1;
363 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364 			 VFIO_IRQ_SET_ACTION_TRIGGER;
365 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
366 	irq_set->start = 0;
367 	fd_ptr = (int *) &irq_set->data;
368 	*fd_ptr = rte_intr_fd_get(intr_handle);
369 
370 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
371 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
372 
373 	if (ret) {
374 		RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
375 			rte_intr_fd_get(intr_handle));
376 		return -1;
377 	}
378 
379 	return 0;
380 }
381 
382 /* disable req notifier */
383 static int
384 vfio_disable_req(const struct rte_intr_handle *intr_handle)
385 {
386 	struct vfio_irq_set *irq_set;
387 	char irq_set_buf[IRQ_SET_BUF_LEN];
388 	int len, ret, vfio_dev_fd;
389 
390 	len = sizeof(struct vfio_irq_set);
391 
392 	irq_set = (struct vfio_irq_set *) irq_set_buf;
393 	irq_set->argsz = len;
394 	irq_set->count = 0;
395 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
396 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
397 	irq_set->start = 0;
398 
399 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
400 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
401 
402 	if (ret)
403 		RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
404 			rte_intr_fd_get(intr_handle));
405 
406 	return ret;
407 }
408 #endif
409 #endif
410 
411 static int
412 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
413 {
414 	unsigned char command_high;
415 	int uio_cfg_fd;
416 
417 	/* use UIO config file descriptor for uio_pci_generic */
418 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
419 	if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
420 		RTE_LOG(ERR, EAL,
421 			"Error reading interrupts status for fd %d\n",
422 			uio_cfg_fd);
423 		return -1;
424 	}
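	/* byte 5 of PCI config space is the upper half of the command register;
	 * bit 0x4 of that byte is the INTx Disable bit (command register bit 10)
	 */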
425 	/* disable interrupts */
426 	command_high |= 0x4;
427 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
428 		RTE_LOG(ERR, EAL,
429 			"Error disabling interrupts for fd %d\n",
430 			uio_cfg_fd);
431 		return -1;
432 	}
433 
434 	return 0;
435 }
436 
437 static int
438 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
439 {
440 	unsigned char command_high;
441 	int uio_cfg_fd;
442 
443 	/* use UIO config file descriptor for uio_pci_generic */
444 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
445 	if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
446 		RTE_LOG(ERR, EAL,
447 			"Error reading interrupts status for fd %d\n",
448 			uio_cfg_fd);
449 		return -1;
450 	}
451 	/* enable interrupts */
452 	command_high &= ~0x4;
453 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
454 		RTE_LOG(ERR, EAL,
455 			"Error enabling interrupts for fd %d\n",
456 			uio_cfg_fd);
457 		return -1;
458 	}
459 
460 	return 0;
461 }
462 
463 static int
464 uio_intr_disable(const struct rte_intr_handle *intr_handle)
465 {
466 	const int value = 0;
467 
468 	if (rte_intr_fd_get(intr_handle) < 0 ||
469 	    write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
470 		RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
471 			rte_intr_fd_get(intr_handle), strerror(errno));
472 		return -1;
473 	}
474 	return 0;
475 }
476 
477 static int
478 uio_intr_enable(const struct rte_intr_handle *intr_handle)
479 {
480 	const int value = 1;
481 
482 	if (rte_intr_fd_get(intr_handle) < 0 ||
483 	    write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
484 		RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
485 			rte_intr_fd_get(intr_handle), strerror(errno));
486 		return -1;
487 	}
488 	return 0;
489 }
490 
491 int
492 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
493 			rte_intr_callback_fn cb, void *cb_arg)
494 {
495 	int ret, wake_thread;
496 	struct rte_intr_source *src;
497 	struct rte_intr_callback *callback;
498 
499 	wake_thread = 0;
500 
501 	/* first do parameter checking */
502 	if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
503 		RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
504 		return -EINVAL;
505 	}
506 
507 	/* allocate a new interrupt callback entity */
508 	callback = calloc(1, sizeof(*callback));
509 	if (callback == NULL) {
510 		RTE_LOG(ERR, EAL, "Can not allocate memory\n");
511 		return -ENOMEM;
512 	}
513 	callback->cb_fn = cb;
514 	callback->cb_arg = cb_arg;
515 	callback->pending_delete = 0;
516 	callback->ucb_fn = NULL;
517 
518 	rte_spinlock_lock(&intr_lock);
519 
520 	/* check if an interrupt source already exists for this fd */
521 	TAILQ_FOREACH(src, &intr_sources, next) {
522 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
523 			/* the source had no callbacks, so its fd is not yet waited on */
524 			if (TAILQ_EMPTY(&src->callbacks))
525 				wake_thread = 1;
526 
527 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528 			ret = 0;
529 			break;
530 		}
531 	}
532 
533 	/* no existing callbacks for this - add new source */
534 	if (src == NULL) {
535 		src = calloc(1, sizeof(*src));
536 		if (src == NULL) {
537 			RTE_LOG(ERR, EAL, "Can not allocate memory\n");
538 			ret = -ENOMEM;
539 			free(callback);
540 			callback = NULL;
541 		} else {
542 			src->intr_handle = rte_intr_instance_dup(intr_handle);
543 			if (src->intr_handle == NULL) {
544 				RTE_LOG(ERR, EAL, "Can not create intr instance\n");
545 				ret = -ENOMEM;
546 				free(callback);
547 				callback = NULL;
548 				free(src);
549 				src = NULL;
550 			} else {
551 				TAILQ_INIT(&src->callbacks);
552 				TAILQ_INSERT_TAIL(&(src->callbacks), callback,
553 						  next);
554 				TAILQ_INSERT_TAIL(&intr_sources, src, next);
555 				wake_thread = 1;
556 				ret = 0;
557 			}
558 		}
559 	}
560 
561 	rte_spinlock_unlock(&intr_lock);
562 
563 	/**
564 	 * check whether we need to notify the pipe fd that epoll_wait
565 	 * waits on, so that the wait list is rebuilt.
566 	 */
567 	if (wake_thread)
568 		if (write(intr_pipe.writefd, "1", 1) < 0)
569 			ret = -EPIPE;
570 
571 	rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
572 	return ret;
573 }
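
/*
 * Typical registration, shown as a sketch (the handler and device names are
 * placeholders, not identifiers from this file):
 *
 *	static void my_intr_handler(void *arg);
 *
 *	ret = rte_intr_callback_register(dev->intr_handle,
 *					 my_intr_handler, dev);
 *	if (ret < 0)
 *		... ret is a negative errno value ...
 *
 * The callback runs in the EAL interrupt thread, so it should not block for
 * long.  It also must not call rte_intr_callback_unregister() on itself
 * while executing (the source is marked active, so -EAGAIN is returned);
 * rte_intr_callback_unregister_pending() exists for that case.
 */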
574 
575 int
576 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
577 				rte_intr_callback_fn cb_fn, void *cb_arg,
578 				rte_intr_unregister_callback_fn ucb_fn)
579 {
580 	int ret;
581 	struct rte_intr_source *src;
582 	struct rte_intr_callback *cb, *next;
583 
584 	/* do parameter checking first */
585 	if (rte_intr_fd_get(intr_handle) < 0) {
586 		RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
587 		return -EINVAL;
588 	}
589 
590 	rte_spinlock_lock(&intr_lock);
591 
592 	/* check if an interrupt source exists for the fd */
593 	TAILQ_FOREACH(src, &intr_sources, next) {
594 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
595 			break;
596 	}
597 
598 	/* No interrupt source registered for the fd */
599 	if (src == NULL) {
600 		ret = -ENOENT;
601 
602 	/* only usable if the source is active */
603 	} else if (src->active == 0) {
604 		ret = -EAGAIN;
605 
606 	} else {
607 		ret = 0;
608 
609 		/* walk through the callbacks and mark all that match. */
610 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
611 			next = TAILQ_NEXT(cb, next);
612 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
613 					cb->cb_arg == cb_arg)) {
614 				cb->pending_delete = 1;
615 				cb->ucb_fn = ucb_fn;
616 				ret++;
617 			}
618 		}
619 	}
620 
621 	rte_spinlock_unlock(&intr_lock);
622 
623 	return ret;
624 }
625 
626 int
627 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
628 			rte_intr_callback_fn cb_fn, void *cb_arg)
629 {
630 	int ret;
631 	struct rte_intr_source *src;
632 	struct rte_intr_callback *cb, *next;
633 
634 	/* do parameter checking first */
635 	if (rte_intr_fd_get(intr_handle) < 0) {
636 		RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
637 		return -EINVAL;
638 	}
639 
640 	rte_spinlock_lock(&intr_lock);
641 
642 	/* check if an interrupt source exists for the fd */
643 	TAILQ_FOREACH(src, &intr_sources, next)
644 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
645 			break;
646 
647 	/* No interrupt source registered for the fd */
648 	if (src == NULL) {
649 		ret = -ENOENT;
650 
651 	/* interrupt source has some active callbacks right now. */
652 	} else if (src->active != 0) {
653 		ret = -EAGAIN;
654 
655 	/* ok to remove. */
656 	} else {
657 		ret = 0;
658 
659 		/* walk through the callbacks and remove all that match. */
660 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
661 
662 			next = TAILQ_NEXT(cb, next);
663 
664 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
665 					cb->cb_arg == cb_arg)) {
666 				TAILQ_REMOVE(&src->callbacks, cb, next);
667 				free(cb);
668 				ret++;
669 			}
670 		}
671 
672 		/* all callbacks for that source are removed. */
673 		if (TAILQ_EMPTY(&src->callbacks)) {
674 			TAILQ_REMOVE(&intr_sources, src, next);
675 			rte_intr_instance_free(src->intr_handle);
676 			free(src);
677 		}
678 	}
679 
680 	rte_spinlock_unlock(&intr_lock);
681 
682 	/* notify the interrupt thread via the pipe to rebuild its wait list */
683 	if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
684 		ret = -EPIPE;
685 	}
686 
687 	rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
688 		ret);
689 	return ret;
690 }
691 
692 int
693 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
694 			rte_intr_callback_fn cb_fn, void *cb_arg)
695 {
696 	int ret = 0;
697 
698 	while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
699 		rte_pause();
700 
701 	return ret;
702 }
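
/*
 * Sketch of removing a callback from within the callback itself, the case
 * the _pending variant is for (names are placeholders):
 *
 *	static void
 *	my_intr_handler(void *arg)
 *	{
 *		struct my_dev *dev = arg;
 *
 *		... handle the event ...
 *		rte_intr_callback_unregister_pending(dev->intr_handle,
 *				my_intr_handler, dev, my_release_cb);
 *	}
 *
 * my_release_cb() is later invoked by the interrupt thread, with the
 * interrupt handle and callback argument, once the entry is actually removed.
 */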
703 
704 int
705 rte_intr_enable(const struct rte_intr_handle *intr_handle)
706 {
707 	int rc = 0, uio_cfg_fd;
708 
709 	if (intr_handle == NULL)
710 		return -1;
711 
712 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
713 		rc = 0;
714 		goto out;
715 	}
716 
717 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
718 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
719 		rc = -1;
720 		goto out;
721 	}
722 
723 	switch (rte_intr_type_get(intr_handle)) {
724 	/* write to the uio fd to enable the interrupt */
725 	case RTE_INTR_HANDLE_UIO:
726 		if (uio_intr_enable(intr_handle))
727 			rc = -1;
728 		break;
729 	case RTE_INTR_HANDLE_UIO_INTX:
730 		if (uio_intx_intr_enable(intr_handle))
731 			rc = -1;
732 		break;
733 	/* not used at this moment */
734 	case RTE_INTR_HANDLE_ALARM:
735 		rc = -1;
736 		break;
737 #ifdef VFIO_PRESENT
738 	case RTE_INTR_HANDLE_VFIO_MSIX:
739 		if (vfio_enable_msix(intr_handle))
740 			rc = -1;
741 		break;
742 	case RTE_INTR_HANDLE_VFIO_MSI:
743 		if (vfio_enable_msi(intr_handle))
744 			rc = -1;
745 		break;
746 	case RTE_INTR_HANDLE_VFIO_LEGACY:
747 		if (vfio_enable_intx(intr_handle))
748 			rc = -1;
749 		break;
750 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
751 	case RTE_INTR_HANDLE_VFIO_REQ:
752 		if (vfio_enable_req(intr_handle))
753 			rc = -1;
754 		break;
755 #endif
756 #endif
757 	/* not used at this moment */
758 	case RTE_INTR_HANDLE_DEV_EVENT:
759 		rc = -1;
760 		break;
761 	/* unknown handle type */
762 	default:
763 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
764 			rte_intr_fd_get(intr_handle));
765 		rc = -1;
766 		break;
767 	}
768 out:
769 	rte_eal_trace_intr_enable(intr_handle, rc);
770 	return rc;
771 }
772 
773 /**
774  * A PMD generally calls this function at the end of its IRQ callback.
775  * Internally, it unmasks the interrupt if possible.
776  *
777  * For INTx, unmasking is required because the interrupt is auto-masked
778  * before the callback is invoked.
779  *
780  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
781  * auto-masked. In fact, for the VFIO_MSIX and VFIO_MSI handle types,
782  * this function is a no-op.
783  */
784 int
785 rte_intr_ack(const struct rte_intr_handle *intr_handle)
786 {
787 	int uio_cfg_fd;
788 
789 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
790 		return 0;
791 
792 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
793 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
794 		return -1;
795 
796 	switch (rte_intr_type_get(intr_handle)) {
797 	/* Both acking and enabling are the same for UIO */
798 	case RTE_INTR_HANDLE_UIO:
799 		if (uio_intr_enable(intr_handle))
800 			return -1;
801 		break;
802 	case RTE_INTR_HANDLE_UIO_INTX:
803 		if (uio_intx_intr_enable(intr_handle))
804 			return -1;
805 		break;
806 	/* not used at this moment */
807 	case RTE_INTR_HANDLE_ALARM:
808 		return -1;
809 #ifdef VFIO_PRESENT
810 	/* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
811 	case RTE_INTR_HANDLE_VFIO_MSIX:
812 	case RTE_INTR_HANDLE_VFIO_MSI:
813 		return 0;
814 	case RTE_INTR_HANDLE_VFIO_LEGACY:
815 		if (vfio_ack_intx(intr_handle))
816 			return -1;
817 		break;
818 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
819 	case RTE_INTR_HANDLE_VFIO_REQ:
820 		return -1;
821 #endif
822 #endif
823 	/* not used at this moment */
824 	case RTE_INTR_HANDLE_DEV_EVENT:
825 		return -1;
826 	/* unknown handle type */
827 	default:
828 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
829 			rte_intr_fd_get(intr_handle));
830 		return -1;
831 	}
832 
833 	return 0;
834 }
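
/*
 * Sketch of the intended call pattern (handler and device names are
 * placeholders): ack at the end of the callback so the next INTx interrupt
 * can be delivered; for MSI/MSI-X handles the call simply returns 0.
 *
 *	static void
 *	my_lsc_handler(void *arg)
 *	{
 *		struct my_dev *dev = arg;
 *
 *		... read and handle the link status change ...
 *		rte_intr_ack(dev->intr_handle);
 *	}
 */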
835 
836 int
837 rte_intr_disable(const struct rte_intr_handle *intr_handle)
838 {
839 	int rc = 0, uio_cfg_fd;
840 
841 	if (intr_handle == NULL)
842 		return -1;
843 
844 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
845 		rc = 0;
846 		goto out;
847 	}
848 
849 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
850 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
851 		rc = -1;
852 		goto out;
853 	}
854 
855 	switch (rte_intr_type_get(intr_handle)) {
856 	/* write to the uio fd to disable the interrupt */
857 	case RTE_INTR_HANDLE_UIO:
858 		if (uio_intr_disable(intr_handle))
859 			rc = -1;
860 		break;
861 	case RTE_INTR_HANDLE_UIO_INTX:
862 		if (uio_intx_intr_disable(intr_handle))
863 			rc = -1;
864 		break;
865 	/* not used at this moment */
866 	case RTE_INTR_HANDLE_ALARM:
867 		rc = -1;
868 		break;
869 #ifdef VFIO_PRESENT
870 	case RTE_INTR_HANDLE_VFIO_MSIX:
871 		if (vfio_disable_msix(intr_handle))
872 			rc = -1;
873 		break;
874 	case RTE_INTR_HANDLE_VFIO_MSI:
875 		if (vfio_disable_msi(intr_handle))
876 			rc = -1;
877 		break;
878 	case RTE_INTR_HANDLE_VFIO_LEGACY:
879 		if (vfio_disable_intx(intr_handle))
880 			rc = -1;
881 		break;
882 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
883 	case RTE_INTR_HANDLE_VFIO_REQ:
884 		if (vfio_disable_req(intr_handle))
885 			rc = -1;
886 		break;
887 #endif
888 #endif
889 	/* not used at this moment */
890 	case RTE_INTR_HANDLE_DEV_EVENT:
891 		rc = -1;
892 		break;
893 	/* unknown handle type */
894 	default:
895 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
896 			rte_intr_fd_get(intr_handle));
897 		rc = -1;
898 		break;
899 	}
900 out:
901 	rte_eal_trace_intr_disable(intr_handle, rc);
902 	return rc;
903 }
904 
905 static int
906 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
907 {
908 	bool call = false;
909 	int n, bytes_read, rv;
910 	struct rte_intr_source *src;
911 	struct rte_intr_callback *cb, *next;
912 	union rte_intr_read_buffer buf;
913 	struct rte_intr_callback active_cb;
914 
915 	for (n = 0; n < nfds; n++) {
916 
917 		/**
918 		 * if the pipe fd is ready to read, return so the caller
919 		 * rebuilds the wait list.
920 		 */
921 		if (events[n].data.fd == intr_pipe.readfd){
922 			int r = read(intr_pipe.readfd, buf.charbuf,
923 					sizeof(buf.charbuf));
924 			RTE_SET_USED(r);
925 			return -1;
926 		}
927 		rte_spinlock_lock(&intr_lock);
928 		TAILQ_FOREACH(src, &intr_sources, next)
929 			if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
930 				break;
931 		if (src == NULL){
932 			rte_spinlock_unlock(&intr_lock);
933 			continue;
934 		}
935 
936 		/* mark this interrupt source as active and release the lock. */
937 		src->active = 1;
938 		rte_spinlock_unlock(&intr_lock);
939 
940 		/* set the length to be read for the different handle types */
941 		switch (rte_intr_type_get(src->intr_handle)) {
942 		case RTE_INTR_HANDLE_UIO:
943 		case RTE_INTR_HANDLE_UIO_INTX:
944 			bytes_read = sizeof(buf.uio_intr_count);
945 			break;
946 		case RTE_INTR_HANDLE_ALARM:
947 			bytes_read = sizeof(buf.timerfd_num);
948 			break;
949 #ifdef VFIO_PRESENT
950 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
951 		case RTE_INTR_HANDLE_VFIO_REQ:
952 #endif
953 		case RTE_INTR_HANDLE_VFIO_MSIX:
954 		case RTE_INTR_HANDLE_VFIO_MSI:
955 		case RTE_INTR_HANDLE_VFIO_LEGACY:
956 			bytes_read = sizeof(buf.vfio_intr_count);
957 			break;
958 #endif
959 		case RTE_INTR_HANDLE_VDEV:
960 		case RTE_INTR_HANDLE_EXT:
961 			bytes_read = 0;
962 			call = true;
963 			break;
964 		case RTE_INTR_HANDLE_DEV_EVENT:
965 			bytes_read = 0;
966 			call = true;
967 			break;
968 		default:
969 			bytes_read = 1;
970 			break;
971 		}
972 
973 		if (bytes_read > 0) {
974 			/**
975 			 * read out to clear the ready-to-be-read flag
976 			 * for epoll_wait.
977 			 */
978 			bytes_read = read(events[n].data.fd, &buf, bytes_read);
979 			if (bytes_read < 0) {
980 				if (errno == EINTR || errno == EWOULDBLOCK)
981 					continue;
982 
983 				RTE_LOG(ERR, EAL, "Error reading from file "
984 					"descriptor %d: %s\n",
985 					events[n].data.fd,
986 					strerror(errno));
987 				/*
988 				 * The device is unplugged or buggy, remove
989 				 * it as an interrupt source and return to
990 				 * force the wait list to be rebuilt.
991 				 */
992 				rte_spinlock_lock(&intr_lock);
993 				TAILQ_REMOVE(&intr_sources, src, next);
994 				rte_spinlock_unlock(&intr_lock);
995 
996 				for (cb = TAILQ_FIRST(&src->callbacks); cb;
997 							cb = next) {
998 					next = TAILQ_NEXT(cb, next);
999 					TAILQ_REMOVE(&src->callbacks, cb, next);
1000 					free(cb);
1001 				}
1002 				rte_intr_instance_free(src->intr_handle);
1003 				free(src);
1004 				return -1;
1005 			} else if (bytes_read == 0)
1006 				RTE_LOG(ERR, EAL, "Read nothing from file "
1007 					"descriptor %d\n", events[n].data.fd);
1008 			else
1009 				call = true;
1010 		}
1011 
1012 		/* grab the lock again to call callbacks and update status. */
1013 		rte_spinlock_lock(&intr_lock);
1014 
1015 		if (call) {
1016 
1017 			/* Finally, call all callbacks. */
1018 			TAILQ_FOREACH(cb, &src->callbacks, next) {
1019 
1020 				/* make a copy and unlock. */
1021 				active_cb = *cb;
1022 				rte_spinlock_unlock(&intr_lock);
1023 
1024 				/* call the actual callback */
1025 				active_cb.cb_fn(active_cb.cb_arg);
1026 
1027 				/* get the lock back. */
1028 				rte_spinlock_lock(&intr_lock);
1029 			}
1030 		}
1031 		/* we are done with this interrupt source, release it. */
1032 		src->active = 0;
1033 
1034 		rv = 0;
1035 
1036 		/* check if any callbacks are supposed to be removed */
1037 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1038 			next = TAILQ_NEXT(cb, next);
1039 			if (cb->pending_delete) {
1040 				TAILQ_REMOVE(&src->callbacks, cb, next);
1041 				if (cb->ucb_fn)
1042 					cb->ucb_fn(src->intr_handle, cb->cb_arg);
1043 				free(cb);
1044 				rv++;
1045 			}
1046 		}
1047 
1048 		/* all callbacks for that source are removed. */
1049 		if (TAILQ_EMPTY(&src->callbacks)) {
1050 			TAILQ_REMOVE(&intr_sources, src, next);
1051 			rte_intr_instance_free(src->intr_handle);
1052 			free(src);
1053 		}
1054 
1055 		/* notify the interrupt thread via the pipe to rebuild its wait list */
1056 		if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1057 			rte_spinlock_unlock(&intr_lock);
1058 			return -EPIPE;
1059 		}
1060 
1061 		rte_spinlock_unlock(&intr_lock);
1062 	}
1063 
1064 	return 0;
1065 }
1066 
1067 /**
1068  * Wait on the epoll fd and handle all interrupts that become ready.
1069  *
1070  * @param pfd
1071  *  epoll file descriptor.
1072  * @param totalfds
1073  *  The number of file descriptors added in epoll.
1074  *
1075  * @return
1076  *  void
1077  */
1078 static void
1079 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1080 {
1081 	struct epoll_event events[totalfds];
1082 	int nfds = 0;
1083 
1084 	for(;;) {
1085 		nfds = epoll_wait(pfd, events, totalfds,
1086 			EAL_INTR_EPOLL_WAIT_FOREVER);
1087 		/* epoll_wait fail */
1088 		if (nfds < 0) {
1089 			if (errno == EINTR)
1090 				continue;
1091 			RTE_LOG(ERR, EAL,
1092 				"epoll_wait failed\n");
1093 			return;
1094 		}
1095 		/* epoll_wait timeout; this should never happen here */
1096 		else if (nfds == 0)
1097 			continue;
1098 		/* epoll_wait has at least one fd ready to read */
1099 		if (eal_intr_process_interrupts(events, nfds) < 0)
1100 			return;
1101 	}
1102 }
1103 
1104 /**
1105  * It builds/rebuilds the epoll file descriptor with all the
1106  * file descriptors being waited on, then handles the interrupts.
1107  *
1108  * @param arg
1109  *  pointer. (unused)
1110  *
1111  * @return
1112  *  never returns
1113  */
1114 static __rte_noreturn void *
1115 eal_intr_thread_main(__rte_unused void *arg)
1116 {
1117 	/* host thread, never break out */
1118 	for (;;) {
1119 		/* build up the epoll fd with all descriptors we are to
1120 		 * wait on then pass it to the handle_interrupts function
1121 		 */
1122 		static struct epoll_event pipe_event = {
1123 			.events = EPOLLIN | EPOLLPRI,
1124 		};
1125 		struct rte_intr_source *src;
1126 		unsigned numfds = 0;
1127 
1128 		/* create epoll fd */
1129 		int pfd = epoll_create(1);
1130 		if (pfd < 0)
1131 			rte_panic("Cannot create epoll instance\n");
1132 
1133 		pipe_event.data.fd = intr_pipe.readfd;
1134 		/**
1135 		 * add the pipe fd to the wait list; writes to this pipe
1136 		 * signal that the wait list must be rebuilt.
1137 		 */
1138 		if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1139 						&pipe_event) < 0) {
1140 			rte_panic("Error adding fd %d to epoll, %s\n",
1141 					intr_pipe.readfd, strerror(errno));
1142 		}
1143 		numfds++;
1144 
1145 		rte_spinlock_lock(&intr_lock);
1146 
1147 		TAILQ_FOREACH(src, &intr_sources, next) {
1148 			struct epoll_event ev;
1149 
1150 			if (src->callbacks.tqh_first == NULL)
1151 				continue; /* skip those with no callbacks */
1152 			memset(&ev, 0, sizeof(ev));
1153 			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1154 			ev.data.fd = rte_intr_fd_get(src->intr_handle);
1155 
1156 			/**
1157 			 * add each interrupt source's file descriptor
1158 			 * to the wait list.
1159 			 */
1160 			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1161 					rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1162 				rte_panic("Error adding fd %d to epoll, %s\n",
1163 					rte_intr_fd_get(src->intr_handle),
1164 					strerror(errno));
1165 			}
1166 			else
1167 				numfds++;
1168 		}
1169 		rte_spinlock_unlock(&intr_lock);
1170 		/* serve the interrupt */
1171 		eal_intr_handle_interrupts(pfd, numfds);
1172 
1173 		/**
1174 		 * when we return, we need to rebuild the
1175 		 * list of fds to monitor.
1176 		 */
1177 		close(pfd);
1178 	}
1179 }
1180 
1181 int
1182 rte_eal_intr_init(void)
1183 {
1184 	int ret = 0;
1185 
1186 	/* init the global interrupt source head */
1187 	TAILQ_INIT(&intr_sources);
1188 
1189 	/**
1190 	 * create a pipe that epoll will wait on; writing to it notifies
1191 	 * the interrupt thread to rebuild its epoll wait list.
1192 	 */
1193 	if (pipe(intr_pipe.pipefd) < 0) {
1194 		rte_errno = errno;
1195 		return -1;
1196 	}
1197 
1198 	/* create the host thread to wait/handle the interrupt */
1199 	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1200 			eal_intr_thread_main, NULL);
1201 	if (ret != 0) {
1202 		rte_errno = -ret;
1203 		RTE_LOG(ERR, EAL,
1204 			"Failed to create thread for interrupt handling\n");
1205 	}
1206 
1207 	return ret;
1208 }
1209 
1210 static void
1211 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1212 {
1213 	union rte_intr_read_buffer buf;
1214 	int bytes_read = 0;
1215 	int nbytes;
1216 
1217 	switch (rte_intr_type_get(intr_handle)) {
1218 	case RTE_INTR_HANDLE_UIO:
1219 	case RTE_INTR_HANDLE_UIO_INTX:
1220 		bytes_read = sizeof(buf.uio_intr_count);
1221 		break;
1222 #ifdef VFIO_PRESENT
1223 	case RTE_INTR_HANDLE_VFIO_MSIX:
1224 	case RTE_INTR_HANDLE_VFIO_MSI:
1225 	case RTE_INTR_HANDLE_VFIO_LEGACY:
1226 		bytes_read = sizeof(buf.vfio_intr_count);
1227 		break;
1228 #endif
1229 	case RTE_INTR_HANDLE_VDEV:
1230 		bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1231 		/* For vdev, number of bytes to read is set by driver */
1232 		break;
1233 	case RTE_INTR_HANDLE_EXT:
1234 		return;
1235 	default:
1236 		bytes_read = 1;
1237 		RTE_LOG(INFO, EAL, "unexpected intr type\n");
1238 		break;
1239 	}
1240 
1241 	/**
1242 	 * read out to clear the ready-to-be-read flag
1243 	 * for epoll_wait.
1244 	 */
1245 	if (bytes_read == 0)
1246 		return;
1247 	do {
1248 		nbytes = read(fd, &buf, bytes_read);
1249 		if (nbytes < 0) {
1250 			if (errno == EINTR || errno == EWOULDBLOCK ||
1251 			    errno == EAGAIN)
1252 				continue;
1253 			RTE_LOG(ERR, EAL,
1254 				"Error reading from fd %d: %s\n",
1255 				fd, strerror(errno));
1256 		} else if (nbytes == 0)
1257 			RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1258 		return;
1259 	} while (1);
1260 }
1261 
1262 static int
1263 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1264 			struct rte_epoll_event *events)
1265 {
1266 	unsigned int i, count = 0;
1267 	struct rte_epoll_event *rev;
1268 	uint32_t valid_status;
1269 
1270 	for (i = 0; i < n; i++) {
1271 		rev = evs[i].data.ptr;
1272 		valid_status =  RTE_EPOLL_VALID;
1273 		/* ACQUIRE memory ordering here pairs with RELEASE
1274 		 * ordering below acting as a lock to synchronize
1275 		 * the event data updating.
1276 		 */
1277 		if (!rev || !__atomic_compare_exchange_n(&rev->status,
1278 				    &valid_status, RTE_EPOLL_EXEC, 0,
1279 				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1280 			continue;
1281 
1282 		events[count].status        = RTE_EPOLL_VALID;
1283 		events[count].fd            = rev->fd;
1284 		events[count].epfd          = rev->epfd;
1285 		events[count].epdata.event  = evs[i].events;
1286 		events[count].epdata.data   = rev->epdata.data;
1287 		if (rev->epdata.cb_fun)
1288 			rev->epdata.cb_fun(rev->fd,
1289 					   rev->epdata.cb_arg);
1290 
1291 		/* the status update should be observed after
1292 		 * the other fields change.
1293 		 */
1294 		__atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1295 				__ATOMIC_RELEASE);
1296 		count++;
1297 	}
1298 	return count;
1299 }
1300 
1301 static inline int
1302 eal_init_tls_epfd(void)
1303 {
1304 	int pfd = epoll_create(255);
1305 
1306 	if (pfd < 0) {
1307 		RTE_LOG(ERR, EAL,
1308 			"Cannot create epoll instance\n");
1309 		return -1;
1310 	}
1311 	return pfd;
1312 }
1313 
1314 int
1315 rte_intr_tls_epfd(void)
1316 {
1317 	if (RTE_PER_LCORE(_epfd) == -1)
1318 		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1319 
1320 	return RTE_PER_LCORE(_epfd);
1321 }
1322 
1323 static int
1324 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1325 	       int maxevents, int timeout, bool interruptible)
1326 {
1327 	struct epoll_event evs[maxevents];
1328 	int rc;
1329 
1330 	if (!events) {
1331 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1332 		return -1;
1333 	}
1334 
1335 	/* using per thread epoll fd */
1336 	if (epfd == RTE_EPOLL_PER_THREAD)
1337 		epfd = rte_intr_tls_epfd();
1338 
1339 	while (1) {
1340 		rc = epoll_wait(epfd, evs, maxevents, timeout);
1341 		if (likely(rc > 0)) {
1342 			/* epoll_wait has at least one fd ready to read */
1343 			rc = eal_epoll_process_event(evs, rc, events);
1344 			break;
1345 		} else if (rc < 0) {
1346 			if (errno == EINTR) {
1347 				if (interruptible)
1348 					return -1;
1349 				else
1350 					continue;
1351 			}
1352 			/* epoll_wait fail */
1353 			RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
1354 				strerror(errno));
1355 			rc = -1;
1356 			break;
1357 		} else {
1358 			/* rc == 0, epoll_wait timed out */
1359 			break;
1360 		}
1361 	}
1362 
1363 	return rc;
1364 }
1365 
1366 int
1367 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1368 	       int maxevents, int timeout)
1369 {
1370 	return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1371 }
1372 
1373 int
1374 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1375 			     int maxevents, int timeout)
1376 {
1377 	return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1378 }
1379 
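/*
 * Wait until no eal_epoll_process_event() is executing this event (i.e. its
 * status is RTE_EPOLL_EXEC), then atomically move it from RTE_EPOLL_VALID to
 * RTE_EPOLL_INVALID so epdata can be cleared without racing a callback.
 */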
1380 static inline void
1381 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1382 {
1383 	uint32_t valid_status = RTE_EPOLL_VALID;
1384 
1385 	while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1386 		    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1387 		while (__atomic_load_n(&ev->status,
1388 				__ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1389 			rte_pause();
1390 		valid_status = RTE_EPOLL_VALID;
1391 	}
1392 	memset(&ev->epdata, 0, sizeof(ev->epdata));
1393 	ev->fd = -1;
1394 	ev->epfd = -1;
1395 }
1396 
1397 int
1398 rte_epoll_ctl(int epfd, int op, int fd,
1399 	      struct rte_epoll_event *event)
1400 {
1401 	struct epoll_event ev;
1402 
1403 	if (!event) {
1404 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1405 		return -1;
1406 	}
1407 
1408 	/* using per thread epoll fd */
1409 	if (epfd == RTE_EPOLL_PER_THREAD)
1410 		epfd = rte_intr_tls_epfd();
1411 
1412 	if (op == EPOLL_CTL_ADD) {
1413 		__atomic_store_n(&event->status, RTE_EPOLL_VALID,
1414 				__ATOMIC_RELAXED);
1415 		event->fd = fd;  /* ignore fd in event */
1416 		event->epfd = epfd;
1417 		ev.data.ptr = (void *)event;
1418 	}
1419 
1420 	ev.events = event->epdata.event;
1421 	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1422 		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1423 			op, fd, strerror(errno));
1424 		if (op == EPOLL_CTL_ADD)
1425 			/* roll back the status when CTL_ADD fails */
1426 			__atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1427 					__ATOMIC_RELAXED);
1428 		return -1;
1429 	}
1430 
1431 	if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1432 			__ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1433 		eal_epoll_data_safe_free(event);
1434 
1435 	return 0;
1436 }
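
/*
 * Minimal usage sketch for a caller-owned fd (my_fd and my_ctx are
 * placeholders).  The rte_epoll_event must stay valid for as long as it is
 * registered, because the kernel-side epoll data points at it:
 *
 *	static struct rte_epoll_event ev;
 *
 *	ev.epdata.event = EPOLLIN;
 *	ev.epdata.data  = my_ctx;
 *	if (rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, my_fd, &ev) < 0)
 *		... error ...
 *
 *	struct rte_epoll_event out[8];
 *	int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 8, timeout_ms);
 *
 * rte_epoll_wait() reports only events whose status could be switched from
 * VALID to EXEC, and invokes epdata.cb_fun (when set) before returning.
 */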
1437 
1438 int
1439 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1440 		int op, unsigned int vec, void *data)
1441 {
1442 	struct rte_epoll_event *rev;
1443 	struct rte_epoll_data *epdata;
1444 	int epfd_op;
1445 	unsigned int efd_idx;
1446 	int rc = 0;
1447 
1448 	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1449 		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1450 
1451 	if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1452 			efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1453 		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1454 		return -EPERM;
1455 	}
1456 
1457 	switch (op) {
1458 	case RTE_INTR_EVENT_ADD:
1459 		epfd_op = EPOLL_CTL_ADD;
1460 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1461 		if (__atomic_load_n(&rev->status,
1462 				__ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1463 			RTE_LOG(INFO, EAL, "Event already been added.\n");
1464 			return -EEXIST;
1465 		}
1466 
1467 		/* attach to intr vector fd */
1468 		epdata = &rev->epdata;
1469 		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1470 		epdata->data   = data;
1471 		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1472 		epdata->cb_arg = (void *)intr_handle;
1473 		rc = rte_epoll_ctl(epfd, epfd_op,
1474 			rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1475 		if (!rc)
1476 			RTE_LOG(DEBUG, EAL,
1477 				"efd %d associated with vec %d added on epfd %d"
1478 				"\n", rev->fd, vec, epfd);
1479 		else
1480 			rc = -EPERM;
1481 		break;
1482 	case RTE_INTR_EVENT_DEL:
1483 		epfd_op = EPOLL_CTL_DEL;
1484 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1485 		if (__atomic_load_n(&rev->status,
1486 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1487 			RTE_LOG(INFO, EAL, "Event does not exist.\n");
1488 			return -EPERM;
1489 		}
1490 
1491 		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1492 		if (rc)
1493 			rc = -EPERM;
1494 		break;
1495 	default:
1496 		RTE_LOG(ERR, EAL, "event op type mismatch\n");
1497 		rc = -EPERM;
1498 	}
1499 
1500 	return rc;
1501 }
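
/*
 * End-to-end Rx interrupt sketch (queue count, epfd choice and vector
 * numbering are up to the application/driver; the usual
 * RTE_INTR_VEC_RXTX_OFFSET based vectors are assumed here):
 *
 *	rte_intr_efd_enable(intr_handle, nb_rx_queues);
 *	for (q = 0; q < nb_rx_queues; q++)
 *		rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *				RTE_INTR_EVENT_ADD,
 *				RTE_INTR_VEC_RXTX_OFFSET + q,
 *				(void *)(uintptr_t)q);
 *
 *	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, ev, nb_rx_queues, -1);
 *	... each ready ev[i].epdata.data identifies the queue to poll ...
 */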
1502 
1503 void
1504 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1505 {
1506 	uint32_t i;
1507 	struct rte_epoll_event *rev;
1508 
1509 	for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1510 		rev = rte_intr_elist_index_get(intr_handle, i);
1511 		if (__atomic_load_n(&rev->status,
1512 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1513 			continue;
1514 		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1515 			/* force free if the entry is valid */
1516 			eal_epoll_data_safe_free(rev);
1517 		}
1518 	}
1519 }
1520 
1521 int
1522 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1523 {
1524 	uint32_t i;
1525 	int fd;
1526 	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1527 
1528 	assert(nb_efd != 0);
1529 
1530 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1531 		for (i = 0; i < n; i++) {
1532 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1533 			if (fd < 0) {
1534 				RTE_LOG(ERR, EAL,
1535 					"can't setup eventfd, error %i (%s)\n",
1536 					errno, strerror(errno));
1537 				return -errno;
1538 			}
1539 
1540 			if (rte_intr_efds_index_set(intr_handle, i, fd))
1541 				return -rte_errno;
1542 		}
1543 
1544 		if (rte_intr_nb_efd_set(intr_handle, n))
1545 			return -rte_errno;
1546 
1547 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1548 			return -rte_errno;
1549 	} else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1550 		/* only check here; initialization is done in the vdev driver. */
1551 		if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1552 		    sizeof(union rte_intr_read_buffer)) {
1553 			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
1554 			return -EINVAL;
1555 		}
1556 	} else {
1557 		if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1558 			return -rte_errno;
1559 		if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1560 			return -rte_errno;
1561 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1562 			return -rte_errno;
1563 	}
1564 
1565 	return 0;
1566 }
1567 
1568 void
1569 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1570 {
1571 	uint32_t i;
1572 
1573 	rte_intr_free_epoll_fd(intr_handle);
1574 	if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1575 		for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1576 			close(rte_intr_efds_index_get(intr_handle, i));
1577 	}
1578 	rte_intr_nb_efd_set(intr_handle, 0);
1579 	rte_intr_max_intr_set(intr_handle, 0);
1580 }
1581 
1582 int
1583 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1584 {
1585 	return rte_intr_nb_efd_get(intr_handle) != 0;
1586 }
1587 
1588 int
1589 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1590 {
1591 	if (!rte_intr_dp_is_en(intr_handle))
1592 		return 1;
1593 	else
1594 		return !!(rte_intr_max_intr_get(intr_handle) -
1595 				rte_intr_nb_efd_get(intr_handle));
1596 }
1597 
1598 int
1599 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1600 {
1601 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1602 		return 1;
1603 
1604 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1605 		return 1;
1606 
1607 	return 0;
1608 }
1609 
1610 int rte_thread_is_intr(void)
1611 {
1612 	return pthread_equal(intr_thread, pthread_self());
1613 }
1614