1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
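
/*
 * Linux EAL interrupt handling (summary comment added for orientation).
 *
 * A dedicated control thread ("intr") epoll-waits on the file descriptors of
 * all registered interrupt sources (UIO, VFIO, alarm timerfd, ...) and
 * dispatches the user callbacks attached to them; an internal pipe wakes the
 * thread whenever the set of sources changes.  The rte_epoll_* and
 * rte_intr_rx_ctl()/rte_intr_efd_*() helpers below additionally let
 * data-path threads wait on per-queue eventfds for Rx interrupts.
 */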
4 
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <sys/queue.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <errno.h>
12 #include <sys/epoll.h>
13 #include <sys/ioctl.h>
14 #include <sys/eventfd.h>
15 #include <assert.h>
16 #include <stdbool.h>
17 
18 #include <eal_trace_internal.h>
19 #include <rte_common.h>
20 #include <rte_interrupts.h>
21 #include <rte_thread.h>
22 #include <rte_per_lcore.h>
23 #include <rte_lcore.h>
24 #include <rte_branch_prediction.h>
25 #include <rte_debug.h>
26 #include <rte_log.h>
27 #include <rte_errno.h>
28 #include <rte_spinlock.h>
29 #include <rte_pause.h>
30 #include <rte_vfio.h>
31 
32 #include "eal_private.h"
33 
34 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
35 #define NB_OTHER_INTR               1
36 
37 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
38 
39 /**
40  * union for pipe fds.
41  */
42 union intr_pipefds{
43 	struct {
44 		int pipefd[2];
45 	};
46 	struct {
47 		int readfd;
48 		int writefd;
49 	};
50 };
51 
52 /**
53  * union buffer for reading on different devices
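 *
 * The member sizes follow what a read() on the corresponding fd returns:
 * uio exposes a 32-bit interrupt count, while vfio and timerfd interrupts
 * are delivered through eventfds whose counters are read as 64-bit values.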
54  */
55 union rte_intr_read_buffer {
56 	int uio_intr_count;              /* for uio device */
57 #ifdef VFIO_PRESENT
58 	uint64_t vfio_intr_count;        /* for vfio device */
59 #endif
60 	uint64_t timerfd_num;            /* for timerfd */
61 	char charbuf[16];                /* for others */
62 };
63 
64 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
65 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
66 
67 struct rte_intr_callback {
68 	TAILQ_ENTRY(rte_intr_callback) next;
69 	rte_intr_callback_fn cb_fn;  /**< callback address */
70 	void *cb_arg;                /**< parameter for callback */
71 	uint8_t pending_delete;      /**< delete after callback is called */
72 	rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
73 };
74 
75 struct rte_intr_source {
76 	TAILQ_ENTRY(rte_intr_source) next;
77 	struct rte_intr_handle *intr_handle; /**< interrupt handle */
78 	struct rte_intr_cb_list callbacks;  /**< user callbacks */
79 	uint32_t active;
80 };
81 
82 /* global spinlock for interrupt data operation */
83 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
84 
85 /* union buffer for pipe read/write */
86 static union intr_pipefds intr_pipe;
87 
88 /* interrupt sources list */
89 static struct rte_intr_source_list intr_sources;
90 
91 /* interrupt handling thread */
92 static rte_thread_t intr_thread;
93 
94 /* VFIO interrupts */
95 #ifdef VFIO_PRESENT
96 
97 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
98 /* irq set buffer length for queue interrupts and LSC interrupt */
99 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
100 			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
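/*
 * VFIO_DEVICE_SET_IRQS takes a variable-length struct vfio_irq_set whose
 * trailing data[] carries one eventfd per vector when DATA_EVENTFD is used;
 * the buffers above are sized for a single fd (INTx/MSI/req) and for the
 * misc vector plus up to RTE_MAX_RXTX_INTR_VEC_ID queue vectors (MSI-X).
 */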
101 
102 /* enable legacy (INTx) interrupts */
103 static int
104 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
105 	struct vfio_irq_set *irq_set;
106 	char irq_set_buf[IRQ_SET_BUF_LEN];
107 	int len, ret, vfio_dev_fd;
108 	int *fd_ptr;
109 
110 	len = sizeof(irq_set_buf);
111 
112 	/* enable INTx */
113 	irq_set = (struct vfio_irq_set *) irq_set_buf;
114 	irq_set->argsz = len;
115 	irq_set->count = 1;
116 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
117 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
118 	irq_set->start = 0;
119 	fd_ptr = (int *) &irq_set->data;
120 	*fd_ptr = rte_intr_fd_get(intr_handle);
121 
122 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
123 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
124 
125 	if (ret) {
126 		EAL_LOG(ERR, "Error enabling INTx interrupts for fd %d",
127 			rte_intr_fd_get(intr_handle));
128 		return -1;
129 	}
130 
131 	/* unmask INTx after enabling */
132 	memset(irq_set, 0, len);
133 	len = sizeof(struct vfio_irq_set);
134 	irq_set->argsz = len;
135 	irq_set->count = 1;
136 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
137 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
138 	irq_set->start = 0;
139 
140 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
141 
142 	if (ret) {
143 		EAL_LOG(ERR, "Error unmasking INTx interrupts for fd %d",
144 			rte_intr_fd_get(intr_handle));
145 		return -1;
146 	}
147 	return 0;
148 }
149 
150 /* disable legacy (INTx) interrupts */
151 static int
152 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
153 	struct vfio_irq_set *irq_set;
154 	char irq_set_buf[IRQ_SET_BUF_LEN];
155 	int len, ret, vfio_dev_fd;
156 
157 	len = sizeof(struct vfio_irq_set);
158 
159 	/* mask interrupts before disabling */
160 	irq_set = (struct vfio_irq_set *) irq_set_buf;
161 	irq_set->argsz = len;
162 	irq_set->count = 1;
163 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
164 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
165 	irq_set->start = 0;
166 
167 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
168 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
169 
170 	if (ret) {
171 		EAL_LOG(ERR, "Error masking INTx interrupts for fd %d",
172 			rte_intr_fd_get(intr_handle));
173 		return -1;
174 	}
175 
176 	/* disable INTx */
177 	memset(irq_set, 0, len);
178 	irq_set->argsz = len;
179 	irq_set->count = 0;
180 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
181 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
182 	irq_set->start = 0;
183 
184 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
185 
186 	if (ret) {
187 		EAL_LOG(ERR, "Error disabling INTx interrupts for fd %d",
188 			rte_intr_fd_get(intr_handle));
189 		return -1;
190 	}
191 	return 0;
192 }
193 
194 /* unmask/ack legacy (INTx) interrupts */
195 static int
196 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
197 {
198 	struct vfio_irq_set irq_set;
199 	int vfio_dev_fd;
200 
201 	/* unmask INTx */
202 	memset(&irq_set, 0, sizeof(irq_set));
203 	irq_set.argsz = sizeof(irq_set);
204 	irq_set.count = 1;
205 	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
206 	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
207 	irq_set.start = 0;
208 
209 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
210 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
211 		EAL_LOG(ERR, "Error unmasking INTx interrupts for fd %d",
212 			rte_intr_fd_get(intr_handle));
213 		return -1;
214 	}
215 	return 0;
216 }
217 
218 /* enable MSI interrupts */
219 static int
220 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
221 	int len, ret;
222 	char irq_set_buf[IRQ_SET_BUF_LEN];
223 	struct vfio_irq_set *irq_set;
224 	int *fd_ptr, vfio_dev_fd;
225 
226 	len = sizeof(irq_set_buf);
227 
228 	irq_set = (struct vfio_irq_set *) irq_set_buf;
229 	irq_set->argsz = len;
230 	irq_set->count = 1;
231 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
232 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
233 	irq_set->start = 0;
234 	fd_ptr = (int *) &irq_set->data;
235 	*fd_ptr = rte_intr_fd_get(intr_handle);
236 
237 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
238 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
239 
240 	if (ret) {
241 		EAL_LOG(ERR, "Error enabling MSI interrupts for fd %d",
242 			rte_intr_fd_get(intr_handle));
243 		return -1;
244 	}
245 	return 0;
246 }
247 
248 /* disable MSI interrupts */
249 static int
250 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
251 	struct vfio_irq_set *irq_set;
252 	char irq_set_buf[IRQ_SET_BUF_LEN];
253 	int len, ret, vfio_dev_fd;
254 
255 	len = sizeof(struct vfio_irq_set);
256 
257 	irq_set = (struct vfio_irq_set *) irq_set_buf;
258 	irq_set->argsz = len;
259 	irq_set->count = 0;
260 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
261 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
262 	irq_set->start = 0;
263 
264 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
265 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
266 	if (ret)
267 		EAL_LOG(ERR, "Error disabling MSI interrupts for fd %d",
268 			rte_intr_fd_get(intr_handle));
269 
270 	return ret;
271 }
272 
273 /* enable MSI-X interrupts */
274 static int
275 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
276 	int len, ret;
277 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
278 	struct vfio_irq_set *irq_set;
279 	int *fd_ptr, vfio_dev_fd, i;
280 
281 	len = sizeof(irq_set_buf);
282 
283 	irq_set = (struct vfio_irq_set *) irq_set_buf;
284 	irq_set->argsz = len;
285 	/* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
286 	irq_set->count = rte_intr_max_intr_get(intr_handle) ?
287 		(rte_intr_max_intr_get(intr_handle) >
288 		 RTE_MAX_RXTX_INTR_VEC_ID + 1 ?	RTE_MAX_RXTX_INTR_VEC_ID + 1 :
289 		 rte_intr_max_intr_get(intr_handle)) : 1;
290 
291 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
292 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
293 	irq_set->start = 0;
294 	fd_ptr = (int *) &irq_set->data;
295 	/* INTR vector offset 0 is reserved for the non-efd (misc) interrupt fd */
296 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
297 	for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
298 		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
299 			rte_intr_efds_index_get(intr_handle, i);
300 	}
301 
302 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
303 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
304 
305 	if (ret) {
306 		EAL_LOG(ERR, "Error enabling MSI-X interrupts for fd %d",
307 			rte_intr_fd_get(intr_handle));
308 		return -1;
309 	}
310 
311 	return 0;
312 }
313 
314 /* disable MSI-X interrupts */
315 static int
316 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
317 	struct vfio_irq_set *irq_set;
318 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
319 	int len, ret, vfio_dev_fd;
320 
321 	len = sizeof(struct vfio_irq_set);
322 
323 	irq_set = (struct vfio_irq_set *) irq_set_buf;
324 	irq_set->argsz = len;
325 	irq_set->count = 0;
326 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
327 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
328 	irq_set->start = 0;
329 
330 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
331 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
332 
333 	if (ret)
334 		EAL_LOG(ERR, "Error disabling MSI-X interrupts for fd %d",
335 			rte_intr_fd_get(intr_handle));
336 
337 	return ret;
338 }
339 
340 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
341 /* enable req notifier */
342 static int
343 vfio_enable_req(const struct rte_intr_handle *intr_handle)
344 {
345 	int len, ret;
346 	char irq_set_buf[IRQ_SET_BUF_LEN];
347 	struct vfio_irq_set *irq_set;
348 	int *fd_ptr, vfio_dev_fd;
349 
350 	len = sizeof(irq_set_buf);
351 
352 	irq_set = (struct vfio_irq_set *) irq_set_buf;
353 	irq_set->argsz = len;
354 	irq_set->count = 1;
355 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
356 			 VFIO_IRQ_SET_ACTION_TRIGGER;
357 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
358 	irq_set->start = 0;
359 	fd_ptr = (int *) &irq_set->data;
360 	*fd_ptr = rte_intr_fd_get(intr_handle);
361 
362 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
363 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
364 
365 	if (ret) {
366 		EAL_LOG(ERR, "Error enabling req interrupts for fd %d",
367 			rte_intr_fd_get(intr_handle));
368 		return -1;
369 	}
370 
371 	return 0;
372 }
373 
374 /* disable req notifier */
375 static int
376 vfio_disable_req(const struct rte_intr_handle *intr_handle)
377 {
378 	struct vfio_irq_set *irq_set;
379 	char irq_set_buf[IRQ_SET_BUF_LEN];
380 	int len, ret, vfio_dev_fd;
381 
382 	len = sizeof(struct vfio_irq_set);
383 
384 	irq_set = (struct vfio_irq_set *) irq_set_buf;
385 	irq_set->argsz = len;
386 	irq_set->count = 0;
387 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
388 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
389 	irq_set->start = 0;
390 
391 	vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
392 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
393 
394 	if (ret)
395 		EAL_LOG(ERR, "Error disabling req interrupts for fd %d",
396 			rte_intr_fd_get(intr_handle));
397 
398 	return ret;
399 }
400 #endif
401 #endif
402 
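/*
 * For uio_pci_generic (UIO_INTX), interrupts are toggled through PCI config
 * space: offset 5 is the upper byte of the command register, and bit 0x4 of
 * that byte is the Interrupt Disable bit (command register bit 10).
 */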
403 static int
404 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
405 {
406 	unsigned char command_high;
407 	int uio_cfg_fd;
408 
409 	/* use UIO config file descriptor for uio_pci_generic */
410 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
411 	if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
412 		EAL_LOG(ERR,
413 			"Error reading interrupts status for fd %d",
414 			uio_cfg_fd);
415 		return -1;
416 	}
417 	/* disable interrupts */
418 	command_high |= 0x4;
419 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
420 		EAL_LOG(ERR,
421 			"Error disabling interrupts for fd %d",
422 			uio_cfg_fd);
423 		return -1;
424 	}
425 
426 	return 0;
427 }
428 
429 static int
430 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
431 {
432 	unsigned char command_high;
433 	int uio_cfg_fd;
434 
435 	/* use UIO config file descriptor for uio_pci_generic */
436 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
437 	if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
438 		EAL_LOG(ERR,
439 			"Error reading interrupts status for fd %d",
440 			uio_cfg_fd);
441 		return -1;
442 	}
443 	/* enable interrupts */
444 	command_high &= ~0x4;
445 	if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
446 		EAL_LOG(ERR,
447 			"Error enabling interrupts for fd %d",
448 			uio_cfg_fd);
449 		return -1;
450 	}
451 
452 	return 0;
453 }
454 
455 static int
456 uio_intr_disable(const struct rte_intr_handle *intr_handle)
457 {
458 	const int value = 0;
459 
460 	if (rte_intr_fd_get(intr_handle) < 0 ||
461 	    write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
462 		EAL_LOG(ERR, "Error disabling interrupts for fd %d (%s)",
463 			rte_intr_fd_get(intr_handle), strerror(errno));
464 		return -1;
465 	}
466 	return 0;
467 }
468 
469 static int
470 uio_intr_enable(const struct rte_intr_handle *intr_handle)
471 {
472 	const int value = 1;
473 
474 	if (rte_intr_fd_get(intr_handle) < 0 ||
475 	    write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
476 		EAL_LOG(ERR, "Error enabling interrupts for fd %d (%s)",
477 			rte_intr_fd_get(intr_handle), strerror(errno));
478 		return -1;
479 	}
480 	return 0;
481 }
482 
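/*
 * Attach a callback to the fd carried by intr_handle, creating a new
 * interrupt source if this is the first callback for that fd.  Illustrative
 * usage (names hypothetical):
 *
 *     rte_intr_callback_register(intr_handle, my_dev_isr, my_dev);
 *
 * When needed, the interrupt thread is woken through intr_pipe so that the
 * fd is added to its epoll wait list.
 */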
483 int
484 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
485 			rte_intr_callback_fn cb, void *cb_arg)
486 {
487 	int ret, wake_thread;
488 	struct rte_intr_source *src;
489 	struct rte_intr_callback *callback;
490 
491 	wake_thread = 0;
492 
493 	/* first do parameter checking */
494 	if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
495 		EAL_LOG(ERR, "Registering with invalid input parameter");
496 		return -EINVAL;
497 	}
498 
499 	/* allocate a new interrupt callback entity */
500 	callback = calloc(1, sizeof(*callback));
501 	if (callback == NULL) {
502 		EAL_LOG(ERR, "Can not allocate memory");
503 		return -ENOMEM;
504 	}
505 	callback->cb_fn = cb;
506 	callback->cb_arg = cb_arg;
507 	callback->pending_delete = 0;
508 	callback->ucb_fn = NULL;
509 
510 	rte_spinlock_lock(&intr_lock);
511 
512 	/* check if there is at least one callback registered for the fd */
513 	TAILQ_FOREACH(src, &intr_sources, next) {
514 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
515 			/* no callbacks were registered for this fd yet */
516 			if (TAILQ_EMPTY(&src->callbacks))
517 				wake_thread = 1;
518 
519 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
520 			ret = 0;
521 			break;
522 		}
523 	}
524 
525 	/* no existing callbacks for this - add new source */
526 	if (src == NULL) {
527 		src = calloc(1, sizeof(*src));
528 		if (src == NULL) {
529 			EAL_LOG(ERR, "Can not allocate memory");
530 			ret = -ENOMEM;
531 			free(callback);
532 			callback = NULL;
533 		} else {
534 			src->intr_handle = rte_intr_instance_dup(intr_handle);
535 			if (src->intr_handle == NULL) {
536 				EAL_LOG(ERR, "Can not create intr instance");
537 				ret = -ENOMEM;
538 				free(callback);
539 				callback = NULL;
540 				free(src);
541 				src = NULL;
542 			} else {
543 				TAILQ_INIT(&src->callbacks);
544 				TAILQ_INSERT_TAIL(&(src->callbacks), callback,
545 						  next);
546 				TAILQ_INSERT_TAIL(&intr_sources, src, next);
547 				wake_thread = 1;
548 				ret = 0;
549 			}
550 		}
551 	}
552 
553 	rte_spinlock_unlock(&intr_lock);
554 
555 	/**
556 	 * check if we need to notify the pipe fd that epoll_wait waits on,
557 	 * so that the interrupt thread rebuilds its wait list.
558 	 */
559 	if (wake_thread)
560 		if (write(intr_pipe.writefd, "1", 1) < 0)
561 			ret = -EPIPE;
562 
563 	rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
564 	return ret;
565 }
566 
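/*
 * Deferred unregister, usable only while the source is active in the
 * interrupt thread: matching callbacks are merely marked pending_delete
 * (optionally recording ucb_fn), and eal_intr_process_interrupts() frees
 * them once the current callback run has finished.
 */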
567 int
568 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
569 				rte_intr_callback_fn cb_fn, void *cb_arg,
570 				rte_intr_unregister_callback_fn ucb_fn)
571 {
572 	int ret;
573 	struct rte_intr_source *src;
574 	struct rte_intr_callback *cb, *next;
575 
576 	/* do parameter checking first */
577 	if (rte_intr_fd_get(intr_handle) < 0) {
578 		EAL_LOG(ERR, "Unregistering with invalid input parameter");
579 		return -EINVAL;
580 	}
581 
582 	rte_spinlock_lock(&intr_lock);
583 
584 	/* check if an interrupt source exists for the fd */
585 	TAILQ_FOREACH(src, &intr_sources, next) {
586 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
587 			break;
588 	}
589 
590 	/* No interrupt source registered for the fd */
591 	if (src == NULL) {
592 		ret = -ENOENT;
593 
594 	/* only usable if the source is active */
595 	} else if (src->active == 0) {
596 		ret = -EAGAIN;
597 
598 	} else {
599 		ret = 0;
600 
601 		/* walk through the callbacks and mark all that match. */
602 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
603 			next = TAILQ_NEXT(cb, next);
604 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
605 					cb->cb_arg == cb_arg)) {
606 				cb->pending_delete = 1;
607 				cb->ucb_fn = ucb_fn;
608 				ret++;
609 			}
610 		}
611 	}
612 
613 	rte_spinlock_unlock(&intr_lock);
614 
615 	return ret;
616 }
617 
618 int
619 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
620 			rte_intr_callback_fn cb_fn, void *cb_arg)
621 {
622 	int ret;
623 	struct rte_intr_source *src;
624 	struct rte_intr_callback *cb, *next;
625 
626 	/* do parameter checking first */
627 	if (rte_intr_fd_get(intr_handle) < 0) {
628 		EAL_LOG(ERR, "Unregistering with invalid input parameter");
629 		return -EINVAL;
630 	}
631 
632 	rte_spinlock_lock(&intr_lock);
633 
634 	/* check if an interrupt source exists for the fd */
635 	TAILQ_FOREACH(src, &intr_sources, next)
636 		if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
637 			break;
638 
639 	/* No interrupt source registered for the fd */
640 	if (src == NULL) {
641 		ret = -ENOENT;
642 
643 	/* interrupt source has some active callbacks right now. */
644 	} else if (src->active != 0) {
645 		ret = -EAGAIN;
646 
647 	/* ok to remove. */
648 	} else {
649 		ret = 0;
650 
651 		/* walk through the callbacks and remove all that match. */
652 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
653 
654 			next = TAILQ_NEXT(cb, next);
655 
656 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
657 					cb->cb_arg == cb_arg)) {
658 				TAILQ_REMOVE(&src->callbacks, cb, next);
659 				free(cb);
660 				ret++;
661 			}
662 		}
663 
664 		/* all callbacks for that source are removed. */
665 		if (TAILQ_EMPTY(&src->callbacks)) {
666 			TAILQ_REMOVE(&intr_sources, src, next);
667 			rte_intr_instance_free(src->intr_handle);
668 			free(src);
669 		}
670 	}
671 
672 	rte_spinlock_unlock(&intr_lock);
673 
674 	/* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
675 	if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
676 		ret = -EPIPE;
677 	}
678 
679 	rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
680 		ret);
681 	return ret;
682 }
683 
684 int
685 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
686 			rte_intr_callback_fn cb_fn, void *cb_arg)
687 {
688 	int ret = 0;
689 
690 	while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
691 		rte_pause();
692 
693 	return ret;
694 }
695 
696 int
697 rte_intr_enable(const struct rte_intr_handle *intr_handle)
698 {
699 	int rc = 0, uio_cfg_fd;
700 
701 	if (intr_handle == NULL)
702 		return -1;
703 
704 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
705 		rc = 0;
706 		goto out;
707 	}
708 
709 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
710 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
711 		rc = -1;
712 		goto out;
713 	}
714 
715 	switch (rte_intr_type_get(intr_handle)) {
716 	/* write to the uio fd to enable the interrupt */
717 	case RTE_INTR_HANDLE_UIO:
718 		if (uio_intr_enable(intr_handle))
719 			rc = -1;
720 		break;
721 	case RTE_INTR_HANDLE_UIO_INTX:
722 		if (uio_intx_intr_enable(intr_handle))
723 			rc = -1;
724 		break;
725 	/* not used at this moment */
726 	case RTE_INTR_HANDLE_ALARM:
727 		rc = -1;
728 		break;
729 #ifdef VFIO_PRESENT
730 	case RTE_INTR_HANDLE_VFIO_MSIX:
731 		if (vfio_enable_msix(intr_handle))
732 			rc = -1;
733 		break;
734 	case RTE_INTR_HANDLE_VFIO_MSI:
735 		if (vfio_enable_msi(intr_handle))
736 			rc = -1;
737 		break;
738 	case RTE_INTR_HANDLE_VFIO_LEGACY:
739 		if (vfio_enable_intx(intr_handle))
740 			rc = -1;
741 		break;
742 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
743 	case RTE_INTR_HANDLE_VFIO_REQ:
744 		if (vfio_enable_req(intr_handle))
745 			rc = -1;
746 		break;
747 #endif
748 #endif
749 	/* not used at this moment */
750 	case RTE_INTR_HANDLE_DEV_EVENT:
751 		rc = -1;
752 		break;
753 	/* unknown handle type */
754 	default:
755 		EAL_LOG(ERR, "Unknown handle type of fd %d",
756 			rte_intr_fd_get(intr_handle));
757 		rc = -1;
758 		break;
759 	}
760 out:
761 	rte_eal_trace_intr_enable(intr_handle, rc);
762 	return rc;
763 }
764 
765 /**
766  * PMD generally calls this function at the end of its IRQ callback.
767  * Internally, it unmasks the interrupt if possible.
768  *
769  * For INTx, unmasking is required as the interrupt is auto-masked prior to
770  * invoking the callback.
771  *
772  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
773  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
774  * this function is a no-op.
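 *
 * A driver callback typically ends with an ack, e.g. (illustrative sketch,
 * names hypothetical):
 *
 *   static void my_lsc_handler(void *cb_arg)
 *   {
 *           struct my_adapter *ad = cb_arg;
 *
 *           handle_link_status_change(ad);
 *           rte_intr_ack(ad->intr_handle);
 *   }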
775  */
776 int
777 rte_intr_ack(const struct rte_intr_handle *intr_handle)
778 {
779 	int uio_cfg_fd;
780 
781 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
782 		return 0;
783 
784 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
785 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
786 		return -1;
787 
788 	switch (rte_intr_type_get(intr_handle)) {
789 	/* Both acking and enabling are same for UIO */
790 	case RTE_INTR_HANDLE_UIO:
791 		if (uio_intr_enable(intr_handle))
792 			return -1;
793 		break;
794 	case RTE_INTR_HANDLE_UIO_INTX:
795 		if (uio_intx_intr_enable(intr_handle))
796 			return -1;
797 		break;
798 	/* not used at this moment */
799 	case RTE_INTR_HANDLE_ALARM:
800 		return -1;
801 #ifdef VFIO_PRESENT
802 	/* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
803 	case RTE_INTR_HANDLE_VFIO_MSIX:
804 	case RTE_INTR_HANDLE_VFIO_MSI:
805 		return 0;
806 	case RTE_INTR_HANDLE_VFIO_LEGACY:
807 		if (vfio_ack_intx(intr_handle))
808 			return -1;
809 		break;
810 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
811 	case RTE_INTR_HANDLE_VFIO_REQ:
812 		return -1;
813 #endif
814 #endif
815 	/* not used at this moment */
816 	case RTE_INTR_HANDLE_DEV_EVENT:
817 		return -1;
818 	/* unknown handle type */
819 	default:
820 		EAL_LOG(ERR, "Unknown handle type of fd %d",
821 			rte_intr_fd_get(intr_handle));
822 		return -1;
823 	}
824 
825 	return 0;
826 }
827 
828 int
829 rte_intr_disable(const struct rte_intr_handle *intr_handle)
830 {
831 	int rc = 0, uio_cfg_fd;
832 
833 	if (intr_handle == NULL)
834 		return -1;
835 
836 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
837 		rc = 0;
838 		goto out;
839 	}
840 
841 	uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
842 	if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
843 		rc = -1;
844 		goto out;
845 	}
846 
847 	switch (rte_intr_type_get(intr_handle)) {
848 	/* write to the uio fd to disable the interrupt */
849 	case RTE_INTR_HANDLE_UIO:
850 		if (uio_intr_disable(intr_handle))
851 			rc = -1;
852 		break;
853 	case RTE_INTR_HANDLE_UIO_INTX:
854 		if (uio_intx_intr_disable(intr_handle))
855 			rc = -1;
856 		break;
857 	/* not used at this moment */
858 	case RTE_INTR_HANDLE_ALARM:
859 		rc = -1;
860 		break;
861 #ifdef VFIO_PRESENT
862 	case RTE_INTR_HANDLE_VFIO_MSIX:
863 		if (vfio_disable_msix(intr_handle))
864 			rc = -1;
865 		break;
866 	case RTE_INTR_HANDLE_VFIO_MSI:
867 		if (vfio_disable_msi(intr_handle))
868 			rc = -1;
869 		break;
870 	case RTE_INTR_HANDLE_VFIO_LEGACY:
871 		if (vfio_disable_intx(intr_handle))
872 			rc = -1;
873 		break;
874 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
875 	case RTE_INTR_HANDLE_VFIO_REQ:
876 		if (vfio_disable_req(intr_handle))
877 			rc = -1;
878 		break;
879 #endif
880 #endif
881 	/* not used at this moment */
882 	case RTE_INTR_HANDLE_DEV_EVENT:
883 		rc = -1;
884 		break;
885 	/* unknown handle type */
886 	default:
887 		EAL_LOG(ERR, "Unknown handle type of fd %d",
888 			rte_intr_fd_get(intr_handle));
889 		rc = -1;
890 		break;
891 	}
892 out:
893 	rte_eal_trace_intr_disable(intr_handle, rc);
894 	return rc;
895 }
896 
897 static int
898 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
899 {
900 	bool call = false;
901 	int n, bytes_read, rv;
902 	struct rte_intr_source *src;
903 	struct rte_intr_callback *cb, *next;
904 	union rte_intr_read_buffer buf;
905 	struct rte_intr_callback active_cb;
906 
907 	for (n = 0; n < nfds; n++) {
908 
909 		/**
910 		 * if the pipe fd is ready to read, return so that the
911 		 * caller can rebuild the wait list.
912 		 */
913 		if (events[n].data.fd == intr_pipe.readfd){
914 			int r = read(intr_pipe.readfd, buf.charbuf,
915 					sizeof(buf.charbuf));
916 			RTE_SET_USED(r);
917 			return -1;
918 		}
919 		rte_spinlock_lock(&intr_lock);
920 		TAILQ_FOREACH(src, &intr_sources, next)
921 			if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
922 				break;
923 		if (src == NULL){
924 			rte_spinlock_unlock(&intr_lock);
925 			continue;
926 		}
927 
928 		/* mark this interrupt source as active and release the lock. */
929 		src->active = 1;
930 		rte_spinlock_unlock(&intr_lock);
931 
932 		/* set the length to be read for the different handle types */
933 		switch (rte_intr_type_get(src->intr_handle)) {
934 		case RTE_INTR_HANDLE_UIO:
935 		case RTE_INTR_HANDLE_UIO_INTX:
936 			bytes_read = sizeof(buf.uio_intr_count);
937 			break;
938 		case RTE_INTR_HANDLE_ALARM:
939 			bytes_read = sizeof(buf.timerfd_num);
940 			break;
941 #ifdef VFIO_PRESENT
942 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
943 		case RTE_INTR_HANDLE_VFIO_REQ:
944 #endif
945 		case RTE_INTR_HANDLE_VFIO_MSIX:
946 		case RTE_INTR_HANDLE_VFIO_MSI:
947 		case RTE_INTR_HANDLE_VFIO_LEGACY:
948 			bytes_read = sizeof(buf.vfio_intr_count);
949 			break;
950 #endif
951 		case RTE_INTR_HANDLE_VDEV:
952 		case RTE_INTR_HANDLE_EXT:
953 			bytes_read = 0;
954 			call = true;
955 			break;
956 		case RTE_INTR_HANDLE_DEV_EVENT:
957 			bytes_read = 0;
958 			call = true;
959 			break;
960 		default:
961 			bytes_read = 1;
962 			break;
963 		}
964 
965 		if (bytes_read > 0) {
966 			/**
967 			 * read out to clear the ready-to-be-read flag
968 			 * for epoll_wait.
969 			 */
970 			bytes_read = read(events[n].data.fd, &buf, bytes_read);
971 			if (bytes_read < 0) {
972 				if (errno == EINTR || errno == EWOULDBLOCK)
973 					continue;
974 
975 				EAL_LOG(ERR, "Error reading from file "
976 					"descriptor %d: %s",
977 					events[n].data.fd,
978 					strerror(errno));
979 				/*
980 				 * The device is unplugged or buggy, remove
981 				 * it as an interrupt source and return to
982 				 * force the wait list to be rebuilt.
983 				 */
984 				rte_spinlock_lock(&intr_lock);
985 				TAILQ_REMOVE(&intr_sources, src, next);
986 				rte_spinlock_unlock(&intr_lock);
987 
988 				for (cb = TAILQ_FIRST(&src->callbacks); cb;
989 							cb = next) {
990 					next = TAILQ_NEXT(cb, next);
991 					TAILQ_REMOVE(&src->callbacks, cb, next);
992 					free(cb);
993 				}
994 				rte_intr_instance_free(src->intr_handle);
995 				free(src);
996 				return -1;
997 			} else if (bytes_read == 0)
998 				EAL_LOG(ERR, "Read nothing from file "
999 					"descriptor %d", events[n].data.fd);
1000 			else
1001 				call = true;
1002 		}
1003 
1004 		/* grab a lock, again to call callbacks and update status. */
1005 		rte_spinlock_lock(&intr_lock);
1006 
1007 		if (call) {
1008 
1009 			/* Finally, call all callbacks. */
1010 			TAILQ_FOREACH(cb, &src->callbacks, next) {
1011 
1012 				/* make a copy and unlock. */
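				/* The lock cannot be held across the user
				 * callback: the callback may run for a while
				 * or itself register/unregister callbacks,
				 * and intr_lock is not recursive, so work on
				 * a private copy with the lock released.
				 */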
1013 				active_cb = *cb;
1014 				rte_spinlock_unlock(&intr_lock);
1015 
1016 				/* call the actual callback */
1017 				active_cb.cb_fn(active_cb.cb_arg);
1018 
1019 				/* get the lock back. */
1020 				rte_spinlock_lock(&intr_lock);
1021 			}
1022 		}
1023 		/* we are done with that interrupt source, release it. */
1024 		src->active = 0;
1025 
1026 		rv = 0;
1027 
1028 		/* check if any callbacks are supposed to be removed */
1029 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1030 			next = TAILQ_NEXT(cb, next);
1031 			if (cb->pending_delete) {
1032 				TAILQ_REMOVE(&src->callbacks, cb, next);
1033 				if (cb->ucb_fn)
1034 					cb->ucb_fn(src->intr_handle, cb->cb_arg);
1035 				free(cb);
1036 				rv++;
1037 			}
1038 		}
1039 
1040 		/* all callbacks for that source are removed. */
1041 		if (TAILQ_EMPTY(&src->callbacks)) {
1042 			TAILQ_REMOVE(&intr_sources, src, next);
1043 			rte_intr_instance_free(src->intr_handle);
1044 			free(src);
1045 		}
1046 
1047 		/* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
1048 		if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1049 			rte_spinlock_unlock(&intr_lock);
1050 			return -EPIPE;
1051 		}
1052 
1053 		rte_spinlock_unlock(&intr_lock);
1054 	}
1055 
1056 	return 0;
1057 }
1058 
1059 /**
1060  * It handles all the interrupts.
1061  *
1062  * @param pfd
1063  *  epoll file descriptor.
1064  * @param totalfds
1065  *  The number of file descriptors added in epoll.
1066  *
1067  * @return
1068  *  void
1069  */
1070 static void
1071 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1072 {
1073 	struct epoll_event events[totalfds];
1074 	int nfds = 0;
1075 
1076 	for(;;) {
1077 		nfds = epoll_wait(pfd, events, totalfds,
1078 			EAL_INTR_EPOLL_WAIT_FOREVER);
1079 		/* epoll_wait fail */
1080 		if (nfds < 0) {
1081 			if (errno == EINTR)
1082 				continue;
1083 			EAL_LOG(ERR,
1084 				"epoll_wait failed");
1085 			return;
1086 		}
1087 		/* epoll_wait timed out; cannot happen since we wait forever */
1088 		else if (nfds == 0)
1089 			continue;
1090 		/* epoll_wait has at least one fd ready to read */
1091 		if (eal_intr_process_interrupts(events, nfds) < 0)
1092 			return;
1093 	}
1094 }
1095 
1096 /**
1097  * It builds/rebuilds the epoll file descriptor with all the
1098  * file descriptors being waited on. Then handles the interrupts.
1099  *
1100  * @param arg
1101  *  pointer. (unused)
1102  *
1103  * @return
1104  *  never returns
1105  */
1106 static __rte_noreturn uint32_t
1107 eal_intr_thread_main(__rte_unused void *arg)
1108 {
1109 	/* host thread, never break out */
1110 	for (;;) {
1111 		/* build up the epoll fd with all descriptors we are to
1112 		 * wait on then pass it to the handle_interrupts function
1113 		 */
1114 		static struct epoll_event pipe_event = {
1115 			.events = EPOLLIN | EPOLLPRI,
1116 		};
1117 		struct rte_intr_source *src;
1118 		unsigned numfds = 0;
1119 
1120 		/* create epoll fd */
1121 		int pfd = epoll_create(1);
1122 		if (pfd < 0)
1123 			rte_panic("Cannot create epoll instance\n");
1124 
1125 		pipe_event.data.fd = intr_pipe.readfd;
1126 		/**
1127 		 * add pipe fd into wait list, this pipe is used to
1128 		 * rebuild the wait list.
1129 		 */
1130 		if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1131 						&pipe_event) < 0) {
1132 			rte_panic("Error adding fd %d to epoll, %s\n",
1133 					intr_pipe.readfd, strerror(errno));
1134 		}
1135 		numfds++;
1136 
1137 		rte_spinlock_lock(&intr_lock);
1138 
1139 		TAILQ_FOREACH(src, &intr_sources, next) {
1140 			struct epoll_event ev;
1141 
1142 			if (src->callbacks.tqh_first == NULL)
1143 				continue; /* skip those with no callbacks */
1144 			memset(&ev, 0, sizeof(ev));
1145 			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1146 			ev.data.fd = rte_intr_fd_get(src->intr_handle);
1147 
1148 			/**
1149 			 * add each interrupt source's file
1150 			 * descriptor into the wait list.
1151 			 */
1152 			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1153 					rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1154 				rte_panic("Error adding fd %d to epoll, %s\n",
1155 					rte_intr_fd_get(src->intr_handle),
1156 					strerror(errno));
1157 			}
1158 			else
1159 				numfds++;
1160 		}
1161 		rte_spinlock_unlock(&intr_lock);
1162 		/* serve the interrupt */
1163 		eal_intr_handle_interrupts(pfd, numfds);
1164 
1165 		/**
1166 		 * when we return, we need to rebuild the
1167 		 * list of fds to monitor.
1168 		 */
1169 		close(pfd);
1170 	}
1171 }
1172 
1173 int
1174 rte_eal_intr_init(void)
1175 {
1176 	int ret = 0;
1177 
1178 	/* init the global interrupt source head */
1179 	TAILQ_INIT(&intr_sources);
1180 
1181 	/**
1182 	 * create a pipe which will be waited by epoll and notified to
1183 	 * rebuild the wait list of epoll.
1184 	 */
1185 	if (pipe(intr_pipe.pipefd) < 0) {
1186 		rte_errno = errno;
1187 		return -1;
1188 	}
1189 
1190 	/* create the host thread to wait/handle the interrupt */
1191 	ret = rte_thread_create_internal_control(&intr_thread, "intr",
1192 			eal_intr_thread_main, NULL);
1193 	if (ret != 0) {
1194 		rte_errno = -ret;
1195 		EAL_LOG(ERR,
1196 			"Failed to create thread for interrupt handling");
1197 	}
1198 
1199 	return ret;
1200 }
1201 
1202 static void
1203 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1204 {
1205 	union rte_intr_read_buffer buf;
1206 	int bytes_read = 0;
1207 	int nbytes;
1208 
1209 	switch (rte_intr_type_get(intr_handle)) {
1210 	case RTE_INTR_HANDLE_UIO:
1211 	case RTE_INTR_HANDLE_UIO_INTX:
1212 		bytes_read = sizeof(buf.uio_intr_count);
1213 		break;
1214 #ifdef VFIO_PRESENT
1215 	case RTE_INTR_HANDLE_VFIO_MSIX:
1216 	case RTE_INTR_HANDLE_VFIO_MSI:
1217 	case RTE_INTR_HANDLE_VFIO_LEGACY:
1218 		bytes_read = sizeof(buf.vfio_intr_count);
1219 		break;
1220 #endif
1221 	case RTE_INTR_HANDLE_VDEV:
1222 		bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1223 		/* For vdev, number of bytes to read is set by driver */
1224 		break;
1225 	case RTE_INTR_HANDLE_EXT:
1226 		return;
1227 	default:
1228 		bytes_read = 1;
1229 		EAL_LOG(INFO, "unexpected intr type");
1230 		break;
1231 	}
1232 
1233 	/**
1234 	 * read out to clear the ready-to-be-read flag
1235 	 * for epoll_wait.
1236 	 */
1237 	if (bytes_read == 0)
1238 		return;
1239 	do {
1240 		nbytes = read(fd, &buf, bytes_read);
1241 		if (nbytes < 0) {
1242 			if (errno == EINTR || errno == EWOULDBLOCK ||
1243 			    errno == EAGAIN)
1244 				continue;
1245 			EAL_LOG(ERR,
1246 				"Error reading from fd %d: %s",
1247 				fd, strerror(errno));
1248 		} else if (nbytes == 0)
1249 			EAL_LOG(ERR, "Read nothing from fd %d", fd);
1250 		return;
1251 	} while (1);
1252 }
1253 
1254 static int
1255 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1256 			struct rte_epoll_event *events)
1257 {
1258 	unsigned int i, count = 0;
1259 	struct rte_epoll_event *rev;
1260 	uint32_t valid_status;
1261 
1262 	for (i = 0; i < n; i++) {
1263 		rev = evs[i].data.ptr;
1264 		valid_status =  RTE_EPOLL_VALID;
1265 		/* ACQUIRE memory ordering here pairs with RELEASE
1266 		 * ordering below acting as a lock to synchronize
1267 		 * the event data updating.
1268 		 */
1269 		if (!rev || !rte_atomic_compare_exchange_strong_explicit(&rev->status,
1270 				    &valid_status, RTE_EPOLL_EXEC,
1271 				    rte_memory_order_acquire, rte_memory_order_relaxed))
1272 			continue;
1273 
1274 		events[count].status        = RTE_EPOLL_VALID;
1275 		events[count].fd            = rev->fd;
1276 		events[count].epfd          = rev->epfd;
1277 		events[count].epdata.event  = evs[i].events;
1278 		events[count].epdata.data   = rev->epdata.data;
1279 		if (rev->epdata.cb_fun)
1280 			rev->epdata.cb_fun(rev->fd,
1281 					   rev->epdata.cb_arg);
1282 
1283 		/* the status update should be observed after
1284 		 * the other fields change.
1285 		 */
1286 		rte_atomic_store_explicit(&rev->status, RTE_EPOLL_VALID,
1287 				rte_memory_order_release);
1288 		count++;
1289 	}
1290 	return count;
1291 }
1292 
1293 static inline int
1294 eal_init_tls_epfd(void)
1295 {
1296 	int pfd = epoll_create(255);
1297 
1298 	if (pfd < 0) {
1299 		EAL_LOG(ERR,
1300 			"Cannot create epoll instance");
1301 		return -1;
1302 	}
1303 	return pfd;
1304 }
1305 
1306 int
1307 rte_intr_tls_epfd(void)
1308 {
1309 	if (RTE_PER_LCORE(_epfd) == -1)
1310 		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1311 
1312 	return RTE_PER_LCORE(_epfd);
1313 }
1314 
1315 static int
1316 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1317 	       int maxevents, int timeout, bool interruptible)
1318 {
1319 	struct epoll_event evs[maxevents];
1320 	int rc;
1321 
1322 	if (!events) {
1323 		EAL_LOG(ERR, "rte_epoll_event can't be NULL");
1324 		return -1;
1325 	}
1326 
1327 	/* using per thread epoll fd */
1328 	if (epfd == RTE_EPOLL_PER_THREAD)
1329 		epfd = rte_intr_tls_epfd();
1330 
1331 	while (1) {
1332 		rc = epoll_wait(epfd, evs, maxevents, timeout);
1333 		if (likely(rc > 0)) {
1334 			/* epoll_wait has at least one fd ready to read */
1335 			rc = eal_epoll_process_event(evs, rc, events);
1336 			break;
1337 		} else if (rc < 0) {
1338 			if (errno == EINTR) {
1339 				if (interruptible)
1340 					return -1;
1341 				else
1342 					continue;
1343 			}
1344 			/* epoll_wait fail */
1345 			EAL_LOG(ERR, "epoll_wait failed: %s",
1346 				strerror(errno));
1347 			rc = -1;
1348 			break;
1349 		} else {
1350 			/* rc == 0, epoll_wait timed out */
1351 			break;
1352 		}
1353 	}
1354 
1355 	return rc;
1356 }
1357 
1358 int
1359 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1360 	       int maxevents, int timeout)
1361 {
1362 	return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1363 }
1364 
1365 int
1366 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1367 			     int maxevents, int timeout)
1368 {
1369 	return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1370 }
1371 
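/*
 * Move an event from VALID to INVALID, spinning while the interrupt path
 * still holds it in the EXEC state, so that the caller can safely reuse or
 * free the rte_epoll_event afterwards.
 */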
1372 static inline void
1373 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1374 {
1375 	uint32_t valid_status = RTE_EPOLL_VALID;
1376 
1377 	while (!rte_atomic_compare_exchange_strong_explicit(&ev->status, &valid_status,
1378 		    RTE_EPOLL_INVALID, rte_memory_order_acquire, rte_memory_order_relaxed)) {
1379 		while (rte_atomic_load_explicit(&ev->status,
1380 				rte_memory_order_relaxed) != RTE_EPOLL_VALID)
1381 			rte_pause();
1382 		valid_status = RTE_EPOLL_VALID;
1383 	}
1384 	memset(&ev->epdata, 0, sizeof(ev->epdata));
1385 	ev->fd = -1;
1386 	ev->epfd = -1;
1387 }
1388 
1389 int
1390 rte_epoll_ctl(int epfd, int op, int fd,
1391 	      struct rte_epoll_event *event)
1392 {
1393 	struct epoll_event ev;
1394 
1395 	if (!event) {
1396 		EAL_LOG(ERR, "rte_epoll_event can't be NULL");
1397 		return -1;
1398 	}
1399 
1400 	/* using per thread epoll fd */
1401 	if (epfd == RTE_EPOLL_PER_THREAD)
1402 		epfd = rte_intr_tls_epfd();
1403 
1404 	if (op == EPOLL_CTL_ADD) {
1405 		rte_atomic_store_explicit(&event->status, RTE_EPOLL_VALID,
1406 				rte_memory_order_relaxed);
1407 		event->fd = fd;  /* ignore fd in event */
1408 		event->epfd = epfd;
1409 		ev.data.ptr = (void *)event;
1410 	}
1411 
1412 	ev.events = event->epdata.event;
1413 	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1414 		EAL_LOG(ERR, "Error op %d fd %d epoll_ctl, %s",
1415 			op, fd, strerror(errno));
1416 		if (op == EPOLL_CTL_ADD)
1417 			/* rollback status when CTL_ADD fail */
1418 			rte_atomic_store_explicit(&event->status, RTE_EPOLL_INVALID,
1419 					rte_memory_order_relaxed);
1420 		return -1;
1421 	}
1422 
1423 	if (op == EPOLL_CTL_DEL && rte_atomic_load_explicit(&event->status,
1424 			rte_memory_order_relaxed) != RTE_EPOLL_INVALID)
1425 		eal_epoll_data_safe_free(event);
1426 
1427 	return 0;
1428 }
1429 
1430 int
1431 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1432 		int op, unsigned int vec, void *data)
1433 {
1434 	struct rte_epoll_event *rev;
1435 	struct rte_epoll_data *epdata;
1436 	int epfd_op;
1437 	unsigned int efd_idx;
1438 	int rc = 0;
1439 
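	/* Queue vectors are numbered from RTE_INTR_VEC_RXTX_OFFSET (vector 0
	 * is reserved for the misc/"other" interrupt), so map the vector
	 * number back to an index into the efds[]/elist[] arrays.
	 */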
1440 	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1441 		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1442 
1443 	if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1444 			efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1445 		EAL_LOG(ERR, "Wrong intr vector number.");
1446 		return -EPERM;
1447 	}
1448 
1449 	switch (op) {
1450 	case RTE_INTR_EVENT_ADD:
1451 		epfd_op = EPOLL_CTL_ADD;
1452 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1453 		if (rte_atomic_load_explicit(&rev->status,
1454 				rte_memory_order_relaxed) != RTE_EPOLL_INVALID) {
1455 			EAL_LOG(INFO, "Event already been added.");
1456 			return -EEXIST;
1457 		}
1458 
1459 		/* attach to intr vector fd */
1460 		epdata = &rev->epdata;
1461 		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1462 		epdata->data   = data;
1463 		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1464 		epdata->cb_arg = (void *)intr_handle;
1465 		rc = rte_epoll_ctl(epfd, epfd_op,
1466 			rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1467 		if (!rc)
1468 			EAL_LOG(DEBUG,
1469 				"efd %d associated with vec %d added on epfd %d",
1470 				rev->fd, vec, epfd);
1471 		else
1472 			rc = -EPERM;
1473 		break;
1474 	case RTE_INTR_EVENT_DEL:
1475 		epfd_op = EPOLL_CTL_DEL;
1476 		rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1477 		if (rte_atomic_load_explicit(&rev->status,
1478 				rte_memory_order_relaxed) == RTE_EPOLL_INVALID) {
1479 			EAL_LOG(INFO, "Event does not exist.");
1480 			return -EPERM;
1481 		}
1482 
1483 		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1484 		if (rc)
1485 			rc = -EPERM;
1486 		break;
1487 	default:
1488 		EAL_LOG(ERR, "event op type mismatch");
1489 		rc = -EPERM;
1490 	}
1491 
1492 	return rc;
1493 }
1494 
1495 void
1496 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1497 {
1498 	uint32_t i;
1499 	struct rte_epoll_event *rev;
1500 
1501 	for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1502 		rev = rte_intr_elist_index_get(intr_handle, i);
1503 		if (rte_atomic_load_explicit(&rev->status,
1504 				rte_memory_order_relaxed) == RTE_EPOLL_INVALID)
1505 			continue;
1506 		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1507 			/* force free if the entry valid */
1508 			eal_epoll_data_safe_free(rev);
1509 		}
1510 	}
1511 }
1512 
1513 int
1514 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1515 {
1516 	uint32_t i;
1517 	int fd;
1518 	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1519 
1520 	assert(nb_efd != 0);
1521 
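	/* For VFIO MSI-X, create one non-blocking eventfd per queue vector
	 * (the device fd keeps serving the misc interrupt, hence
	 * NB_OTHER_INTR below).  For VDEV the driver sets up the fds itself,
	 * so only the counter size is validated; for all other handle types
	 * the existing interrupt fd doubles as the single event fd.
	 */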
1522 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1523 		for (i = 0; i < n; i++) {
1524 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1525 			if (fd < 0) {
1526 				EAL_LOG(ERR,
1527 					"can't setup eventfd, error %i (%s)",
1528 					errno, strerror(errno));
1529 				return -errno;
1530 			}
1531 
1532 			if (rte_intr_efds_index_set(intr_handle, i, fd))
1533 				return -rte_errno;
1534 		}
1535 
1536 		if (rte_intr_nb_efd_set(intr_handle, n))
1537 			return -rte_errno;
1538 
1539 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1540 			return -rte_errno;
1541 	} else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1542 		/* only check; initialization is done in the vdev driver. */
1543 		if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1544 		    sizeof(union rte_intr_read_buffer)) {
1545 			EAL_LOG(ERR, "the efd_counter_size is oversized");
1546 			return -EINVAL;
1547 		}
1548 	} else {
1549 		if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1550 			return -rte_errno;
1551 		if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1552 			return -rte_errno;
1553 		if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1554 			return -rte_errno;
1555 	}
1556 
1557 	return 0;
1558 }
1559 
1560 void
1561 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1562 {
1563 	uint32_t i;
1564 
1565 	rte_intr_free_epoll_fd(intr_handle);
1566 	if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1567 		for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1568 			close(rte_intr_efds_index_get(intr_handle, i));
1569 	}
1570 	rte_intr_nb_efd_set(intr_handle, 0);
1571 	rte_intr_max_intr_set(intr_handle, 0);
1572 }
1573 
1574 int
1575 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1576 {
1577 	return !(!rte_intr_nb_efd_get(intr_handle));
1578 }
1579 
1580 int
1581 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1582 {
1583 	if (!rte_intr_dp_is_en(intr_handle))
1584 		return 1;
1585 	else
1586 		return !!(rte_intr_max_intr_get(intr_handle) -
1587 				rte_intr_nb_efd_get(intr_handle));
1588 }
1589 
1590 int
1591 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1592 {
1593 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1594 		return 1;
1595 
1596 	if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1597 		return 1;
1598 
1599 	return 0;
1600 }
1601 
1602 int rte_thread_is_intr(void)
1603 {
1604 	return rte_thread_equal(intr_thread, rte_thread_self());
1605 }
1606