xref: /netbsd-src/sys/kern/sys_epoll.c (revision 2c545067c78a4b84d16735051f9ff75bb33c88e8)
1 /*	$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $	*/
2 
3 /*-
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2007 Roman Divacky
7  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 #include <sys/cdefs.h>
31 __KERNEL_RCSID(0, "$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $");
32 
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/bitops.h>
37 #include <sys/epoll.h>
38 #include <sys/event.h>
39 #include <sys/eventvar.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/filedesc.h>
43 #include <sys/fcntl.h>
44 #include <sys/proc.h>
45 #include <sys/signal.h>
46 #include <sys/vnode.h>
47 
48 #include <sys/syscallargs.h>
49 
50 #define	EPOLL_MAX_DEPTH		5
51 
52 #define	EPOLL_EVRD	(EPOLLIN|EPOLLRDNORM)
53 #define	EPOLL_EVWR	(EPOLLOUT|EPOLLWRNORM)
54 #define	EPOLL_EVSUP	(EPOLLET|EPOLLONESHOT|EPOLLHUP|EPOLLERR|EPOLLPRI \
55 			|EPOLL_EVRD|EPOLL_EVWR|EPOLLRDHUP)
56 
57 #define	kext_data	ext[0]
58 #define	kext_epfd	ext[1]
59 #define	kext_fd		ext[2]
60 
61 #if DEBUG
62 #define	DPRINTF(x) uprintf x
63 #else
64 #define	DPRINTF(x) __nothing
65 #endif
66 
/*
 * One edge in the directed graph of kqueues watching kqueues:
 * the epoll instance (kqueue) epfd watches descriptor fd.
 */
struct epoll_edge {
	int epfd;
	int fd;
};

/* Bitmap of descriptors already visited by the loop-detection DFS. */
__BITMAP_TYPE(epoll_seen, char, 1);
73 
74 static int	epoll_to_kevent(int, int, struct epoll_event *, struct kevent *,
75     int *);
76 static void	kevent_to_epoll(struct kevent *, struct epoll_event *);
77 static int      epoll_kev_put_events(void *, struct kevent *, struct kevent *,
78     size_t, int);
79 static int	epoll_kev_fetch_changes(void *, const struct kevent *,
80     struct kevent *, size_t, int);
81 static int	epoll_kev_fetch_timeout(const void *, void *, size_t);
82 static int	epoll_register_kevent(register_t *, int, int, int,
83     unsigned int);
84 static int	epoll_fd_registered(register_t *, int, int);
85 static int	epoll_delete_all_events(register_t *, int, int);
86 static int	epoll_recover_watch_tree(struct epoll_edge *, size_t, size_t);
87 static int	epoll_dfs(struct epoll_edge *, size_t, struct epoll_seen *,
88     size_t, int, int);
89 static int	epoll_check_loop_and_depth(struct lwp *, int, int);
90 
91 /*
92  * epoll_create1(2).  Parse the flags and then create a kqueue instance.
93  */
94 int
sys_epoll_create1(struct lwp * l,const struct sys_epoll_create1_args * uap,register_t * retval)95 sys_epoll_create1(struct lwp *l, const struct sys_epoll_create1_args *uap,
96     register_t *retval)
97 {
98 	/* {
99 		syscallarg(int) flags;
100 	} */
101 	struct sys_kqueue1_args kqa;
102 
103 	if ((SCARG(uap, flags) & ~(EPOLL_CLOEXEC)) != 0)
104 		return EINVAL;
105 
106 	SCARG(&kqa, flags) = 0;
107 	if (SCARG(uap, flags) & EPOLL_CLOEXEC)
108 		SCARG(&kqa, flags) |= O_CLOEXEC;
109 
110 	return sys_kqueue1(l, &kqa, retval);
111 }
112 
113 /*
114  * Structure converting function from epoll to kevent.
115  */
116 static int
epoll_to_kevent(int epfd,int fd,struct epoll_event * l_event,struct kevent * kevent,int * nkevents)117 epoll_to_kevent(int epfd, int fd, struct epoll_event *l_event,
118     struct kevent *kevent, int *nkevents)
119 {
120 	uint32_t levents = l_event->events;
121 	uint32_t kev_flags = EV_ADD | EV_ENABLE;
122 
123 	/* flags related to how event is registered */
124 	if ((levents & EPOLLONESHOT) != 0)
125 		kev_flags |= EV_DISPATCH;
126 	if ((levents & EPOLLET) != 0)
127 		kev_flags |= EV_CLEAR;
128 	if ((levents & EPOLLERR) != 0)
129 		kev_flags |= EV_ERROR;
130 	if ((levents & EPOLLRDHUP) != 0)
131 		kev_flags |= EV_EOF;
132 
133 	/* flags related to what event is registered */
134 	if ((levents & EPOLL_EVRD) != 0) {
135 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
136 		kevent->kext_data = l_event->data;
137 		kevent->kext_epfd = epfd;
138 		kevent->kext_fd = fd;
139 		++kevent;
140 		++(*nkevents);
141 	}
142 	if ((levents & EPOLL_EVWR) != 0) {
143 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
144 		kevent->kext_data = l_event->data;
145 		kevent->kext_epfd = epfd;
146 		kevent->kext_fd = fd;
147 		++kevent;
148 		++(*nkevents);
149 	}
150 	/* zero event mask is legal */
151 	if ((levents & (EPOLL_EVRD | EPOLL_EVWR)) == 0) {
152 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
153 		++(*nkevents);
154 	}
155 
156 	if ((levents & ~(EPOLL_EVSUP)) != 0) {
157 		return EINVAL;
158 	}
159 
160 	return 0;
161 }
162 
163 /*
164  * Structure converting function from kevent to epoll. In a case
165  * this is called on error in registration we store the error in
166  * event->data and pick it up later in sys_epoll_ctl().
167  */
168 static void
kevent_to_epoll(struct kevent * kevent,struct epoll_event * l_event)169 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
170 {
171 
172 	l_event->data = kevent->kext_data;
173 
174 	if ((kevent->flags & EV_ERROR) != 0) {
175 		l_event->events = EPOLLERR;
176 		return;
177 	}
178 
179 	/* XXX EPOLLPRI, EPOLLHUP */
180 	switch (kevent->filter) {
181 	case EVFILT_READ:
182 		l_event->events = EPOLLIN;
183 		if ((kevent->flags & EV_EOF) != 0)
184 			l_event->events |= EPOLLRDHUP;
185 		break;
186 	case EVFILT_WRITE:
187 		l_event->events = EPOLLOUT;
188 		break;
189 	default:
190 		DPRINTF(("%s: unhandled kevent filter %d\n", __func__,
191 		    kevent->filter));
192 		break;
193 	}
194 }
195 
196 /*
197  * Copyout callback used by kevent.  This converts kevent events to
198  * epoll events that are located in args->eventlist.
199  */
200 static int
epoll_kev_put_events(void * ctx,struct kevent * events,struct kevent * eventlist,size_t index,int n)201 epoll_kev_put_events(void *ctx, struct kevent *events,
202     struct kevent *eventlist, size_t index, int n)
203 {
204 	int i;
205 	struct epoll_event *eep = (struct epoll_event *)eventlist;
206 
207 	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
208 
209 	for (i = 0; i < n; i++)
210 		kevent_to_epoll(events + i, eep + index + i);
211 
212 	return 0;
213 }
214 
215 /*
216  * Copyin callback used by kevent. This copies already
217  * converted filters from kernel memory to the kevent
218  * internal kernel memory. Hence the memcpy instead of
219  * copyin.
220  */
221 static int
epoll_kev_fetch_changes(void * ctx,const struct kevent * changelist,struct kevent * changes,size_t index,int n)222 epoll_kev_fetch_changes(void *ctx, const struct kevent *changelist,
223     struct kevent *changes, size_t index, int n)
224 {
225 	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
226 
227 	memcpy(changes, changelist + index, n * sizeof(*changes));
228 
229 	return 0;
230 }
231 
232 /*
233  * Timer copy callback used by kevent.  Copies a converted timeout
234  * from kernel memory to kevent memory.  Hence the memcpy instead of
235  * just using copyin.
236  */
237 static int
epoll_kev_fetch_timeout(const void * src,void * dest,size_t size)238 epoll_kev_fetch_timeout(const void *src, void *dest, size_t size)
239 {
240 	memcpy(dest, src, size);
241 
242 	return 0;
243 }
244 
245 /*
246  * Load epoll filter, convert it to kevent filter and load it into
247  * kevent subsystem.
248  *
249  * event must point to kernel memory or be NULL.
250  */
251 int
epoll_ctl_common(struct lwp * l,register_t * retval,int epfd,int op,int fd,struct epoll_event * event)252 epoll_ctl_common(struct lwp *l, register_t *retval, int epfd, int op, int fd,
253     struct epoll_event *event)
254 {
255 	struct kevent kev[2];
256 	struct kevent_ops k_ops = {
257 		.keo_private = NULL,
258 		.keo_fetch_timeout = NULL,
259 		.keo_fetch_changes = epoll_kev_fetch_changes,
260 		.keo_put_events = NULL,
261 	};
262 	file_t *epfp, *fp;
263 	int error = 0;
264 	int nchanges = 0;
265 
266 	/*
267 	 * Need to validate epfd and fd separately from kevent1 to match
268 	 * Linux's errno behaviour.
269 	 */
270 	epfp = fd_getfile(epfd);
271 	if (epfp == NULL)
272 		return EBADF;
273 	if (epfp->f_type != DTYPE_KQUEUE)
274 		error = EINVAL;
275 	fd_putfile(epfd);
276 	if (error != 0)
277 		return error;
278 
279 	fp = fd_getfile(fd);
280 	if (fp == NULL)
281 		return EBADF;
282 	if (fp->f_type == DTYPE_VNODE) {
283 		switch (fp->f_vnode->v_type) {
284 		case VREG:
285 		case VDIR:
286 		case VBLK:
287 		case VLNK:
288 			error = EPERM;
289 			break;
290 
291 		default:
292 			break;
293 		}
294 	}
295 	fd_putfile(fd);
296 	if (error != 0)
297 		return error;
298 
299 	/* Linux disallows spying on himself */
300 	if (epfd == fd) {
301 		return EINVAL;
302 	}
303 
304 	if (op != EPOLL_CTL_DEL) {
305 		error = epoll_to_kevent(epfd, fd, event, kev, &nchanges);
306 		if (error != 0)
307 			return error;
308 	}
309 
310 	switch (op) {
311 	case EPOLL_CTL_MOD:
312 		error = epoll_delete_all_events(retval, epfd, fd);
313 		if (error != 0)
314 			return error;
315 		break;
316 
317 	case EPOLL_CTL_ADD:
318 		if (epoll_fd_registered(retval, epfd, fd))
319 			return EEXIST;
320 		error = epoll_check_loop_and_depth(l, epfd, fd);
321 		if (error != 0)
322 			return error;
323 		break;
324 
325 	case EPOLL_CTL_DEL:
326 		/* CTL_DEL means unregister this fd with this epoll */
327 		return epoll_delete_all_events(retval, epfd, fd);
328 
329 	default:
330 		DPRINTF(("%s: invalid op %d\n", __func__, op));
331 		return EINVAL;
332 	}
333 
334 	error = kevent1(retval, epfd, kev, nchanges, NULL, 0, NULL, &k_ops);
335 
336 	if (error == EOPNOTSUPP) {
337 		error = EPERM;
338 	}
339 
340 	return error;
341 }
342 
343 /*
344  * epoll_ctl(2).  Copyin event if necessary and then call
345  * epoll_ctl_common().
346  */
347 int
sys_epoll_ctl(struct lwp * l,const struct sys_epoll_ctl_args * uap,register_t * retval)348 sys_epoll_ctl(struct lwp *l, const struct sys_epoll_ctl_args *uap,
349     register_t *retval)
350 {
351 	/* {
352 		syscallarg(int) epfd;
353 		syscallarg(int) op;
354 		syscallarg(int) fd;
355 		syscallarg(struct epoll_event *) event;
356 	} */
357 	struct epoll_event ee;
358 	struct epoll_event *eep;
359 	int error;
360 
361 	if (SCARG(uap, op) != EPOLL_CTL_DEL) {
362 		error = copyin(SCARG(uap, event), &ee, sizeof(ee));
363 		if (error != 0)
364 			return error;
365 
366 		eep = &ee;
367 	} else
368 		eep = NULL;
369 
370 	return epoll_ctl_common(l, retval, SCARG(uap, epfd), SCARG(uap, op),
371 	    SCARG(uap, fd), eep);
372 }
373 
374 /*
375  * Wait for a filter to be triggered on the epoll file descriptor.
376  * All of the epoll_*wait* syscalls eventually end up here.
377  *
378  * events, nss, and ssp must point to kernel memory (or be NULL).
379  */
380 int
epoll_wait_common(struct lwp * l,register_t * retval,int epfd,struct epoll_event * events,int maxevents,struct timespec * tsp,const sigset_t * nssp)381 epoll_wait_common(struct lwp *l, register_t *retval, int epfd,
382     struct epoll_event *events, int maxevents, struct timespec *tsp,
383     const sigset_t *nssp)
384 {
385 	struct kevent_ops k_ops = {
386 	        .keo_private = NULL,
387 		.keo_fetch_timeout = epoll_kev_fetch_timeout,
388 		.keo_fetch_changes = NULL,
389 		.keo_put_events = epoll_kev_put_events,
390 	};
391 	struct proc *p = l->l_proc;
392 	file_t *epfp;
393 	sigset_t oss;
394 	int error = 0;
395 
396 	if (maxevents <= 0 || maxevents > EPOLL_MAX_EVENTS)
397 		return EINVAL;
398 
399 	/*
400 	 * Need to validate epfd separately from kevent1 to match
401 	 * Linux's errno behaviour.
402 	 */
403 	epfp = fd_getfile(epfd);
404 	if (epfp == NULL)
405 		return EBADF;
406 	if (epfp->f_type != DTYPE_KQUEUE)
407 		error = EINVAL;
408 	fd_putfile(epfd);
409 	if (error != 0)
410 		return error;
411 
412 	if (nssp != NULL) {
413 		mutex_enter(p->p_lock);
414 		error = sigprocmask1(l, SIG_SETMASK, nssp, &oss);
415 		mutex_exit(p->p_lock);
416 		if (error != 0)
417 			return error;
418 	}
419 
420 	error = kevent1(retval, epfd, NULL, 0, (struct kevent *)events,
421 	    maxevents, tsp, &k_ops);
422 	/*
423 	 * Since we're not registering nay events, ENOMEM should not
424 	 * be possible for this specific kevent1 call.
425 	 */
426 	KASSERT(error != ENOMEM);
427 
428 	if (nssp != NULL) {
429 	        mutex_enter(p->p_lock);
430 		error = sigprocmask1(l, SIG_SETMASK, &oss, NULL);
431 		mutex_exit(p->p_lock);
432 	}
433 
434 	return error;
435 }
436 
437 /*
438  * epoll_pwait2(2).
439  */
440 int
sys_epoll_pwait2(struct lwp * l,const struct sys_epoll_pwait2_args * uap,register_t * retval)441 sys_epoll_pwait2(struct lwp *l, const struct sys_epoll_pwait2_args *uap,
442     register_t *retval)
443 {
444 	/* {
445 		syscallarg(int) epfd;
446 		syscallarg(struct epoll_event *) events;
447 		syscallarg(int) maxevents;
448 		syscallarg(struct timespec *) timeout;
449 		syscallarg(sigset_t *) sigmask;
450 	} */
451 	struct epoll_event *events;
452 	struct timespec ts, *tsp;
453 	sigset_t ss, *ssp;
454 	int error;
455 	const int maxevents = SCARG(uap, maxevents);
456 
457 	if (maxevents <= 0 || maxevents >= EPOLL_MAX_EVENTS)
458 		return EINVAL;
459 
460 	if (SCARG(uap, timeout) != NULL) {
461 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
462 		if (error != 0)
463 			return error;
464 
465 		tsp = &ts;
466 	} else
467 		tsp = NULL;
468 
469 	if (SCARG(uap, sigmask) != NULL) {
470 		error = copyin(SCARG(uap, sigmask), &ss, sizeof(ss));
471 		if (error != 0)
472 			return error;
473 
474 		ssp = &ss;
475 	} else
476 		ssp = NULL;
477 
478 	events = kmem_alloc(maxevents * sizeof(*events), KM_SLEEP);
479 
480 	error = epoll_wait_common(l, retval, SCARG(uap, epfd), events,
481 	    maxevents, tsp, ssp);
482 	if (error == 0)
483 		error = copyout(events, SCARG(uap, events),
484 		    *retval * sizeof(*events));
485 
486 	kmem_free(events, maxevents * sizeof(*events));
487 	return error;
488 }
489 
490 /*
491  * Helper that registers a single kevent.
492  */
493 static int
epoll_register_kevent(register_t * retval,int epfd,int fd,int filter,unsigned int flags)494 epoll_register_kevent(register_t *retval, int epfd, int fd, int filter,
495     unsigned int flags)
496 {
497 	struct kevent kev;
498 	struct kevent_ops k_ops = {
499 		.keo_private = NULL,
500 		.keo_fetch_timeout = NULL,
501 		.keo_fetch_changes = epoll_kev_fetch_changes,
502 		.keo_put_events = NULL,
503 	};
504 
505 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
506 
507 	return kevent1(retval, epfd, &kev, 1, NULL, 0, NULL, &k_ops);
508 }
509 
510 /*
511  * Check if an fd is already registered in the kqueue referenced by epfd.
512  */
513 static int
epoll_fd_registered(register_t * retval,int epfd,int fd)514 epoll_fd_registered(register_t *retval, int epfd, int fd)
515 {
516 	/*
517 	 * Set empty filter flags to avoid accidental modification of already
518 	 * registered events. In the case of event re-registration:
519 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
520 	 * 2. If event does exists, it's enabled/disabled state is preserved
521 	 *    but fflags, data and udata fields are overwritten. So we can not
522 	 *    set socket lowats and store user's context pointer in udata.
523 	 */
524 	if (epoll_register_kevent(retval, epfd, fd, EVFILT_READ, 0) != ENOENT ||
525 	    epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE, 0) != ENOENT)
526 		return 1;
527 
528 	return 0;
529 }
530 
531 /*
532  * Remove all events in the kqueue referenced by epfd that depend on
533  * fd.
534  */
535 static int
epoll_delete_all_events(register_t * retval,int epfd,int fd)536 epoll_delete_all_events(register_t *retval, int epfd, int fd)
537 {
538 	int error1, error2;
539 
540 	error1 = epoll_register_kevent(retval, epfd, fd, EVFILT_READ,
541 	    EV_DELETE);
542 	error2 = epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE,
543 	    EV_DELETE);
544 
545 	/* return 0 if at least one result positive */
546 	return error1 == 0 ? 0 : error2;
547 }
548 
549 /*
550  * Interate through all the knotes and recover a directed graph on
551  * which kqueues are watching each other.
552  *
553  * If edges is NULL, the number of edges is still counted but no graph
554  * is assembled.
555  */
556 static int
epoll_recover_watch_tree(struct epoll_edge * edges,size_t nedges,size_t nfds)557 epoll_recover_watch_tree(struct epoll_edge *edges, size_t nedges, size_t nfds) {
558 	file_t *currfp, *targetfp;
559 	struct knote *kn, *tmpkn;
560 	size_t i, nedges_so_far = 0;
561 
562 	for (i = 0; i < nfds && (edges == NULL || nedges_so_far < nedges); i++)
563 	{
564 		currfp = fd_getfile(i);
565 		if (currfp == NULL)
566 			continue;
567 		if (currfp->f_type != DTYPE_KQUEUE)
568 			goto continue_count_outer;
569 
570 		SLIST_FOREACH_SAFE(kn, &currfp->f_kqueue->kq_sel.sel_klist,
571 		    kn_selnext, tmpkn) {
572 			targetfp = fd_getfile(kn->kn_kevent.kext_epfd);
573 			if (targetfp == NULL)
574 				continue;
575 			if (targetfp->f_type == DTYPE_KQUEUE) {
576 				if (edges != NULL) {
577 					edges[nedges_so_far].epfd =
578 					    kn->kn_kevent.kext_epfd;
579 					edges[nedges_so_far].fd =
580 					    kn->kn_kevent.kext_fd;
581 				}
582 				nedges_so_far++;
583 			}
584 
585 			fd_putfile(kn->kn_kevent.kext_epfd);
586 		}
587 
588 continue_count_outer:
589 		fd_putfile(i);
590 	}
591 
592 	return nedges_so_far;
593 }
594 
595 /*
596  * Run dfs on the graph described by edges, checking for loops and a
597  * depth greater than EPOLL_MAX_DEPTH.
598  */
599 static int
epoll_dfs(struct epoll_edge * edges,size_t nedges,struct epoll_seen * seen,size_t nseen,int currfd,int depth)600 epoll_dfs(struct epoll_edge *edges, size_t nedges, struct epoll_seen *seen,
601     size_t nseen, int currfd, int depth)
602 {
603 	int error;
604 	size_t i;
605 
606 	KASSERT(edges != NULL);
607 	KASSERT(seen != NULL);
608 	KASSERT(nedges > 0);
609 	KASSERT(currfd < nseen);
610 	KASSERT(0 <= depth && depth <= EPOLL_MAX_DEPTH + 1);
611 
612 	if (__BITMAP_ISSET(currfd, seen))
613 		return ELOOP;
614 
615 	__BITMAP_SET(currfd, seen);
616 
617 	depth++;
618 	if (depth > EPOLL_MAX_DEPTH)
619 		return EINVAL;
620 
621 	for (i = 0; i < nedges; i++) {
622 		if (edges[i].epfd != currfd)
623 			continue;
624 
625 		error = epoll_dfs(edges, nedges, seen, nseen,
626 		    edges[i].fd, depth);
627 		if (error != 0)
628 			return error;
629 	}
630 
631 	return 0;
632 }
633 
634 /*
635  * Check if adding fd to epfd would violate the maximum depth or
636  * create a loop.
637  */
638 static int
epoll_check_loop_and_depth(struct lwp * l,int epfd,int fd)639 epoll_check_loop_and_depth(struct lwp *l, int epfd, int fd)
640 {
641 	int error;
642 	file_t *fp;
643 	struct epoll_edge *edges;
644 	struct epoll_seen *seen;
645 	size_t nedges, nfds, seen_size;
646 	bool fdirrelevant;
647 
648 	/* If the target isn't another kqueue, we can skip this check */
649 	fp = fd_getfile(fd);
650 	if (fp == NULL)
651 		return 0;
652 	fdirrelevant = fp->f_type != DTYPE_KQUEUE;
653 	fd_putfile(fd);
654 	if (fdirrelevant)
655 		return 0;
656 
657 	nfds = l->l_proc->p_fd->fd_lastfile + 1;
658 
659 	/*
660 	 * We call epoll_recover_watch_tree twice, once to find the
661 	 * number of edges, and once to actually fill them in.  We add one
662 	 * because we want to include the edge epfd->fd.
663 	 */
664 	nedges = 1 + epoll_recover_watch_tree(NULL, 0, nfds);
665 
666 	edges = kmem_zalloc(nedges * sizeof(*edges), KM_SLEEP);
667 
668 	epoll_recover_watch_tree(edges + 1, nedges - 1, nfds);
669 
670 	edges[0].epfd = epfd;
671 	edges[0].fd = fd;
672 
673 	seen_size = __BITMAP_SIZE(char, nfds);
674 	seen = kmem_zalloc(seen_size, KM_SLEEP);
675 
676 	error = epoll_dfs(edges, nedges, seen, nfds, epfd, 0);
677 
678 	kmem_free(seen, seen_size);
679 	kmem_free(edges, nedges * sizeof(*edges));
680 
681 	return error;
682 }
683