xref: /openbsd-src/sys/kern/kern_event.c (revision d3f5ce7613848b361ec06b8f2d44ad34c6022628)
1 /*	$OpenBSD: kern_event.c,v 1.190 2022/06/20 01:39:44 visa Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/proc.h>
35 #include <sys/pledge.h>
36 #include <sys/malloc.h>
37 #include <sys/unistd.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/fcntl.h>
41 #include <sys/selinfo.h>
42 #include <sys/queue.h>
43 #include <sys/event.h>
44 #include <sys/eventvar.h>
45 #include <sys/ktrace.h>
46 #include <sys/pool.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/stat.h>
50 #include <sys/uio.h>
51 #include <sys/mount.h>
52 #include <sys/syscallargs.h>
53 #include <sys/time.h>
54 #include <sys/timeout.h>
55 #include <sys/vnode.h>
56 #include <sys/wait.h>
57 
58 #ifdef DIAGNOSTIC
59 #define KLIST_ASSERT_LOCKED(kl) do {					\
60 	if ((kl)->kl_ops != NULL)					\
61 		(kl)->kl_ops->klo_assertlk((kl)->kl_arg);		\
62 	else								\
63 		KERNEL_ASSERT_LOCKED();					\
64 } while (0)
65 #else
66 #define KLIST_ASSERT_LOCKED(kl)	((void)(kl))
67 #endif
68 
69 struct	kqueue *kqueue_alloc(struct filedesc *);
70 void	kqueue_terminate(struct proc *p, struct kqueue *);
71 void	KQREF(struct kqueue *);
72 void	KQRELE(struct kqueue *);
73 
74 void	kqueue_purge(struct proc *, struct kqueue *);
75 int	kqueue_sleep(struct kqueue *, struct timespec *);
76 
77 int	kqueue_read(struct file *, struct uio *, int);
78 int	kqueue_write(struct file *, struct uio *, int);
79 int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
80 		    struct proc *p);
81 int	kqueue_kqfilter(struct file *fp, struct knote *kn);
82 int	kqueue_stat(struct file *fp, struct stat *st, struct proc *p);
83 int	kqueue_close(struct file *fp, struct proc *p);
84 void	kqueue_wakeup(struct kqueue *kq);
85 
86 #ifdef KQUEUE_DEBUG
87 void	kqueue_do_check(struct kqueue *kq, const char *func, int line);
88 #define kqueue_check(kq)	kqueue_do_check((kq), __func__, __LINE__)
89 #else
90 #define kqueue_check(kq)	do {} while (0)
91 #endif
92 
93 static int	filter_attach(struct knote *kn);
94 static void	filter_detach(struct knote *kn);
95 static int	filter_event(struct knote *kn, long hint);
96 static int	filter_modify(struct kevent *kev, struct knote *kn);
97 static int	filter_process(struct knote *kn, struct kevent *kev);
98 static void	kqueue_expand_hash(struct kqueue *kq);
99 static void	kqueue_expand_list(struct kqueue *kq, int fd);
100 static void	kqueue_task(void *);
101 static int	klist_lock(struct klist *);
102 static void	klist_unlock(struct klist *, int);
103 
104 const struct fileops kqueueops = {
105 	.fo_read	= kqueue_read,
106 	.fo_write	= kqueue_write,
107 	.fo_ioctl	= kqueue_ioctl,
108 	.fo_kqfilter	= kqueue_kqfilter,
109 	.fo_stat	= kqueue_stat,
110 	.fo_close	= kqueue_close
111 };
112 
113 void	knote_attach(struct knote *kn);
114 void	knote_detach(struct knote *kn);
115 void	knote_drop(struct knote *kn, struct proc *p);
116 void	knote_enqueue(struct knote *kn);
117 void	knote_dequeue(struct knote *kn);
118 int	knote_acquire(struct knote *kn, struct klist *, int);
119 void	knote_release(struct knote *kn);
120 void	knote_activate(struct knote *kn);
121 void	knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist,
122 	    int idx, int purge);
123 
124 void	filt_kqdetach(struct knote *kn);
125 int	filt_kqueue(struct knote *kn, long hint);
126 int	filt_kqueuemodify(struct kevent *kev, struct knote *kn);
127 int	filt_kqueueprocess(struct knote *kn, struct kevent *kev);
128 int	filt_kqueue_common(struct knote *kn, struct kqueue *kq);
129 int	filt_procattach(struct knote *kn);
130 void	filt_procdetach(struct knote *kn);
131 int	filt_proc(struct knote *kn, long hint);
132 int	filt_fileattach(struct knote *kn);
133 void	filt_timerexpire(void *knx);
134 int	filt_timerattach(struct knote *kn);
135 void	filt_timerdetach(struct knote *kn);
136 int	filt_timermodify(struct kevent *kev, struct knote *kn);
137 int	filt_timerprocess(struct knote *kn, struct kevent *kev);
138 void	filt_seltruedetach(struct knote *kn);
139 
140 const struct filterops kqread_filtops = {
141 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
142 	.f_attach	= NULL,
143 	.f_detach	= filt_kqdetach,
144 	.f_event	= filt_kqueue,
145 	.f_modify	= filt_kqueuemodify,
146 	.f_process	= filt_kqueueprocess,
147 };
148 
149 const struct filterops proc_filtops = {
150 	.f_flags	= 0,
151 	.f_attach	= filt_procattach,
152 	.f_detach	= filt_procdetach,
153 	.f_event	= filt_proc,
154 };
155 
156 const struct filterops file_filtops = {
157 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
158 	.f_attach	= filt_fileattach,
159 	.f_detach	= NULL,
160 	.f_event	= NULL,
161 };
162 
163 const struct filterops timer_filtops = {
164 	.f_flags	= 0,
165 	.f_attach	= filt_timerattach,
166 	.f_detach	= filt_timerdetach,
167 	.f_event	= NULL,
168 	.f_modify	= filt_timermodify,
169 	.f_process	= filt_timerprocess,
170 };
171 
172 struct	pool knote_pool;
173 struct	pool kqueue_pool;
174 struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
175 int kq_ntimeouts = 0;
176 int kq_timeoutmax = (4 * 1024);
177 
178 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
179 
180 /*
181  * Table for for all system-defined filters.
182  */
183 const struct filterops *const sysfilt_ops[] = {
184 	&file_filtops,			/* EVFILT_READ */
185 	&file_filtops,			/* EVFILT_WRITE */
186 	NULL, /*&aio_filtops,*/		/* EVFILT_AIO */
187 	&file_filtops,			/* EVFILT_VNODE */
188 	&proc_filtops,			/* EVFILT_PROC */
189 	&sig_filtops,			/* EVFILT_SIGNAL */
190 	&timer_filtops,			/* EVFILT_TIMER */
191 	&file_filtops,			/* EVFILT_DEVICE */
192 	&file_filtops,			/* EVFILT_EXCEPT */
193 };
194 
195 void
196 KQREF(struct kqueue *kq)
197 {
198 	refcnt_take(&kq->kq_refcnt);
199 }
200 
201 void
202 KQRELE(struct kqueue *kq)
203 {
204 	struct filedesc *fdp;
205 
206 	if (refcnt_rele(&kq->kq_refcnt) == 0)
207 		return;
208 
209 	fdp = kq->kq_fdp;
210 	if (rw_status(&fdp->fd_lock) == RW_WRITE) {
211 		LIST_REMOVE(kq, kq_next);
212 	} else {
213 		fdplock(fdp);
214 		LIST_REMOVE(kq, kq_next);
215 		fdpunlock(fdp);
216 	}
217 
218 	KASSERT(TAILQ_EMPTY(&kq->kq_head));
219 	KASSERT(kq->kq_nknotes == 0);
220 
221 	free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
222 	    sizeof(struct knlist));
223 	hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
224 	klist_free(&kq->kq_sel.si_note);
225 	pool_put(&kqueue_pool, kq);
226 }
227 
228 void
229 kqueue_init(void)
230 {
231 	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR,
232 	    PR_WAITOK, "kqueuepl", NULL);
233 	pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR,
234 	    PR_WAITOK, "knotepl", NULL);
235 }
236 
237 void
238 kqueue_init_percpu(void)
239 {
240 	pool_cache_init(&knote_pool);
241 }
242 
243 int
244 filt_fileattach(struct knote *kn)
245 {
246 	struct file *fp = kn->kn_fp;
247 
248 	return fp->f_ops->fo_kqfilter(fp, kn);
249 }
250 
251 int
252 kqueue_kqfilter(struct file *fp, struct knote *kn)
253 {
254 	struct kqueue *kq = kn->kn_fp->f_data;
255 
256 	if (kn->kn_filter != EVFILT_READ)
257 		return (EINVAL);
258 
259 	kn->kn_fop = &kqread_filtops;
260 	klist_insert(&kq->kq_sel.si_note, kn);
261 	return (0);
262 }
263 
264 void
265 filt_kqdetach(struct knote *kn)
266 {
267 	struct kqueue *kq = kn->kn_fp->f_data;
268 
269 	klist_remove(&kq->kq_sel.si_note, kn);
270 }
271 
272 int
273 filt_kqueue_common(struct knote *kn, struct kqueue *kq)
274 {
275 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
276 
277 	kn->kn_data = kq->kq_count;
278 
279 	return (kn->kn_data > 0);
280 }
281 
282 int
283 filt_kqueue(struct knote *kn, long hint)
284 {
285 	struct kqueue *kq = kn->kn_fp->f_data;
286 	int active;
287 
288 	mtx_enter(&kq->kq_lock);
289 	active = filt_kqueue_common(kn, kq);
290 	mtx_leave(&kq->kq_lock);
291 
292 	return (active);
293 }
294 
295 int
296 filt_kqueuemodify(struct kevent *kev, struct knote *kn)
297 {
298 	struct kqueue *kq = kn->kn_fp->f_data;
299 	int active;
300 
301 	mtx_enter(&kq->kq_lock);
302 	knote_assign(kev, kn);
303 	active = filt_kqueue_common(kn, kq);
304 	mtx_leave(&kq->kq_lock);
305 
306 	return (active);
307 }
308 
309 int
310 filt_kqueueprocess(struct knote *kn, struct kevent *kev)
311 {
312 	struct kqueue *kq = kn->kn_fp->f_data;
313 	int active;
314 
315 	mtx_enter(&kq->kq_lock);
316 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
317 		active = 1;
318 	else
319 		active = filt_kqueue_common(kn, kq);
320 	if (active)
321 		knote_submit(kn, kev);
322 	mtx_leave(&kq->kq_lock);
323 
324 	return (active);
325 }
326 
327 int
328 filt_procattach(struct knote *kn)
329 {
330 	struct process *pr;
331 	int s;
332 
333 	if ((curproc->p_p->ps_flags & PS_PLEDGE) &&
334 	    (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0)
335 		return pledge_fail(curproc, EPERM, PLEDGE_PROC);
336 
337 	if (kn->kn_id > PID_MAX)
338 		return ESRCH;
339 
340 	pr = prfind(kn->kn_id);
341 	if (pr == NULL)
342 		return (ESRCH);
343 
344 	/* exiting processes can't be specified */
345 	if (pr->ps_flags & PS_EXITING)
346 		return (ESRCH);
347 
348 	kn->kn_ptr.p_process = pr;
349 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
350 
351 	/*
352 	 * internal flag indicating registration done by kernel
353 	 */
354 	if (kn->kn_flags & EV_FLAG1) {
355 		kn->kn_data = kn->kn_sdata;		/* ppid */
356 		kn->kn_fflags = NOTE_CHILD;
357 		kn->kn_flags &= ~EV_FLAG1;
358 	}
359 
360 	s = splhigh();
361 	klist_insert_locked(&pr->ps_klist, kn);
362 	splx(s);
363 
364 	return (0);
365 }
366 
367 /*
368  * The knote may be attached to a different process, which may exit,
369  * leaving nothing for the knote to be attached to.  So when the process
370  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
371  * it will be deleted when read out.  However, as part of the knote deletion,
372  * this routine is called, so a check is needed to avoid actually performing
373  * a detach, because the original process does not exist any more.
374  */
375 void
376 filt_procdetach(struct knote *kn)
377 {
378 	struct kqueue *kq = kn->kn_kq;
379 	struct process *pr = kn->kn_ptr.p_process;
380 	int s, status;
381 
382 	mtx_enter(&kq->kq_lock);
383 	status = kn->kn_status;
384 	mtx_leave(&kq->kq_lock);
385 
386 	if (status & KN_DETACHED)
387 		return;
388 
389 	s = splhigh();
390 	klist_remove_locked(&pr->ps_klist, kn);
391 	splx(s);
392 }
393 
394 int
395 filt_proc(struct knote *kn, long hint)
396 {
397 	struct kqueue *kq = kn->kn_kq;
398 	u_int event;
399 
400 	/*
401 	 * mask off extra data
402 	 */
403 	event = (u_int)hint & NOTE_PCTRLMASK;
404 
405 	/*
406 	 * if the user is interested in this event, record it.
407 	 */
408 	if (kn->kn_sfflags & event)
409 		kn->kn_fflags |= event;
410 
411 	/*
412 	 * process is gone, so flag the event as finished and remove it
413 	 * from the process's klist
414 	 */
415 	if (event == NOTE_EXIT) {
416 		struct process *pr = kn->kn_ptr.p_process;
417 		int s;
418 
419 		mtx_enter(&kq->kq_lock);
420 		kn->kn_status |= KN_DETACHED;
421 		mtx_leave(&kq->kq_lock);
422 
423 		s = splhigh();
424 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
425 		kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
426 		klist_remove_locked(&pr->ps_klist, kn);
427 		splx(s);
428 		return (1);
429 	}
430 
431 	/*
432 	 * process forked, and user wants to track the new process,
433 	 * so attach a new knote to it, and immediately report an
434 	 * event with the parent's pid.
435 	 */
436 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
437 		struct kevent kev;
438 		int error;
439 
440 		/*
441 		 * register knote with new process.
442 		 */
443 		memset(&kev, 0, sizeof(kev));
444 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
445 		kev.filter = kn->kn_filter;
446 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
447 		kev.fflags = kn->kn_sfflags;
448 		kev.data = kn->kn_id;			/* parent */
449 		kev.udata = kn->kn_udata;		/* preserve udata */
450 		error = kqueue_register(kq, &kev, 0, NULL);
451 		if (error)
452 			kn->kn_fflags |= NOTE_TRACKERR;
453 	}
454 
455 	return (kn->kn_fflags != 0);
456 }
457 
458 static void
459 filt_timer_timeout_add(struct knote *kn)
460 {
461 	struct timeval tv;
462 	struct timeout *to = kn->kn_hook;
463 	int tticks;
464 
465 	tv.tv_sec = kn->kn_sdata / 1000;
466 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
467 	tticks = tvtohz(&tv);
468 	/* Remove extra tick from tvtohz() if timeout has fired before. */
469 	if (timeout_triggered(to))
470 		tticks--;
471 	timeout_add(to, (tticks > 0) ? tticks : 1);
472 }
473 
474 void
475 filt_timerexpire(void *knx)
476 {
477 	struct knote *kn = knx;
478 	struct kqueue *kq = kn->kn_kq;
479 
480 	kn->kn_data++;
481 	mtx_enter(&kq->kq_lock);
482 	knote_activate(kn);
483 	mtx_leave(&kq->kq_lock);
484 
485 	if ((kn->kn_flags & EV_ONESHOT) == 0)
486 		filt_timer_timeout_add(kn);
487 }
488 
489 
490 /*
491  * data contains amount of time to sleep, in milliseconds
492  */
493 int
494 filt_timerattach(struct knote *kn)
495 {
496 	struct timeout *to;
497 
498 	if (kq_ntimeouts > kq_timeoutmax)
499 		return (ENOMEM);
500 	kq_ntimeouts++;
501 
502 	kn->kn_flags |= EV_CLEAR;	/* automatically set */
503 	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
504 	timeout_set(to, filt_timerexpire, kn);
505 	kn->kn_hook = to;
506 	filt_timer_timeout_add(kn);
507 
508 	return (0);
509 }
510 
511 void
512 filt_timerdetach(struct knote *kn)
513 {
514 	struct timeout *to;
515 
516 	to = (struct timeout *)kn->kn_hook;
517 	timeout_del_barrier(to);
518 	free(to, M_KEVENT, sizeof(*to));
519 	kq_ntimeouts--;
520 }
521 
522 int
523 filt_timermodify(struct kevent *kev, struct knote *kn)
524 {
525 	struct kqueue *kq = kn->kn_kq;
526 	struct timeout *to = kn->kn_hook;
527 
528 	/* Reset the timer. Any pending events are discarded. */
529 
530 	timeout_del_barrier(to);
531 
532 	mtx_enter(&kq->kq_lock);
533 	if (kn->kn_status & KN_QUEUED)
534 		knote_dequeue(kn);
535 	kn->kn_status &= ~KN_ACTIVE;
536 	mtx_leave(&kq->kq_lock);
537 
538 	kn->kn_data = 0;
539 	knote_assign(kev, kn);
540 	/* Reinit timeout to invoke tick adjustment again. */
541 	timeout_set(to, filt_timerexpire, kn);
542 	filt_timer_timeout_add(kn);
543 
544 	return (0);
545 }
546 
547 int
548 filt_timerprocess(struct knote *kn, struct kevent *kev)
549 {
550 	int active, s;
551 
552 	s = splsoftclock();
553 	active = (kn->kn_data != 0);
554 	if (active)
555 		knote_submit(kn, kev);
556 	splx(s);
557 
558 	return (active);
559 }
560 
561 
562 /*
563  * filt_seltrue:
564  *
565  *	This filter "event" routine simulates seltrue().
566  */
567 int
568 filt_seltrue(struct knote *kn, long hint)
569 {
570 
571 	/*
572 	 * We don't know how much data can be read/written,
573 	 * but we know that it *can* be.  This is about as
574 	 * good as select/poll does as well.
575 	 */
576 	kn->kn_data = 0;
577 	return (1);
578 }
579 
580 int
581 filt_seltruemodify(struct kevent *kev, struct knote *kn)
582 {
583 	knote_assign(kev, kn);
584 	return (kn->kn_fop->f_event(kn, 0));
585 }
586 
587 int
588 filt_seltrueprocess(struct knote *kn, struct kevent *kev)
589 {
590 	int active;
591 
592 	active = kn->kn_fop->f_event(kn, 0);
593 	if (active)
594 		knote_submit(kn, kev);
595 	return (active);
596 }
597 
598 /*
599  * This provides full kqfilter entry for device switch tables, which
600  * has same effect as filter using filt_seltrue() as filter method.
601  */
602 void
603 filt_seltruedetach(struct knote *kn)
604 {
605 	/* Nothing to do */
606 }
607 
608 const struct filterops seltrue_filtops = {
609 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
610 	.f_attach	= NULL,
611 	.f_detach	= filt_seltruedetach,
612 	.f_event	= filt_seltrue,
613 	.f_modify	= filt_seltruemodify,
614 	.f_process	= filt_seltrueprocess,
615 };
616 
617 int
618 seltrue_kqfilter(dev_t dev, struct knote *kn)
619 {
620 	switch (kn->kn_filter) {
621 	case EVFILT_READ:
622 	case EVFILT_WRITE:
623 		kn->kn_fop = &seltrue_filtops;
624 		break;
625 	default:
626 		return (EINVAL);
627 	}
628 
629 	/* Nothing more to do */
630 	return (0);
631 }
632 
633 static int
634 filt_dead(struct knote *kn, long hint)
635 {
636 	if (kn->kn_filter == EVFILT_EXCEPT) {
637 		/*
638 		 * Do not deliver event because there is no out-of-band data.
639 		 * However, let HUP condition pass for poll(2).
640 		 */
641 		if ((kn->kn_flags & __EV_POLL) == 0) {
642 			kn->kn_flags |= EV_DISABLE;
643 			return (0);
644 		}
645 	}
646 
647 	kn->kn_flags |= (EV_EOF | EV_ONESHOT);
648 	if (kn->kn_flags & __EV_POLL)
649 		kn->kn_flags |= __EV_HUP;
650 	kn->kn_data = 0;
651 	return (1);
652 }
653 
654 static void
655 filt_deaddetach(struct knote *kn)
656 {
657 	/* Nothing to do */
658 }
659 
660 const struct filterops dead_filtops = {
661 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
662 	.f_attach	= NULL,
663 	.f_detach	= filt_deaddetach,
664 	.f_event	= filt_dead,
665 	.f_modify	= filt_seltruemodify,
666 	.f_process	= filt_seltrueprocess,
667 };
668 
669 static int
670 filt_badfd(struct knote *kn, long hint)
671 {
672 	kn->kn_flags |= (EV_ERROR | EV_ONESHOT);
673 	kn->kn_data = EBADF;
674 	return (1);
675 }
676 
677 /* For use with kqpoll. */
678 const struct filterops badfd_filtops = {
679 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
680 	.f_attach	= NULL,
681 	.f_detach	= filt_deaddetach,
682 	.f_event	= filt_badfd,
683 	.f_modify	= filt_seltruemodify,
684 	.f_process	= filt_seltrueprocess,
685 };
686 
687 static int
688 filter_attach(struct knote *kn)
689 {
690 	int error;
691 
692 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
693 		error = kn->kn_fop->f_attach(kn);
694 	} else {
695 		KERNEL_LOCK();
696 		error = kn->kn_fop->f_attach(kn);
697 		KERNEL_UNLOCK();
698 	}
699 	return (error);
700 }
701 
702 static void
703 filter_detach(struct knote *kn)
704 {
705 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
706 		kn->kn_fop->f_detach(kn);
707 	} else {
708 		KERNEL_LOCK();
709 		kn->kn_fop->f_detach(kn);
710 		KERNEL_UNLOCK();
711 	}
712 }
713 
714 static int
715 filter_event(struct knote *kn, long hint)
716 {
717 	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
718 		KERNEL_ASSERT_LOCKED();
719 
720 	return (kn->kn_fop->f_event(kn, hint));
721 }
722 
723 static int
724 filter_modify(struct kevent *kev, struct knote *kn)
725 {
726 	int active, s;
727 
728 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
729 		active = kn->kn_fop->f_modify(kev, kn);
730 	} else {
731 		KERNEL_LOCK();
732 		if (kn->kn_fop->f_modify != NULL) {
733 			active = kn->kn_fop->f_modify(kev, kn);
734 		} else {
735 			s = splhigh();
736 			active = knote_modify(kev, kn);
737 			splx(s);
738 		}
739 		KERNEL_UNLOCK();
740 	}
741 	return (active);
742 }
743 
744 static int
745 filter_process(struct knote *kn, struct kevent *kev)
746 {
747 	int active, s;
748 
749 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
750 		active = kn->kn_fop->f_process(kn, kev);
751 	} else {
752 		KERNEL_LOCK();
753 		if (kn->kn_fop->f_process != NULL) {
754 			active = kn->kn_fop->f_process(kn, kev);
755 		} else {
756 			s = splhigh();
757 			active = knote_process(kn, kev);
758 			splx(s);
759 		}
760 		KERNEL_UNLOCK();
761 	}
762 	return (active);
763 }
764 
765 /*
766  * Initialize the current thread for poll/select system call.
767  * num indicates the number of serials that the system call may utilize.
768  * After this function, the valid range of serials is
769  * p_kq_serial <= x < p_kq_serial + num.
770  */
771 void
772 kqpoll_init(unsigned int num)
773 {
774 	struct proc *p = curproc;
775 	struct filedesc *fdp;
776 
777 	if (p->p_kq == NULL) {
778 		p->p_kq = kqueue_alloc(p->p_fd);
779 		p->p_kq_serial = arc4random();
780 		fdp = p->p_fd;
781 		fdplock(fdp);
782 		LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
783 		fdpunlock(fdp);
784 	}
785 
786 	if (p->p_kq_serial + num < p->p_kq_serial) {
787 		/* Serial is about to wrap. Clear all attached knotes. */
788 		kqueue_purge(p, p->p_kq);
789 		p->p_kq_serial = 0;
790 	}
791 }
792 
793 /*
794  * Finish poll/select system call.
795  * num must have the same value that was used with kqpoll_init().
796  */
797 void
798 kqpoll_done(unsigned int num)
799 {
800 	struct proc *p = curproc;
801 	struct kqueue *kq = p->p_kq;
802 
803 	KASSERT(p->p_kq != NULL);
804 	KASSERT(p->p_kq_serial + num >= p->p_kq_serial);
805 
806 	p->p_kq_serial += num;
807 
808 	/*
809 	 * Because of kn_pollid key, a thread can in principle allocate
810 	 * up to O(maxfiles^2) knotes by calling poll(2) repeatedly
811 	 * with suitably varying pollfd arrays.
812 	 * Prevent such a large allocation by clearing knotes eagerly
813 	 * if there are too many of them.
814 	 *
815 	 * A small multiple of kq_knlistsize should give enough margin
816 	 * that eager clearing is infrequent, or does not happen at all,
817 	 * with normal programs.
818 	 * A single pollfd entry can use up to three knotes.
819 	 * Typically there is no significant overlap of fd and events
820 	 * between different entries in the pollfd array.
821 	 */
822 	if (kq->kq_nknotes > 4 * kq->kq_knlistsize)
823 		kqueue_purge(p, kq);
824 }
825 
826 void
827 kqpoll_exit(void)
828 {
829 	struct proc *p = curproc;
830 
831 	if (p->p_kq == NULL)
832 		return;
833 
834 	kqueue_purge(p, p->p_kq);
835 	kqueue_terminate(p, p->p_kq);
836 	KASSERT(p->p_kq->kq_refcnt.r_refs == 1);
837 	KQRELE(p->p_kq);
838 	p->p_kq = NULL;
839 }
840 
841 struct kqueue *
842 kqueue_alloc(struct filedesc *fdp)
843 {
844 	struct kqueue *kq;
845 
846 	kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
847 	refcnt_init(&kq->kq_refcnt);
848 	kq->kq_fdp = fdp;
849 	TAILQ_INIT(&kq->kq_head);
850 	mtx_init(&kq->kq_lock, IPL_HIGH);
851 	task_set(&kq->kq_task, kqueue_task, kq);
852 	klist_init_mutex(&kq->kq_sel.si_note, &kqueue_klist_lock);
853 
854 	return (kq);
855 }
856 
857 int
858 sys_kqueue(struct proc *p, void *v, register_t *retval)
859 {
860 	struct filedesc *fdp = p->p_fd;
861 	struct kqueue *kq;
862 	struct file *fp;
863 	int fd, error;
864 
865 	kq = kqueue_alloc(fdp);
866 
867 	fdplock(fdp);
868 	error = falloc(p, &fp, &fd);
869 	if (error)
870 		goto out;
871 	fp->f_flag = FREAD | FWRITE;
872 	fp->f_type = DTYPE_KQUEUE;
873 	fp->f_ops = &kqueueops;
874 	fp->f_data = kq;
875 	*retval = fd;
876 	LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next);
877 	kq = NULL;
878 	fdinsert(fdp, fd, 0, fp);
879 	FRELE(fp, p);
880 out:
881 	fdpunlock(fdp);
882 	if (kq != NULL)
883 		pool_put(&kqueue_pool, kq);
884 	return (error);
885 }
886 
887 int
888 sys_kevent(struct proc *p, void *v, register_t *retval)
889 {
890 	struct kqueue_scan_state scan;
891 	struct filedesc* fdp = p->p_fd;
892 	struct sys_kevent_args /* {
893 		syscallarg(int)	fd;
894 		syscallarg(const struct kevent *) changelist;
895 		syscallarg(int)	nchanges;
896 		syscallarg(struct kevent *) eventlist;
897 		syscallarg(int)	nevents;
898 		syscallarg(const struct timespec *) timeout;
899 	} */ *uap = v;
900 	struct kevent *kevp;
901 	struct kqueue *kq;
902 	struct file *fp;
903 	struct timespec ts;
904 	struct timespec *tsp = NULL;
905 	int i, n, nerrors, error;
906 	int ready, total;
907 	struct kevent kev[KQ_NEVENTS];
908 
909 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
910 		return (EBADF);
911 
912 	if (fp->f_type != DTYPE_KQUEUE) {
913 		error = EBADF;
914 		goto done;
915 	}
916 
917 	if (SCARG(uap, timeout) != NULL) {
918 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
919 		if (error)
920 			goto done;
921 #ifdef KTRACE
922 		if (KTRPOINT(p, KTR_STRUCT))
923 			ktrreltimespec(p, &ts);
924 #endif
925 		if (ts.tv_sec < 0 || !timespecisvalid(&ts)) {
926 			error = EINVAL;
927 			goto done;
928 		}
929 		tsp = &ts;
930 	}
931 
932 	kq = fp->f_data;
933 	nerrors = 0;
934 
935 	while ((n = SCARG(uap, nchanges)) > 0) {
936 		if (n > nitems(kev))
937 			n = nitems(kev);
938 		error = copyin(SCARG(uap, changelist), kev,
939 		    n * sizeof(struct kevent));
940 		if (error)
941 			goto done;
942 #ifdef KTRACE
943 		if (KTRPOINT(p, KTR_STRUCT))
944 			ktrevent(p, kev, n);
945 #endif
946 		for (i = 0; i < n; i++) {
947 			kevp = &kev[i];
948 			kevp->flags &= ~EV_SYSFLAGS;
949 			error = kqueue_register(kq, kevp, 0, p);
950 			if (error || (kevp->flags & EV_RECEIPT)) {
951 				if (SCARG(uap, nevents) != 0) {
952 					kevp->flags = EV_ERROR;
953 					kevp->data = error;
954 					copyout(kevp, SCARG(uap, eventlist),
955 					    sizeof(*kevp));
956 					SCARG(uap, eventlist)++;
957 					SCARG(uap, nevents)--;
958 					nerrors++;
959 				} else {
960 					goto done;
961 				}
962 			}
963 		}
964 		SCARG(uap, nchanges) -= n;
965 		SCARG(uap, changelist) += n;
966 	}
967 	if (nerrors) {
968 		*retval = nerrors;
969 		error = 0;
970 		goto done;
971 	}
972 
973 	kqueue_scan_setup(&scan, kq);
974 	FRELE(fp, p);
975 	/*
976 	 * Collect as many events as we can.  The timeout on successive
977 	 * loops is disabled (kqueue_scan() becomes non-blocking).
978 	 */
979 	total = 0;
980 	error = 0;
981 	while ((n = SCARG(uap, nevents) - total) > 0) {
982 		if (n > nitems(kev))
983 			n = nitems(kev);
984 		ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
985 		if (ready == 0)
986 			break;
987 		error = copyout(kev, SCARG(uap, eventlist) + total,
988 		    sizeof(struct kevent) * ready);
989 #ifdef KTRACE
990 		if (KTRPOINT(p, KTR_STRUCT))
991 			ktrevent(p, kev, ready);
992 #endif
993 		total += ready;
994 		if (error || ready < n)
995 			break;
996 	}
997 	kqueue_scan_finish(&scan);
998 	*retval = total;
999 	return (error);
1000 
1001  done:
1002 	FRELE(fp, p);
1003 	return (error);
1004 }
1005 
1006 #ifdef KQUEUE_DEBUG
1007 void
1008 kqueue_do_check(struct kqueue *kq, const char *func, int line)
1009 {
1010 	struct knote *kn;
1011 	int count = 0, nmarker = 0;
1012 
1013 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1014 
1015 	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1016 		if (kn->kn_filter == EVFILT_MARKER) {
1017 			if ((kn->kn_status & KN_QUEUED) != 0)
1018 				panic("%s:%d: kq=%p kn=%p marker QUEUED",
1019 				    func, line, kq, kn);
1020 			nmarker++;
1021 		} else {
1022 			if ((kn->kn_status & KN_ACTIVE) == 0)
1023 				panic("%s:%d: kq=%p kn=%p knote !ACTIVE",
1024 				    func, line, kq, kn);
1025 			if ((kn->kn_status & KN_QUEUED) == 0)
1026 				panic("%s:%d: kq=%p kn=%p knote !QUEUED",
1027 				    func, line, kq, kn);
1028 			if (kn->kn_kq != kq)
1029 				panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq",
1030 				    func, line, kq, kn, kn->kn_kq);
1031 			count++;
1032 			if (count > kq->kq_count)
1033 				goto bad;
1034 		}
1035 	}
1036 	if (count != kq->kq_count) {
1037 bad:
1038 		panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d",
1039 		    func, line, kq, kq->kq_count, count, nmarker);
1040 	}
1041 }
1042 #endif
1043 
1044 int
1045 kqueue_register(struct kqueue *kq, struct kevent *kev, unsigned int pollid,
1046     struct proc *p)
1047 {
1048 	struct filedesc *fdp = kq->kq_fdp;
1049 	const struct filterops *fops = NULL;
1050 	struct file *fp = NULL;
1051 	struct knote *kn = NULL, *newkn = NULL;
1052 	struct knlist *list = NULL;
1053 	int active, error = 0;
1054 
1055 	KASSERT(pollid == 0 || (p != NULL && p->p_kq == kq));
1056 
1057 	if (kev->filter < 0) {
1058 		if (kev->filter + EVFILT_SYSCOUNT < 0)
1059 			return (EINVAL);
1060 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
1061 	}
1062 
1063 	if (fops == NULL) {
1064 		/*
1065 		 * XXX
1066 		 * filter attach routine is responsible for ensuring that
1067 		 * the identifier can be attached to it.
1068 		 */
1069 		return (EINVAL);
1070 	}
1071 
1072 	if (fops->f_flags & FILTEROP_ISFD) {
1073 		/* validate descriptor */
1074 		if (kev->ident > INT_MAX)
1075 			return (EBADF);
1076 	}
1077 
1078 	if (kev->flags & EV_ADD)
1079 		newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO);
1080 
1081 again:
1082 	if (fops->f_flags & FILTEROP_ISFD) {
1083 		if ((fp = fd_getfile(fdp, kev->ident)) == NULL) {
1084 			error = EBADF;
1085 			goto done;
1086 		}
1087 		mtx_enter(&kq->kq_lock);
1088 		if (kev->flags & EV_ADD)
1089 			kqueue_expand_list(kq, kev->ident);
1090 		if (kev->ident < kq->kq_knlistsize)
1091 			list = &kq->kq_knlist[kev->ident];
1092 	} else {
1093 		mtx_enter(&kq->kq_lock);
1094 		if (kev->flags & EV_ADD)
1095 			kqueue_expand_hash(kq);
1096 		if (kq->kq_knhashmask != 0) {
1097 			list = &kq->kq_knhash[
1098 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1099 		}
1100 	}
1101 	if (list != NULL) {
1102 		SLIST_FOREACH(kn, list, kn_link) {
1103 			if (kev->filter == kn->kn_filter &&
1104 			    kev->ident == kn->kn_id &&
1105 			    pollid == kn->kn_pollid) {
1106 				if (!knote_acquire(kn, NULL, 0)) {
1107 					/* knote_acquire() has released
1108 					 * kq_lock. */
1109 					if (fp != NULL) {
1110 						FRELE(fp, p);
1111 						fp = NULL;
1112 					}
1113 					goto again;
1114 				}
1115 				break;
1116 			}
1117 		}
1118 	}
1119 	KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);
1120 
1121 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
1122 		mtx_leave(&kq->kq_lock);
1123 		error = ENOENT;
1124 		goto done;
1125 	}
1126 
1127 	/*
1128 	 * kn now contains the matching knote, or NULL if no match.
1129 	 */
1130 	if (kev->flags & EV_ADD) {
1131 		if (kn == NULL) {
1132 			kn = newkn;
1133 			newkn = NULL;
1134 			kn->kn_status = KN_PROCESSING;
1135 			kn->kn_fp = fp;
1136 			kn->kn_kq = kq;
1137 			kn->kn_fop = fops;
1138 
1139 			/*
1140 			 * apply reference count to knote structure, and
1141 			 * do not release it at the end of this routine.
1142 			 */
1143 			fp = NULL;
1144 
1145 			kn->kn_sfflags = kev->fflags;
1146 			kn->kn_sdata = kev->data;
1147 			kev->fflags = 0;
1148 			kev->data = 0;
1149 			kn->kn_kevent = *kev;
1150 			kn->kn_pollid = pollid;
1151 
1152 			knote_attach(kn);
1153 			mtx_leave(&kq->kq_lock);
1154 
1155 			error = filter_attach(kn);
1156 			if (error != 0) {
1157 				knote_drop(kn, p);
1158 				goto done;
1159 			}
1160 
1161 			/*
1162 			 * If this is a file descriptor filter, check if
1163 			 * fd was closed while the knote was being added.
1164 			 * knote_fdclose() has missed kn if the function
1165 			 * ran before kn appeared in kq_knlist.
1166 			 */
1167 			if ((fops->f_flags & FILTEROP_ISFD) &&
1168 			    fd_checkclosed(fdp, kev->ident, kn->kn_fp)) {
1169 				/*
1170 				 * Drop the knote silently without error
1171 				 * because another thread might already have
1172 				 * seen it. This corresponds to the insert
1173 				 * happening in full before the close.
1174 				 */
1175 				filter_detach(kn);
1176 				knote_drop(kn, p);
1177 				goto done;
1178 			}
1179 
1180 			/* Check if there is a pending event. */
1181 			active = filter_process(kn, NULL);
1182 			mtx_enter(&kq->kq_lock);
1183 			if (active)
1184 				knote_activate(kn);
1185 		} else if (kn->kn_fop == &badfd_filtops) {
1186 			/*
1187 			 * Nothing expects this badfd knote any longer.
1188 			 * Drop it to make room for the new knote and retry.
1189 			 */
1190 			KASSERT(kq == p->p_kq);
1191 			mtx_leave(&kq->kq_lock);
1192 			filter_detach(kn);
1193 			knote_drop(kn, p);
1194 
1195 			KASSERT(fp != NULL);
1196 			FRELE(fp, p);
1197 			fp = NULL;
1198 
1199 			goto again;
1200 		} else {
1201 			/*
1202 			 * The user may change some filter values after the
1203 			 * initial EV_ADD, but doing so will not reset any
1204 			 * filters which have already been triggered.
1205 			 */
1206 			mtx_leave(&kq->kq_lock);
1207 			active = filter_modify(kev, kn);
1208 			mtx_enter(&kq->kq_lock);
1209 			if (active)
1210 				knote_activate(kn);
1211 			if (kev->flags & EV_ERROR) {
1212 				error = kev->data;
1213 				goto release;
1214 			}
1215 		}
1216 	} else if (kev->flags & EV_DELETE) {
1217 		mtx_leave(&kq->kq_lock);
1218 		filter_detach(kn);
1219 		knote_drop(kn, p);
1220 		goto done;
1221 	}
1222 
1223 	if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
1224 		kn->kn_status |= KN_DISABLED;
1225 
1226 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1227 		kn->kn_status &= ~KN_DISABLED;
1228 		mtx_leave(&kq->kq_lock);
1229 		/* Check if there is a pending event. */
1230 		active = filter_process(kn, NULL);
1231 		mtx_enter(&kq->kq_lock);
1232 		if (active)
1233 			knote_activate(kn);
1234 	}
1235 
1236 release:
1237 	knote_release(kn);
1238 	mtx_leave(&kq->kq_lock);
1239 done:
1240 	if (fp != NULL)
1241 		FRELE(fp, p);
1242 	if (newkn != NULL)
1243 		pool_put(&knote_pool, newkn);
1244 	return (error);
1245 }
1246 
1247 int
1248 kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
1249 {
1250 	struct timespec elapsed, start, stop;
1251 	uint64_t nsecs;
1252 	int error;
1253 
1254 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1255 
1256 	if (tsp != NULL) {
1257 		getnanouptime(&start);
1258 		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
1259 	} else
1260 		nsecs = INFSLP;
1261 	error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
1262 	    "kqread", nsecs);
1263 	if (tsp != NULL) {
1264 		getnanouptime(&stop);
1265 		timespecsub(&stop, &start, &elapsed);
1266 		timespecsub(tsp, &elapsed, tsp);
1267 		if (tsp->tv_sec < 0)
1268 			timespecclear(tsp);
1269 	}
1270 
1271 	return (error);
1272 }
1273 
1274 /*
1275  * Scan the kqueue, blocking if necessary until the target time is reached.
1276  * If tsp is NULL we block indefinitely.  If tsp->ts_secs/nsecs are both
1277  * 0 we do not block at all.
1278  */
1279 int
1280 kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
1281     struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
1282 {
1283 	struct kqueue *kq = scan->kqs_kq;
1284 	struct knote *kn;
1285 	int error = 0, nkev = 0;
1286 	int reinserted;
1287 
1288 	if (maxevents == 0)
1289 		goto done;
1290 retry:
1291 	KASSERT(nkev == 0);
1292 
1293 	error = 0;
1294 	reinserted = 0;
1295 
1296 	/* msleep() with PCATCH requires kernel lock. */
1297 	KERNEL_LOCK();
1298 
1299 	mtx_enter(&kq->kq_lock);
1300 
1301 	if (kq->kq_state & KQ_DYING) {
1302 		mtx_leave(&kq->kq_lock);
1303 		KERNEL_UNLOCK();
1304 		error = EBADF;
1305 		goto done;
1306 	}
1307 
1308 	if (kq->kq_count == 0) {
1309 		/*
1310 		 * Successive loops are only necessary if there are more
1311 		 * ready events to gather, so they don't need to block.
1312 		 */
1313 		if ((tsp != NULL && !timespecisset(tsp)) ||
1314 		    scan->kqs_nevent != 0) {
1315 			mtx_leave(&kq->kq_lock);
1316 			KERNEL_UNLOCK();
1317 			error = 0;
1318 			goto done;
1319 		}
1320 		kq->kq_state |= KQ_SLEEP;
1321 		error = kqueue_sleep(kq, tsp);
1322 		/* kqueue_sleep() has released kq_lock. */
1323 		KERNEL_UNLOCK();
1324 		if (error == 0 || error == EWOULDBLOCK)
1325 			goto retry;
1326 		/* don't restart after signals... */
1327 		if (error == ERESTART)
1328 			error = EINTR;
1329 		goto done;
1330 	}
1331 
1332 	/* The actual scan does not sleep on kq, so unlock the kernel. */
1333 	KERNEL_UNLOCK();
1334 
1335 	/*
1336 	 * Put the end marker in the queue to limit the scan to the events
1337 	 * that are currently active.  This prevents events from being
1338 	 * recollected if they reactivate during scan.
1339 	 *
1340 	 * If a partial scan has been performed already but no events have
1341 	 * been collected, reposition the end marker to make any new events
1342 	 * reachable.
1343 	 */
1344 	if (!scan->kqs_queued) {
1345 		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
1346 		scan->kqs_queued = 1;
1347 	} else if (scan->kqs_nevent == 0) {
1348 		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
1349 		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
1350 	}
1351 
1352 	TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
1353 	while (nkev < maxevents) {
1354 		kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
1355 		if (kn->kn_filter == EVFILT_MARKER) {
1356 			if (kn == &scan->kqs_end)
1357 				break;
1358 
1359 			/* Move start marker past another thread's marker. */
1360 			TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
1361 			TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
1362 			    kn_tqe);
1363 			continue;
1364 		}
1365 
1366 		if (!knote_acquire(kn, NULL, 0)) {
1367 			/* knote_acquire() has released kq_lock. */
1368 			mtx_enter(&kq->kq_lock);
1369 			continue;
1370 		}
1371 
1372 		kqueue_check(kq);
1373 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1374 		kn->kn_status &= ~KN_QUEUED;
1375 		kq->kq_count--;
1376 		kqueue_check(kq);
1377 
1378 		if (kn->kn_status & KN_DISABLED) {
1379 			knote_release(kn);
1380 			continue;
1381 		}
1382 
1383 		mtx_leave(&kq->kq_lock);
1384 
1385 		/* Drop expired kqpoll knotes. */
1386 		if (p->p_kq == kq &&
1387 		    p->p_kq_serial > (unsigned long)kn->kn_udata) {
1388 			filter_detach(kn);
1389 			knote_drop(kn, p);
1390 			mtx_enter(&kq->kq_lock);
1391 			continue;
1392 		}
1393 
1394 		/*
1395 		 * Invalidate knotes whose vnodes have been revoked.
1396 		 * This is a workaround; it is tricky to clear existing
1397 		 * knotes and prevent new ones from being registered
1398 		 * with the current revocation mechanism.
1399 		 */
1400 		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1401 		    kn->kn_fp != NULL &&
1402 		    kn->kn_fp->f_type == DTYPE_VNODE) {
1403 			struct vnode *vp = kn->kn_fp->f_data;
1404 
1405 			if (__predict_false(vp->v_op == &dead_vops &&
1406 			    kn->kn_fop != &dead_filtops)) {
1407 				filter_detach(kn);
1408 				kn->kn_fop = &dead_filtops;
1409 
1410 				/*
1411 				 * Check if the event should be delivered.
1412 				 * Use f_event directly because this is
1413 				 * a special situation.
1414 				 */
1415 				if (kn->kn_fop->f_event(kn, 0) == 0) {
1416 					filter_detach(kn);
1417 					knote_drop(kn, p);
1418 					mtx_enter(&kq->kq_lock);
1419 					continue;
1420 				}
1421 			}
1422 		}
1423 
1424 		memset(kevp, 0, sizeof(*kevp));
1425 		if (filter_process(kn, kevp) == 0) {
1426 			mtx_enter(&kq->kq_lock);
1427 			if ((kn->kn_status & KN_QUEUED) == 0)
1428 				kn->kn_status &= ~KN_ACTIVE;
1429 			knote_release(kn);
1430 			kqueue_check(kq);
1431 			continue;
1432 		}
1433 
1434 		/*
1435 		 * Post-event action on the note
1436 		 */
1437 		if (kevp->flags & EV_ONESHOT) {
1438 			filter_detach(kn);
1439 			knote_drop(kn, p);
1440 			mtx_enter(&kq->kq_lock);
1441 		} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
1442 			mtx_enter(&kq->kq_lock);
1443 			if (kevp->flags & EV_DISPATCH)
1444 				kn->kn_status |= KN_DISABLED;
1445 			if ((kn->kn_status & KN_QUEUED) == 0)
1446 				kn->kn_status &= ~KN_ACTIVE;
1447 			knote_release(kn);
1448 		} else {
1449 			mtx_enter(&kq->kq_lock);
1450 			if ((kn->kn_status & KN_QUEUED) == 0) {
1451 				kqueue_check(kq);
1452 				kq->kq_count++;
1453 				kn->kn_status |= KN_QUEUED;
1454 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1455 				/* Wakeup is done after loop. */
1456 				reinserted = 1;
1457 			}
1458 			knote_release(kn);
1459 		}
1460 		kqueue_check(kq);
1461 
1462 		kevp++;
1463 		nkev++;
1464 		scan->kqs_nevent++;
1465 	}
1466 	TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
1467 	if (reinserted && kq->kq_count != 0)
1468 		kqueue_wakeup(kq);
1469 	mtx_leave(&kq->kq_lock);
1470 	if (scan->kqs_nevent == 0)
1471 		goto retry;
1472 done:
1473 	*errorp = error;
1474 	return (nkev);
1475 }
1476 
1477 void
1478 kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
1479 {
1480 	memset(scan, 0, sizeof(*scan));
1481 
1482 	KQREF(kq);
1483 	scan->kqs_kq = kq;
1484 	scan->kqs_start.kn_filter = EVFILT_MARKER;
1485 	scan->kqs_start.kn_status = KN_PROCESSING;
1486 	scan->kqs_end.kn_filter = EVFILT_MARKER;
1487 	scan->kqs_end.kn_status = KN_PROCESSING;
1488 }
1489 
1490 void
1491 kqueue_scan_finish(struct kqueue_scan_state *scan)
1492 {
1493 	struct kqueue *kq = scan->kqs_kq;
1494 
1495 	KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
1496 	KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
1497 	KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
1498 	KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);
1499 
1500 	if (scan->kqs_queued) {
1501 		scan->kqs_queued = 0;
1502 		mtx_enter(&kq->kq_lock);
1503 		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
1504 		mtx_leave(&kq->kq_lock);
1505 	}
1506 	KQRELE(kq);
1507 }
1508 
1509 /*
1510  * XXX
1511  * This could be expanded to call kqueue_scan, if desired.
1512  */
1513 int
1514 kqueue_read(struct file *fp, struct uio *uio, int fflags)
1515 {
1516 	return (ENXIO);
1517 }
1518 
1519 int
1520 kqueue_write(struct file *fp, struct uio *uio, int fflags)
1521 {
1522 	return (ENXIO);
1523 }
1524 
1525 int
1526 kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
1527 {
1528 	return (ENOTTY);
1529 }
1530 
1531 int
1532 kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
1533 {
1534 	struct kqueue *kq = fp->f_data;
1535 
1536 	memset(st, 0, sizeof(*st));
1537 	st->st_size = kq->kq_count;	/* unlocked read */
1538 	st->st_blksize = sizeof(struct kevent);
1539 	st->st_mode = S_IFIFO;
1540 	return (0);
1541 }
1542 
1543 void
1544 kqueue_purge(struct proc *p, struct kqueue *kq)
1545 {
1546 	int i;
1547 
1548 	mtx_enter(&kq->kq_lock);
1549 	for (i = 0; i < kq->kq_knlistsize; i++)
1550 		knote_remove(p, kq, &kq->kq_knlist, i, 1);
1551 	if (kq->kq_knhashmask != 0) {
1552 		for (i = 0; i < kq->kq_knhashmask + 1; i++)
1553 			knote_remove(p, kq, &kq->kq_knhash, i, 1);
1554 	}
1555 	mtx_leave(&kq->kq_lock);
1556 }
1557 
1558 void
1559 kqueue_terminate(struct proc *p, struct kqueue *kq)
1560 {
1561 	struct knote *kn;
1562 
1563 	mtx_enter(&kq->kq_lock);
1564 
1565 	/*
1566 	 * Any remaining entries should be scan markers.
1567 	 * They are removed when the ongoing scans finish.
1568 	 */
1569 	KASSERT(kq->kq_count == 0);
1570 	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe)
1571 		KASSERT(kn->kn_filter == EVFILT_MARKER);
1572 
1573 	kq->kq_state |= KQ_DYING;
1574 	kqueue_wakeup(kq);
1575 	mtx_leave(&kq->kq_lock);
1576 
1577 	KASSERT(klist_empty(&kq->kq_sel.si_note));
1578 	task_del(systqmp, &kq->kq_task);
1579 }
1580 
1581 int
1582 kqueue_close(struct file *fp, struct proc *p)
1583 {
1584 	struct kqueue *kq = fp->f_data;
1585 
1586 	fp->f_data = NULL;
1587 
1588 	kqueue_purge(p, kq);
1589 	kqueue_terminate(p, kq);
1590 
1591 	KQRELE(kq);
1592 
1593 	return (0);
1594 }
1595 
1596 static void
1597 kqueue_task(void *arg)
1598 {
1599 	struct kqueue *kq = arg;
1600 
1601 	mtx_enter(&kqueue_klist_lock);
1602 	KNOTE(&kq->kq_sel.si_note, 0);
1603 	mtx_leave(&kqueue_klist_lock);
1604 	KQRELE(kq);
1605 }
1606 
1607 void
1608 kqueue_wakeup(struct kqueue *kq)
1609 {
1610 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1611 
1612 	if (kq->kq_state & KQ_SLEEP) {
1613 		kq->kq_state &= ~KQ_SLEEP;
1614 		wakeup(kq);
1615 	}
1616 	if (!klist_empty(&kq->kq_sel.si_note)) {
1617 		/* Defer activation to avoid recursion. */
1618 		KQREF(kq);
1619 		if (!task_add(systqmp, &kq->kq_task))
1620 			KQRELE(kq);
1621 	}
1622 }
1623 
1624 static void
1625 kqueue_expand_hash(struct kqueue *kq)
1626 {
1627 	struct knlist *hash;
1628 	u_long hashmask;
1629 
1630 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1631 
1632 	if (kq->kq_knhashmask == 0) {
1633 		mtx_leave(&kq->kq_lock);
1634 		hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
1635 		mtx_enter(&kq->kq_lock);
1636 		if (kq->kq_knhashmask == 0) {
1637 			kq->kq_knhash = hash;
1638 			kq->kq_knhashmask = hashmask;
1639 		} else {
1640 			/* Another thread has allocated the hash. */
1641 			mtx_leave(&kq->kq_lock);
1642 			hashfree(hash, KN_HASHSIZE, M_KEVENT);
1643 			mtx_enter(&kq->kq_lock);
1644 		}
1645 	}
1646 }
1647 
1648 static void
1649 kqueue_expand_list(struct kqueue *kq, int fd)
1650 {
1651 	struct knlist *list, *olist;
1652 	int size, osize;
1653 
1654 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1655 
1656 	if (kq->kq_knlistsize <= fd) {
1657 		size = kq->kq_knlistsize;
1658 		mtx_leave(&kq->kq_lock);
1659 		while (size <= fd)
1660 			size += KQEXTENT;
1661 		list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
1662 		mtx_enter(&kq->kq_lock);
1663 		if (kq->kq_knlistsize <= fd) {
1664 			memcpy(list, kq->kq_knlist,
1665 			    kq->kq_knlistsize * sizeof(*list));
1666 			memset(&list[kq->kq_knlistsize], 0,
1667 			    (size - kq->kq_knlistsize) * sizeof(*list));
1668 			olist = kq->kq_knlist;
1669 			osize = kq->kq_knlistsize;
1670 			kq->kq_knlist = list;
1671 			kq->kq_knlistsize = size;
1672 			mtx_leave(&kq->kq_lock);
1673 			free(olist, M_KEVENT, osize * sizeof(*list));
1674 			mtx_enter(&kq->kq_lock);
1675 		} else {
1676 			/* Another thread has expanded the list. */
1677 			mtx_leave(&kq->kq_lock);
1678 			free(list, M_KEVENT, size * sizeof(*list));
1679 			mtx_enter(&kq->kq_lock);
1680 		}
1681 	}
1682 }
1683 
1684 /*
1685  * Acquire a knote, return non-zero on success, 0 on failure.
1686  *
1687  * If we cannot acquire the knote we sleep and return 0.  The knote
1688  * may be stale on return in this case and the caller must restart
1689  * whatever loop they are in.
1690  *
1691  * If we are about to sleep and klist is non-NULL, the list is unlocked
1692  * before sleep and remains unlocked on return.
1693  */
1694 int
1695 knote_acquire(struct knote *kn, struct klist *klist, int ls)
1696 {
1697 	struct kqueue *kq = kn->kn_kq;
1698 
1699 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1700 	KASSERT(kn->kn_filter != EVFILT_MARKER);
1701 
1702 	if (kn->kn_status & KN_PROCESSING) {
1703 		kn->kn_status |= KN_WAITING;
1704 		if (klist != NULL) {
1705 			mtx_leave(&kq->kq_lock);
1706 			klist_unlock(klist, ls);
1707 			/* XXX Timeout resolves potential loss of wakeup. */
1708 			tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
1709 		} else {
1710 			msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
1711 			    SEC_TO_NSEC(1));
1712 		}
1713 		/* knote may be stale now */
1714 		return (0);
1715 	}
1716 	kn->kn_status |= KN_PROCESSING;
1717 	return (1);
1718 }
1719 
1720 /*
1721  * Release an acquired knote, clearing KN_PROCESSING.
1722  */
1723 void
1724 knote_release(struct knote *kn)
1725 {
1726 	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
1727 	KASSERT(kn->kn_filter != EVFILT_MARKER);
1728 	KASSERT(kn->kn_status & KN_PROCESSING);
1729 
1730 	if (kn->kn_status & KN_WAITING) {
1731 		kn->kn_status &= ~KN_WAITING;
1732 		wakeup(kn);
1733 	}
1734 	kn->kn_status &= ~KN_PROCESSING;
1735 	/* kn should not be accessed anymore */
1736 }
1737 
1738 /*
1739  * activate one knote.
1740  */
1741 void
1742 knote_activate(struct knote *kn)
1743 {
1744 	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
1745 
1746 	kn->kn_status |= KN_ACTIVE;
1747 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
1748 		knote_enqueue(kn);
1749 }
1750 
1751 /*
1752  * walk down a list of knotes, activating them if their event has triggered.
1753  */
1754 void
1755 knote(struct klist *list, long hint)
1756 {
1757 	struct knote *kn, *kn0;
1758 	struct kqueue *kq;
1759 
1760 	KLIST_ASSERT_LOCKED(list);
1761 
1762 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) {
1763 		if (filter_event(kn, hint)) {
1764 			kq = kn->kn_kq;
1765 			mtx_enter(&kq->kq_lock);
1766 			knote_activate(kn);
1767 			mtx_leave(&kq->kq_lock);
1768 		}
1769 	}
1770 }
1771 
1772 /*
1773  * remove all knotes from a specified knlist
1774  */
1775 void
1776 knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, int idx,
1777     int purge)
1778 {
1779 	struct knote *kn;
1780 
1781 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1782 
1783 	/* Always fetch array pointer as another thread can resize kq_knlist. */
1784 	while ((kn = SLIST_FIRST(*plist + idx)) != NULL) {
1785 		KASSERT(kn->kn_kq == kq);
1786 
1787 		if (!purge) {
1788 			/* Skip pending badfd knotes. */
1789 			while (kn->kn_fop == &badfd_filtops) {
1790 				kn = SLIST_NEXT(kn, kn_link);
1791 				if (kn == NULL)
1792 					return;
1793 				KASSERT(kn->kn_kq == kq);
1794 			}
1795 		}
1796 
1797 		if (!knote_acquire(kn, NULL, 0)) {
1798 			/* knote_acquire() has released kq_lock. */
1799 			mtx_enter(&kq->kq_lock);
1800 			continue;
1801 		}
1802 		mtx_leave(&kq->kq_lock);
1803 		filter_detach(kn);
1804 
1805 		/*
1806 		 * Notify poll(2) and select(2) when a monitored
1807 		 * file descriptor is closed.
1808 		 *
1809 		 * This reuses the original knote for delivering the
1810 		 * notification so as to avoid allocating memory.
1811 		 */
1812 		if (!purge && (kn->kn_flags & (__EV_POLL | __EV_SELECT)) &&
1813 		    !(p->p_kq == kq &&
1814 		      p->p_kq_serial > (unsigned long)kn->kn_udata) &&
1815 		    kn->kn_fop != &badfd_filtops) {
1816 			KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
1817 			FRELE(kn->kn_fp, p);
1818 			kn->kn_fp = NULL;
1819 
1820 			kn->kn_fop = &badfd_filtops;
1821 			filter_event(kn, 0);
1822 			mtx_enter(&kq->kq_lock);
1823 			knote_activate(kn);
1824 			knote_release(kn);
1825 			continue;
1826 		}
1827 
1828 		knote_drop(kn, p);
1829 		mtx_enter(&kq->kq_lock);
1830 	}
1831 }
1832 
1833 /*
1834  * remove all knotes referencing a specified fd
1835  */
1836 void
1837 knote_fdclose(struct proc *p, int fd)
1838 {
1839 	struct filedesc *fdp = p->p_p->ps_fd;
1840 	struct kqueue *kq;
1841 
1842 	/*
1843 	 * fdplock can be ignored if the file descriptor table is being freed
1844 	 * because no other thread can access the fdp.
1845 	 */
1846 	if (fdp->fd_refcnt != 0)
1847 		fdpassertlocked(fdp);
1848 
1849 	LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
1850 		mtx_enter(&kq->kq_lock);
1851 		if (fd < kq->kq_knlistsize)
1852 			knote_remove(p, kq, &kq->kq_knlist, fd, 0);
1853 		mtx_leave(&kq->kq_lock);
1854 	}
1855 }
1856 
1857 /*
1858  * handle a process exiting, including the triggering of NOTE_EXIT notes
1859  * XXX this could be more efficient, doing a single pass down the klist
1860  */
1861 void
1862 knote_processexit(struct process *pr)
1863 {
1864 	KERNEL_ASSERT_LOCKED();
1865 
1866 	KNOTE(&pr->ps_klist, NOTE_EXIT);
1867 
1868 	/* remove other knotes hanging off the process */
1869 	klist_invalidate(&pr->ps_klist);
1870 }
1871 
1872 void
1873 knote_attach(struct knote *kn)
1874 {
1875 	struct kqueue *kq = kn->kn_kq;
1876 	struct knlist *list;
1877 
1878 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1879 	KASSERT(kn->kn_status & KN_PROCESSING);
1880 
1881 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1882 		KASSERT(kq->kq_knlistsize > kn->kn_id);
1883 		list = &kq->kq_knlist[kn->kn_id];
1884 	} else {
1885 		KASSERT(kq->kq_knhashmask != 0);
1886 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1887 	}
1888 	SLIST_INSERT_HEAD(list, kn, kn_link);
1889 	kq->kq_nknotes++;
1890 }
1891 
1892 void
1893 knote_detach(struct knote *kn)
1894 {
1895 	struct kqueue *kq = kn->kn_kq;
1896 	struct knlist *list;
1897 
1898 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1899 	KASSERT(kn->kn_status & KN_PROCESSING);
1900 
1901 	kq->kq_nknotes--;
1902 	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1903 		list = &kq->kq_knlist[kn->kn_id];
1904 	else
1905 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1906 	SLIST_REMOVE(list, kn, knote, kn_link);
1907 }
1908 
1909 /*
1910  * should be called at spl == 0, since we don't want to hold spl
1911  * while calling FRELE and pool_put.
1912  */
1913 void
1914 knote_drop(struct knote *kn, struct proc *p)
1915 {
1916 	struct kqueue *kq = kn->kn_kq;
1917 
1918 	KASSERT(kn->kn_filter != EVFILT_MARKER);
1919 
1920 	mtx_enter(&kq->kq_lock);
1921 	knote_detach(kn);
1922 	if (kn->kn_status & KN_QUEUED)
1923 		knote_dequeue(kn);
1924 	if (kn->kn_status & KN_WAITING) {
1925 		kn->kn_status &= ~KN_WAITING;
1926 		wakeup(kn);
1927 	}
1928 	mtx_leave(&kq->kq_lock);
1929 
1930 	if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
1931 		FRELE(kn->kn_fp, p);
1932 	pool_put(&knote_pool, kn);
1933 }
1934 
1935 
1936 void
1937 knote_enqueue(struct knote *kn)
1938 {
1939 	struct kqueue *kq = kn->kn_kq;
1940 
1941 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1942 	KASSERT(kn->kn_filter != EVFILT_MARKER);
1943 	KASSERT((kn->kn_status & KN_QUEUED) == 0);
1944 
1945 	kqueue_check(kq);
1946 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1947 	kn->kn_status |= KN_QUEUED;
1948 	kq->kq_count++;
1949 	kqueue_check(kq);
1950 	kqueue_wakeup(kq);
1951 }
1952 
1953 void
1954 knote_dequeue(struct knote *kn)
1955 {
1956 	struct kqueue *kq = kn->kn_kq;
1957 
1958 	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
1959 	KASSERT(kn->kn_filter != EVFILT_MARKER);
1960 	KASSERT(kn->kn_status & KN_QUEUED);
1961 
1962 	kqueue_check(kq);
1963 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1964 	kn->kn_status &= ~KN_QUEUED;
1965 	kq->kq_count--;
1966 	kqueue_check(kq);
1967 }
1968 
1969 /*
1970  * Assign parameters to the knote.
1971  *
1972  * The knote's object lock must be held.
1973  */
1974 void
1975 knote_assign(const struct kevent *kev, struct knote *kn)
1976 {
1977 	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
1978 		KERNEL_ASSERT_LOCKED();
1979 
1980 	kn->kn_sfflags = kev->fflags;
1981 	kn->kn_sdata = kev->data;
1982 	kn->kn_udata = kev->udata;
1983 }
1984 
1985 /*
1986  * Submit the knote's event for delivery.
1987  *
1988  * The knote's object lock must be held.
1989  */
1990 void
1991 knote_submit(struct knote *kn, struct kevent *kev)
1992 {
1993 	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
1994 		KERNEL_ASSERT_LOCKED();
1995 
1996 	if (kev != NULL) {
1997 		*kev = kn->kn_kevent;
1998 		if (kn->kn_flags & EV_CLEAR) {
1999 			kn->kn_fflags = 0;
2000 			kn->kn_data = 0;
2001 		}
2002 	}
2003 }
2004 
2005 void
2006 klist_init(struct klist *klist, const struct klistops *ops, void *arg)
2007 {
2008 	SLIST_INIT(&klist->kl_list);
2009 	klist->kl_ops = ops;
2010 	klist->kl_arg = arg;
2011 }
2012 
2013 void
2014 klist_free(struct klist *klist)
2015 {
2016 	KASSERT(SLIST_EMPTY(&klist->kl_list));
2017 }
2018 
2019 void
2020 klist_insert(struct klist *klist, struct knote *kn)
2021 {
2022 	int ls;
2023 
2024 	ls = klist_lock(klist);
2025 	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
2026 	klist_unlock(klist, ls);
2027 }
2028 
2029 void
2030 klist_insert_locked(struct klist *klist, struct knote *kn)
2031 {
2032 	KLIST_ASSERT_LOCKED(klist);
2033 
2034 	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
2035 }
2036 
2037 void
2038 klist_remove(struct klist *klist, struct knote *kn)
2039 {
2040 	int ls;
2041 
2042 	ls = klist_lock(klist);
2043 	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
2044 	klist_unlock(klist, ls);
2045 }
2046 
2047 void
2048 klist_remove_locked(struct klist *klist, struct knote *kn)
2049 {
2050 	KLIST_ASSERT_LOCKED(klist);
2051 
2052 	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
2053 }
2054 
2055 /*
2056  * Detach all knotes from klist. The knotes are rewired to indicate EOF.
2057  *
2058  * The caller of this function must not hold any locks that can block
2059  * filterops callbacks that run with KN_PROCESSING.
2060  * Otherwise this function might deadlock.
2061  */
2062 void
2063 klist_invalidate(struct klist *list)
2064 {
2065 	struct knote *kn;
2066 	struct kqueue *kq;
2067 	struct proc *p = curproc;
2068 	int ls;
2069 
2070 	NET_ASSERT_UNLOCKED();
2071 
2072 	ls = klist_lock(list);
2073 	while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
2074 		kq = kn->kn_kq;
2075 		mtx_enter(&kq->kq_lock);
2076 		if (!knote_acquire(kn, list, ls)) {
2077 			/* knote_acquire() has released kq_lock
2078 			 * and klist lock. */
2079 			ls = klist_lock(list);
2080 			continue;
2081 		}
2082 		mtx_leave(&kq->kq_lock);
2083 		klist_unlock(list, ls);
2084 		filter_detach(kn);
2085 		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
2086 			kn->kn_fop = &dead_filtops;
2087 			filter_event(kn, 0);
2088 			mtx_enter(&kq->kq_lock);
2089 			knote_activate(kn);
2090 			knote_release(kn);
2091 			mtx_leave(&kq->kq_lock);
2092 		} else {
2093 			knote_drop(kn, p);
2094 		}
2095 		ls = klist_lock(list);
2096 	}
2097 	klist_unlock(list, ls);
2098 }
2099 
2100 static int
2101 klist_lock(struct klist *list)
2102 {
2103 	int ls = 0;
2104 
2105 	if (list->kl_ops != NULL) {
2106 		ls = list->kl_ops->klo_lock(list->kl_arg);
2107 	} else {
2108 		KERNEL_LOCK();
2109 		ls = splhigh();
2110 	}
2111 	return ls;
2112 }
2113 
2114 static void
2115 klist_unlock(struct klist *list, int ls)
2116 {
2117 	if (list->kl_ops != NULL) {
2118 		list->kl_ops->klo_unlock(list->kl_arg, ls);
2119 	} else {
2120 		splx(ls);
2121 		KERNEL_UNLOCK();
2122 	}
2123 }
2124 
2125 static void
2126 klist_mutex_assertlk(void *arg)
2127 {
2128 	struct mutex *mtx = arg;
2129 
2130 	(void)mtx;
2131 
2132 	MUTEX_ASSERT_LOCKED(mtx);
2133 }
2134 
2135 static int
2136 klist_mutex_lock(void *arg)
2137 {
2138 	struct mutex *mtx = arg;
2139 
2140 	mtx_enter(mtx);
2141 	return 0;
2142 }
2143 
2144 static void
2145 klist_mutex_unlock(void *arg, int s)
2146 {
2147 	struct mutex *mtx = arg;
2148 
2149 	mtx_leave(mtx);
2150 }
2151 
2152 static const struct klistops mutex_klistops = {
2153 	.klo_assertlk	= klist_mutex_assertlk,
2154 	.klo_lock	= klist_mutex_lock,
2155 	.klo_unlock	= klist_mutex_unlock,
2156 };
2157 
2158 void
2159 klist_init_mutex(struct klist *klist, struct mutex *mtx)
2160 {
2161 	klist_init(klist, &mutex_klistops, mtx);
2162 }
2163 
2164 static void
2165 klist_rwlock_assertlk(void *arg)
2166 {
2167 	struct rwlock *rwl = arg;
2168 
2169 	(void)rwl;
2170 
2171 	rw_assert_wrlock(rwl);
2172 }
2173 
2174 static int
2175 klist_rwlock_lock(void *arg)
2176 {
2177 	struct rwlock *rwl = arg;
2178 
2179 	rw_enter_write(rwl);
2180 	return 0;
2181 }
2182 
2183 static void
2184 klist_rwlock_unlock(void *arg, int s)
2185 {
2186 	struct rwlock *rwl = arg;
2187 
2188 	rw_exit_write(rwl);
2189 }
2190 
2191 static const struct klistops rwlock_klistops = {
2192 	.klo_assertlk	= klist_rwlock_assertlk,
2193 	.klo_lock	= klist_rwlock_lock,
2194 	.klo_unlock	= klist_rwlock_unlock,
2195 };
2196 
2197 void
2198 klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
2199 {
2200 	klist_init(klist, &rwlock_klistops, rwl);
2201 }
2202