xref: /dflybsd-src/sys/kern/kern_event.c (revision 5b991541a99aa38e5ca17ac8e6abee49bd57ac56)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
27  * $DragonFly: src/sys/kern/kern_event.c,v 1.33 2007/02/03 17:05:57 corecode Exp $
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/proc.h>
34 #include <sys/malloc.h>
35 #include <sys/unistd.h>
36 #include <sys/file.h>
37 #include <sys/lock.h>
38 #include <sys/fcntl.h>
39 #include <sys/queue.h>
40 #include <sys/event.h>
41 #include <sys/eventvar.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/stat.h>
46 #include <sys/sysctl.h>
47 #include <sys/sysproto.h>
48 #include <sys/thread.h>
49 #include <sys/uio.h>
50 #include <sys/signalvar.h>
51 #include <sys/filio.h>
52 #include <sys/ktr.h>
53 
54 #include <sys/thread2.h>
55 #include <sys/file2.h>
56 #include <sys/mplock2.h>
57 
58 #include <vm/vm_zone.h>
59 
60 /*
61  * Global token for kqueue subsystem
62  */
63 struct lwkt_token kq_token = LWKT_TOKEN_UP_INITIALIZER(kq_token);
64 SYSCTL_INT(_lwkt, OID_AUTO, kq_mpsafe,
65 	   CTLFLAG_RW, &kq_token.t_flags, 0, "");
66 SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions,
67 	    CTLFLAG_RW, &kq_token.t_collisions, 0, "");
68 
69 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
70 
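/*
 * Private cookie handed to the kevent copyin/copyout callbacks so they
 * can track progress through the user-supplied changelist.
 */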
71 struct kevent_copyin_args {
72 	struct kevent_args	*ka;
73 	int			pchanges;
74 };
75 
76 static int	kqueue_sleep(struct kqueue *kq, struct timespec *tsp);
77 static int	kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
78 		    struct knote *marker);
79 static int 	kqueue_read(struct file *fp, struct uio *uio,
80 		    struct ucred *cred, int flags);
81 static int	kqueue_write(struct file *fp, struct uio *uio,
82 		    struct ucred *cred, int flags);
83 static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
84 		    struct ucred *cred, struct sysmsg *msg);
85 static int 	kqueue_kqfilter(struct file *fp, struct knote *kn);
86 static int 	kqueue_stat(struct file *fp, struct stat *st,
87 		    struct ucred *cred);
88 static int 	kqueue_close(struct file *fp);
89 static void	kqueue_wakeup(struct kqueue *kq);
90 static int	filter_attach(struct knote *kn);
91 static int	filter_event(struct knote *kn, long hint);
92 
93 /*
94  * MPSAFE
95  */
96 static struct fileops kqueueops = {
97 	.fo_read = kqueue_read,
98 	.fo_write = kqueue_write,
99 	.fo_ioctl = kqueue_ioctl,
100 	.fo_kqfilter = kqueue_kqfilter,
101 	.fo_stat = kqueue_stat,
102 	.fo_close = kqueue_close,
103 	.fo_shutdown = nofo_shutdown
104 };
105 
106 static void 	knote_attach(struct knote *kn);
107 static void 	knote_drop(struct knote *kn);
108 static void	knote_detach_and_drop(struct knote *kn);
109 static void 	knote_enqueue(struct knote *kn);
110 static void 	knote_dequeue(struct knote *kn);
111 static void 	knote_init(void);
112 static struct 	knote *knote_alloc(void);
113 static void 	knote_free(struct knote *kn);
114 
115 static void	filt_kqdetach(struct knote *kn);
116 static int	filt_kqueue(struct knote *kn, long hint);
117 static int	filt_procattach(struct knote *kn);
118 static void	filt_procdetach(struct knote *kn);
119 static int	filt_proc(struct knote *kn, long hint);
120 static int	filt_fileattach(struct knote *kn);
121 static void	filt_timerexpire(void *knx);
122 static int	filt_timerattach(struct knote *kn);
123 static void	filt_timerdetach(struct knote *kn);
124 static int	filt_timer(struct knote *kn, long hint);
125 
126 static struct filterops file_filtops =
127 	{ FILTEROP_ISFD, filt_fileattach, NULL, NULL };
128 static struct filterops kqread_filtops =
129 	{ FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue };
130 static struct filterops proc_filtops =
131 	{ 0, filt_procattach, filt_procdetach, filt_proc };
132 static struct filterops timer_filtops =
133 	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
134 
135 static vm_zone_t	knote_zone;
136 static int 		kq_ncallouts = 0;
137 static int 		kq_calloutmax = (4 * 1024);
138 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
139     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
140 static int		kq_checkloop = 1000000;
141 SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
142     &kq_checkloop, 0, "Maximum number of loops/events handled during kevent");
143 
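/*
 * Mark a knote active and enqueue it on its kqueue unless it is
 * already queued or currently disabled.
 */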
144 #define KNOTE_ACTIVATE(kn) do { 					\
145 	kn->kn_status |= KN_ACTIVE;					\
146 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
147 		knote_enqueue(kn);					\
148 } while(0)
149 
150 #define	KN_HASHSIZE		64		/* XXX should be tunable */
151 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
152 
153 extern struct filterops aio_filtops;
154 extern struct filterops sig_filtops;
155 
156 /*
157  * Table for all system-defined filters.
158  */
159 static struct filterops *sysfilt_ops[] = {
160 	&file_filtops,			/* EVFILT_READ */
161 	&file_filtops,			/* EVFILT_WRITE */
162 	&aio_filtops,			/* EVFILT_AIO */
163 	&file_filtops,			/* EVFILT_VNODE */
164 	&proc_filtops,			/* EVFILT_PROC */
165 	&sig_filtops,			/* EVFILT_SIGNAL */
166 	&timer_filtops,			/* EVFILT_TIMER */
167 	&file_filtops,			/* EVFILT_EXCEPT */
168 };
169 
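/*
 * Generic file-descriptor filter attach: hand the knote to the file's
 * fo_kqfilter op so the per-type filter can hook it up.
 */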
170 static int
171 filt_fileattach(struct knote *kn)
172 {
173 	return (fo_kqfilter(kn->kn_fp, kn));
174 }
175 
176 /*
177  * MPSAFE
178  */
179 static int
180 kqueue_kqfilter(struct file *fp, struct knote *kn)
181 {
182 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
183 
184 	if (kn->kn_filter != EVFILT_READ)
185 		return (EOPNOTSUPP);
186 
187 	kn->kn_fop = &kqread_filtops;
188 	knote_insert(&kq->kq_kqinfo.ki_note, kn);
189 	return (0);
190 }
191 
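/*
 * Detach a kqueue-read knote from the kqueue it monitors.
 */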
192 static void
193 filt_kqdetach(struct knote *kn)
194 {
195 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
196 
197 	knote_remove(&kq->kq_kqinfo.ki_note, kn);
198 }
199 
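/*
 * A kqueue-read knote is ready when the monitored kqueue has pending
 * events; the count is reported in kn_data.
 */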
200 /*ARGSUSED*/
201 static int
202 filt_kqueue(struct knote *kn, long hint)
203 {
204 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
205 
206 	kn->kn_data = kq->kq_count;
207 	return (kn->kn_data > 0);
208 }
209 
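/*
 * Attach a process filter: look up the target pid (falling back to the
 * zombie list when NOTE_EXIT is requested) and hook the knote onto the
 * process's klist.
 */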
210 static int
211 filt_procattach(struct knote *kn)
212 {
213 	struct proc *p;
214 	int immediate;
215 
216 	immediate = 0;
217 	lwkt_gettoken(&proc_token);
218 	p = pfind(kn->kn_id);
219 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
220 		p = zpfind(kn->kn_id);
221 		immediate = 1;
222 	}
223 	if (p == NULL) {
224 		lwkt_reltoken(&proc_token);
225 		return (ESRCH);
226 	}
227 	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
228 		lwkt_reltoken(&proc_token);
229 		return (EACCES);
230 	}
231 
232 	kn->kn_ptr.p_proc = p;
233 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
234 
235 	/*
236 	 * internal flag indicating registration done by kernel
237 	 */
238 	if (kn->kn_flags & EV_FLAG1) {
239 		kn->kn_data = kn->kn_sdata;		/* ppid */
240 		kn->kn_fflags = NOTE_CHILD;
241 		kn->kn_flags &= ~EV_FLAG1;
242 	}
243 
244 	knote_insert(&p->p_klist, kn);
245 
246 	/*
247 	 * Immediately activate any exit notes if the target process is a
248 	 * zombie.  This is necessary to handle the case where the target
249 	 * process, e.g. a child, dies before the kevent is registered.
250 	 */
251 	if (immediate && filt_proc(kn, NOTE_EXIT))
252 		KNOTE_ACTIVATE(kn);
253 	lwkt_reltoken(&proc_token);
254 
255 	return (0);
256 }
257 
258 /*
259  * The knote may be attached to a different process, which may exit,
260  * leaving nothing for the knote to be attached to.  So when the process
261  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
262  * it will be deleted when read out.  However, as part of the knote deletion,
263  * this routine is called, so a check is needed to avoid actually performing
264  * a detach, because the original process does not exist any more.
265  */
266 static void
267 filt_procdetach(struct knote *kn)
268 {
269 	struct proc *p;
270 
271 	if (kn->kn_status & KN_DETACHED)
272 		return;
273 	/* XXX locking? take proc_token here? */
274 	p = kn->kn_ptr.p_proc;
275 	knote_remove(&p->p_klist, kn);
276 }
277 
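/*
 * Process filter event: record the NOTE_* events the user asked for,
 * handle NOTE_EXIT by detaching and flagging EV_EOF|EV_ONESHOT, and
 * attach a tracking knote to forked children when NOTE_TRACK is set.
 */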
278 static int
279 filt_proc(struct knote *kn, long hint)
280 {
281 	u_int event;
282 
283 	/*
284 	 * mask off extra data
285 	 */
286 	event = (u_int)hint & NOTE_PCTRLMASK;
287 
288 	/*
289 	 * if the user is interested in this event, record it.
290 	 */
291 	if (kn->kn_sfflags & event)
292 		kn->kn_fflags |= event;
293 
294 	/*
295 	 * Process is gone, so flag the event as finished.  Detach the
296 	 * knote from the process now because the process will be poof,
297 	 * gone later on.
298 	 */
299 	if (event == NOTE_EXIT) {
300 		struct proc *p = kn->kn_ptr.p_proc;
301 		if ((kn->kn_status & KN_DETACHED) == 0) {
302 			knote_remove(&p->p_klist, kn);
303 			kn->kn_status |= KN_DETACHED;
304 			kn->kn_data = p->p_xstat;
305 			kn->kn_ptr.p_proc = NULL;
306 		}
307 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
308 		return (1);
309 	}
310 
311 	/*
312 	 * process forked, and user wants to track the new process,
313 	 * so attach a new knote to it, and immediately report an
314 	 * event with the parent's pid.
315 	 */
316 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
317 		struct kevent kev;
318 		int error;
319 
320 		/*
321 		 * register knote with new process.
322 		 */
323 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
324 		kev.filter = kn->kn_filter;
325 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
326 		kev.fflags = kn->kn_sfflags;
327 		kev.data = kn->kn_id;			/* parent */
328 		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
329 		error = kqueue_register(kn->kn_kq, &kev);
330 		if (error)
331 			kn->kn_fflags |= NOTE_TRACKERR;
332 	}
333 
334 	return (kn->kn_fflags != 0);
335 }
336 
337 /*
338  * The callout interlocks with callout_stop() (or should), so the
339  * knote should still be a valid structure.  However the timeout
340  * can race a deletion so if KN_DELETING is set we just don't touch
341  * the knote.
342  */
343 static void
344 filt_timerexpire(void *knx)
345 {
346 	struct knote *kn = knx;
347 	struct callout *calloutp;
348 	struct timeval tv;
349 	int tticks;
350 
351 	lwkt_gettoken(&kq_token);
352 	if ((kn->kn_status & KN_DELETING) == 0) {
353 		kn->kn_data++;
354 		KNOTE_ACTIVATE(kn);
355 
356 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
357 			tv.tv_sec = kn->kn_sdata / 1000;
358 			tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
359 			tticks = tvtohz_high(&tv);
360 			calloutp = (struct callout *)kn->kn_hook;
361 			callout_reset(calloutp, tticks, filt_timerexpire, kn);
362 		}
363 	}
364 	lwkt_reltoken(&kq_token);
365 }
366 
367 /*
368  * data contains amount of time to sleep, in milliseconds
369  */
370 static int
371 filt_timerattach(struct knote *kn)
372 {
373 	struct callout *calloutp;
374 	struct timeval tv;
375 	int tticks;
376 
377 	if (kq_ncallouts >= kq_calloutmax)
378 		return (ENOMEM);
379 	kq_ncallouts++;
380 
381 	tv.tv_sec = kn->kn_sdata / 1000;
382 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
383 	tticks = tvtohz_high(&tv);
384 
385 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
386 	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
387 	    M_KQUEUE, M_WAITOK);
388 	callout_init(calloutp);
389 	kn->kn_hook = (caddr_t)calloutp;
390 	callout_reset(calloutp, tticks, filt_timerexpire, kn);
391 
392 	return (0);
393 }
394 
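/*
 * Stop and free the callout backing a timer knote.
 */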
395 static void
396 filt_timerdetach(struct knote *kn)
397 {
398 	struct callout *calloutp;
399 
400 	calloutp = (struct callout *)kn->kn_hook;
401 	callout_stop(calloutp);
402 	FREE(calloutp, M_KQUEUE);
403 	kq_ncallouts--;
404 }
405 
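/*
 * A timer knote is active once its expiration count (kn_data) is
 * non-zero.
 */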
406 static int
407 filt_timer(struct knote *kn, long hint)
408 {
409 
410 	return (kn->kn_data != 0);
411 }
412 
413 /*
414  * Acquire a knote, return non-zero on success, 0 on failure.
415  *
416  * If we cannot acquire the knote we sleep and return 0.  The knote
417  * may be stale on return in this case and the caller must restart
418  * whatever loop they are in.
419  */
420 static __inline
421 int
422 knote_acquire(struct knote *kn)
423 {
424 	if (kn->kn_status & KN_PROCESSING) {
425 		kn->kn_status |= KN_WAITING | KN_REPROCESS;
426 		tsleep(kn, 0, "kqepts", hz);
427 		/* knote may be stale now */
428 		return(0);
429 	}
430 	kn->kn_status |= KN_PROCESSING;
431 	return(1);
432 }
433 
434 /*
435  * Release an acquired knote, clearing KN_PROCESSING and handling any
436  * KN_REPROCESS events.
437  *
438  * Non-zero is returned if the knote is destroyed.
439  */
440 static __inline
441 int
442 knote_release(struct knote *kn)
443 {
444 	while (kn->kn_status & KN_REPROCESS) {
445 		kn->kn_status &= ~KN_REPROCESS;
446 		if (kn->kn_status & KN_WAITING) {
447 			kn->kn_status &= ~KN_WAITING;
448 			wakeup(kn);
449 		}
450 		if (kn->kn_status & KN_DELETING) {
451 			knote_detach_and_drop(kn);
452 			return(1);
453 			/* NOT REACHED */
454 		}
455 		if (filter_event(kn, 0))
456 			KNOTE_ACTIVATE(kn);
457 	}
458 	kn->kn_status &= ~KN_PROCESSING;
459 	return(0);
460 }
461 
462 /*
463  * Initialize a kqueue.
464  *
465  * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
466  *
467  * MPSAFE
468  */
469 void
470 kqueue_init(struct kqueue *kq, struct filedesc *fdp)
471 {
472 	TAILQ_INIT(&kq->kq_knpend);
473 	TAILQ_INIT(&kq->kq_knlist);
474 	kq->kq_count = 0;
475 	kq->kq_fdp = fdp;
476 	SLIST_INIT(&kq->kq_kqinfo.ki_note);
477 }
478 
479 /*
480  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
481  * caller (it might be embedded in a lwp so we don't do it here).
482  *
483  * The kq's knlist must be completely eradicated so block on any
484  * processing races.
485  */
486 void
487 kqueue_terminate(struct kqueue *kq)
488 {
489 	struct knote *kn;
490 
491 	lwkt_gettoken(&kq_token);
492 	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
493 		if (knote_acquire(kn))
494 			knote_detach_and_drop(kn);
495 	}
496 	if (kq->kq_knhash) {
497 		kfree(kq->kq_knhash, M_KQUEUE);
498 		kq->kq_knhash = NULL;
499 		kq->kq_knhashmask = 0;
500 	}
501 	lwkt_reltoken(&kq_token);
502 }
503 
504 /*
505  * MPSAFE
506  */
507 int
508 sys_kqueue(struct kqueue_args *uap)
509 {
510 	struct thread *td = curthread;
511 	struct kqueue *kq;
512 	struct file *fp;
513 	int fd, error;
514 
515 	error = falloc(td->td_lwp, &fp, &fd);
516 	if (error)
517 		return (error);
518 	fp->f_flag = FREAD | FWRITE;
519 	fp->f_type = DTYPE_KQUEUE;
520 	fp->f_ops = &kqueueops;
521 
522 	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
523 	kqueue_init(kq, td->td_proc->p_fd);
524 	fp->f_data = kq;
525 
526 	fsetfd(kq->kq_fdp, fp, fd);
527 	uap->sysmsg_result = fd;
528 	fdrop(fp);
529 	return (error);
530 }
531 
532 /*
533  * Copy 'count' items into the destination list pointed to by uap->eventlist.
534  */
535 static int
536 kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
537 {
538 	struct kevent_copyin_args *kap;
539 	int error;
540 
541 	kap = (struct kevent_copyin_args *)arg;
542 
543 	error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
544 	if (error == 0) {
545 		kap->ka->eventlist += count;
546 		*res += count;
547 	} else {
548 		*res = -1;
549 	}
550 
551 	return (error);
552 }
553 
554 /*
555  * Copy at most 'max' items from the list pointed to by kap->changelist,
556  * return number of items in 'events'.
557  */
558 static int
559 kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
560 {
561 	struct kevent_copyin_args *kap;
562 	int error, count;
563 
564 	kap = (struct kevent_copyin_args *)arg;
565 
566 	count = min(kap->ka->nchanges - kap->pchanges, max);
567 	error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
568 	if (error == 0) {
569 		kap->ka->changelist += count;
570 		kap->pchanges += count;
571 		*events = count;
572 	}
573 
574 	return (error);
575 }
576 
577 /*
578  * MPSAFE
579  */
580 int
581 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
582 	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
583 	    struct timespec *tsp_in)
584 {
585 	struct kevent *kevp;
586 	struct timespec *tsp;
587 	int i, n, total, error, nerrors = 0;
588 	int lres;
589 	int limit = kq_checkloop;
590 	struct kevent kev[KQ_NEVENTS];
591 	struct knote marker;
592 
593 	tsp = tsp_in;
594 	*res = 0;
595 
596 	lwkt_gettoken(&kq_token);
597 	for ( ;; ) {
598 		n = 0;
599 		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
600 		if (error)
601 			goto done;
602 		if (n == 0)
603 			break;
604 		for (i = 0; i < n; i++) {
605 			kevp = &kev[i];
606 			kevp->flags &= ~EV_SYSFLAGS;
607 			error = kqueue_register(kq, kevp);
608 
609 			/*
610 			 * If a registration returns an error we
611 			 * immediately post the error.  The kevent()
612 			 * call itself will fail with the error if
613 			 * no space is available for posting.
614 			 *
615 			 * Such errors normally bypass the timeout/blocking
616 			 * code.  However, if the copyoutfn function refuses
617 			 * to post the error (see sys_poll()), then we
618 			 * ignore it too.
619 			 */
620 			if (error) {
621 				kevp->flags = EV_ERROR;
622 				kevp->data = error;
623 				lres = *res;
624 				kevent_copyoutfn(uap, kevp, 1, res);
625 				if (lres != *res) {
626 					nevents--;
627 					nerrors++;
628 				}
629 			}
630 		}
631 	}
632 	if (nerrors) {
633 		error = 0;
634 		goto done;
635 	}
636 
637 	/*
638 	 * Acquire/wait for events - setup timeout
639 	 */
640 	if (tsp != NULL) {
641 		struct timespec ats;
642 
643 		if (tsp->tv_sec || tsp->tv_nsec) {
644 			nanouptime(&ats);
645 			timespecadd(tsp, &ats);		/* tsp = target time */
646 		}
647 	}
648 
649 	/*
650 	 * Loop as required.
651 	 *
652 	 * Collect as many events as we can. Sleeping on successive
653 	 * loops is disabled if copyoutfn has incremented (*res).
654 	 *
655 	 * The loop stops if an error occurs, all events have been
656 	 * scanned (the marker has been reached), or fewer than the
657 	 * maximum number of events is found.
658 	 *
659 	 * The copyoutfn function does not have to increment (*res) in
660 	 * order for the loop to continue.
661 	 *
662 	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
663 	 */
664 	total = 0;
665 	error = 0;
666 	marker.kn_filter = EVFILT_MARKER;
667 	marker.kn_status = KN_PROCESSING;
668 	TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
669 	while ((n = nevents - total) > 0) {
670 		if (n > KQ_NEVENTS)
671 			n = KQ_NEVENTS;
672 
673 		/*
674 		 * If no events are pending sleep until timeout (if any)
675 		 * or an event occurs.
676 		 *
677 		 * After the sleep completes the marker is moved to the
678 		 * end of the list, making any received events available
679 		 * to our scan.
680 		 */
681 		if (kq->kq_count == 0 && *res == 0) {
682 			error = kqueue_sleep(kq, tsp);
683 			if (error)
684 				break;
685 
686 			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
687 			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
688 		}
689 
690 		/*
691 		 * Process all received events
692 		 * Account for all non-spurious events in our total
693 		 */
694 		i = kqueue_scan(kq, kev, n, &marker);
695 		if (i) {
696 			lres = *res;
697 			error = kevent_copyoutfn(uap, kev, i, res);
698 			total += *res - lres;
699 			if (error)
700 				break;
701 		}
702 		if (limit && --limit == 0)
703 			panic("kqueue: checkloop failed i=%d", i);
704 
705 		/*
706 		 * Normally when fewer events are returned than requested
707 		 * we can stop.  However, if only spurious events were
708 		 * collected the copyout will not bump (*res) and we have
709 		 * to continue.
710 		 */
711 		if (i < n && *res)
712 			break;
713 
714 		/*
715 		 * Deal with an edge case where spurious events can cause
716 		 * a loop to occur without moving the marker.  This can
717 		 * prevent kqueue_scan() from picking up new events which
718 		 * race us.  We must be sure to move the marker for this
719 		 * case.
720 		 *
721 		 * NOTE: We do not want to move the marker if events
722 		 *	 were scanned because normal kqueue operations
723 		 *	 may reactivate events.  Moving the marker in
724 		 *	 that case could result in duplicates for the
725 		 *	 same event.
726 		 */
727 		if (i == 0) {
728 			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
729 			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
730 		}
731 	}
732 	TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
733 
734 	/* Timeouts do not return EWOULDBLOCK. */
735 	if (error == EWOULDBLOCK)
736 		error = 0;
737 
738 done:
739 	lwkt_reltoken(&kq_token);
740 	return (error);
741 }
742 
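/*
 * For reference, a minimal userland sketch of the syscall implemented
 * below (not part of this file; "fd" is assumed to be some open
 * descriptor being watched for readability):
 *
 *	int kq = kqueue();
 *	struct kevent ch, ev;
 *
 *	EV_SET(&ch, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &ch, 1, &ev, 1, NULL);
 *
 * On return n is 1 and "ev" describes the triggered event, or n is -1
 * on error.
 */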
743 /*
744  * MPALMOSTSAFE
745  */
746 int
747 sys_kevent(struct kevent_args *uap)
748 {
749 	struct thread *td = curthread;
750 	struct proc *p = td->td_proc;
751 	struct timespec ts, *tsp;
752 	struct kqueue *kq;
753 	struct file *fp = NULL;
754 	struct kevent_copyin_args *kap, ka;
755 	int error;
756 
757 	if (uap->timeout) {
758 		error = copyin(uap->timeout, &ts, sizeof(ts));
759 		if (error)
760 			return (error);
761 		tsp = &ts;
762 	} else {
763 		tsp = NULL;
764 	}
765 
766 	fp = holdfp(p->p_fd, uap->fd, -1);
767 	if (fp == NULL)
768 		return (EBADF);
769 	if (fp->f_type != DTYPE_KQUEUE) {
770 		fdrop(fp);
771 		return (EBADF);
772 	}
773 
774 	kq = (struct kqueue *)fp->f_data;
775 
776 	kap = &ka;
777 	kap->ka = uap;
778 	kap->pchanges = 0;
779 
780 	error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
781 			    kevent_copyin, kevent_copyout, tsp);
782 
783 	fdrop(fp);
784 
785 	return (error);
786 }
787 
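/*
 * Register a single kevent change against a kqueue: locate (or allocate
 * and attach) the matching knote, apply the EV_ADD/EV_DELETE/EV_DISABLE/
 * EV_ENABLE semantics, and run the filter so an already-pending event is
 * activated immediately.
 */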
788 int
789 kqueue_register(struct kqueue *kq, struct kevent *kev)
790 {
791 	struct filedesc *fdp = kq->kq_fdp;
792 	struct filterops *fops;
793 	struct file *fp = NULL;
794 	struct knote *kn = NULL;
795 	int error = 0;
796 
797 	if (kev->filter < 0) {
798 		if (kev->filter + EVFILT_SYSCOUNT < 0)
799 			return (EINVAL);
800 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
801 	} else {
802 		/*
803 		 * XXX
804 		 * filter attach routine is responsible for ensuring that
805 		 * the identifier can be attached to it.
806 		 */
807 		kprintf("unknown filter: %d\n", kev->filter);
808 		return (EINVAL);
809 	}
810 
811 	lwkt_gettoken(&kq_token);
812 	if (fops->f_flags & FILTEROP_ISFD) {
813 		/* validate descriptor */
814 		fp = holdfp(fdp, kev->ident, -1);
815 		if (fp == NULL) {
816 			lwkt_reltoken(&kq_token);
817 			return (EBADF);
818 		}
819 
820 again1:
821 		SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
822 			if (kn->kn_kq == kq &&
823 			    kn->kn_filter == kev->filter &&
824 			    kn->kn_id == kev->ident) {
825 				if (knote_acquire(kn) == 0)
826 					goto again1;
827 				break;
828 			}
829 		}
830 	} else {
831 		if (kq->kq_knhashmask) {
832 			struct klist *list;
833 
834 			list = &kq->kq_knhash[
835 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
836 again2:
837 			SLIST_FOREACH(kn, list, kn_link) {
838 				if (kn->kn_id == kev->ident &&
839 				    kn->kn_filter == kev->filter) {
840 					if (knote_acquire(kn) == 0)
841 						goto again2;
842 					break;
843 				}
844 			}
845 		}
846 	}
847 
848 	/*
849 	 * NOTE: At this point if kn is non-NULL we will have acquired
850 	 *	 it and set KN_PROCESSING.
851 	 */
852 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
853 		error = ENOENT;
854 		goto done;
855 	}
856 
857 	/*
858 	 * kn now contains the matching knote, or NULL if no match
859 	 */
860 	if (kev->flags & EV_ADD) {
861 		if (kn == NULL) {
862 			kn = knote_alloc();
863 			if (kn == NULL) {
864 				error = ENOMEM;
865 				goto done;
866 			}
867 			kn->kn_fp = fp;
868 			kn->kn_kq = kq;
869 			kn->kn_fop = fops;
870 
871 			/*
872 			 * apply reference count to knote structure, and
873 			 * do not release it at the end of this routine.
874 			 */
875 			fp = NULL;
876 
877 			kn->kn_sfflags = kev->fflags;
878 			kn->kn_sdata = kev->data;
879 			kev->fflags = 0;
880 			kev->data = 0;
881 			kn->kn_kevent = *kev;
882 
883 			/*
884 			 * KN_PROCESSING prevents the knote from getting
885 			 * ripped out from under us while we are trying
886 			 * to attach it, in case the attach blocks.
887 			 */
888 			kn->kn_status = KN_PROCESSING;
889 			knote_attach(kn);
890 			if ((error = filter_attach(kn)) != 0) {
891 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
892 				knote_drop(kn);
893 				goto done;
894 			}
895 
896 			/*
897 			 * Interlock against close races which either tried
898 			 * to remove our knote while we were blocked or missed
899 			 * it entirely prior to our attachment.  We do not
900 			 * want to end up with a knote on a closed descriptor.
901 			 */
902 			if ((fops->f_flags & FILTEROP_ISFD) &&
903 			    checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
904 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
905 			}
906 		} else {
907 			/*
908 			 * The user may change some filter values after the
909 			 * initial EV_ADD, but doing so will not reset any
910 			 * filter which has already been triggered.
911 			 */
912 			KKASSERT(kn->kn_status & KN_PROCESSING);
913 			kn->kn_sfflags = kev->fflags;
914 			kn->kn_sdata = kev->data;
915 			kn->kn_kevent.udata = kev->udata;
916 		}
917 
918 		/*
919 		 * Execute the filter event to immediately activate the
920 		 * knote if necessary.  If reprocessing events are pending
921 		 * due to blocking above we do not run the filter here
922 		 * but instead let knote_release() do it.  Otherwise we
923 		 * might run the filter on a deleted event.
924 		 */
925 		if ((kn->kn_status & KN_REPROCESS) == 0) {
926 			if (filter_event(kn, 0))
927 				KNOTE_ACTIVATE(kn);
928 		}
929 	} else if (kev->flags & EV_DELETE) {
930 		/*
931 		 * Delete the existing knote
932 		 */
933 		knote_detach_and_drop(kn);
934 		goto done;
935 	}
936 
937 	/*
938 	 * Disablement does not deactivate a knote here.
939 	 */
940 	if ((kev->flags & EV_DISABLE) &&
941 	    ((kn->kn_status & KN_DISABLED) == 0)) {
942 		kn->kn_status |= KN_DISABLED;
943 	}
944 
945 	/*
946 	 * Re-enablement may have to immediately enqueue an active knote.
947 	 */
948 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
949 		kn->kn_status &= ~KN_DISABLED;
950 		if ((kn->kn_status & KN_ACTIVE) &&
951 		    ((kn->kn_status & KN_QUEUED) == 0)) {
952 			knote_enqueue(kn);
953 		}
954 	}
955 
956 	/*
957 	 * Handle any required reprocessing
958 	 */
959 	knote_release(kn);
960 	/* kn may be invalid now */
961 
962 done:
963 	lwkt_reltoken(&kq_token);
964 	if (fp != NULL)
965 		fdrop(fp);
966 	return (error);
967 }
968 
969 /*
970  * Block as necessary until the target time is reached.
971  * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
972  * 0 we do not block at all.
973  */
974 static int
975 kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
976 {
977 	int error = 0;
978 
979 	if (tsp == NULL) {
980 		kq->kq_state |= KQ_SLEEP;
981 		error = tsleep(kq, PCATCH, "kqread", 0);
982 	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
983 		error = EWOULDBLOCK;
984 	} else {
985 		struct timespec ats;
986 		struct timespec atx = *tsp;
987 		int timeout;
988 
989 		nanouptime(&ats);
990 		timespecsub(&atx, &ats);
991 		if (atx.tv_sec < 0) {
992 			error = EWOULDBLOCK;
993 		} else {
994 			timeout = atx.tv_sec > 24 * 60 * 60 ?
995 				24 * 60 * 60 * hz : tstohz_high(&atx);
996 			kq->kq_state |= KQ_SLEEP;
997 			error = tsleep(kq, PCATCH, "kqread", timeout);
998 		}
999 	}
1000 
1001 	/* don't restart after signals... */
1002 	if (error == ERESTART)
1003 		return (EINTR);
1004 
1005 	return (error);
1006 }
1007 
1008 /*
1009  * Scan the kqueue, return the number of active events placed in kevp up
1010  * to count.
1011  *
1012  * Continuous mode events may get recycled, do not continue scanning past
1013  * marker unless no events have been collected.
1014  */
1015 static int
1016 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
1017             struct knote *marker)
1018 {
1019 	struct knote *kn, local_marker;
1020 	int total;
1021 
1022 	total = 0;
1023 	local_marker.kn_filter = EVFILT_MARKER;
1024 	local_marker.kn_status = KN_PROCESSING;
1025 
1026 	/*
1027 	 * Collect events.
1028 	 */
1029 	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
1030 	while (count) {
1031 		kn = TAILQ_NEXT(&local_marker, kn_tqe);
1032 		if (kn->kn_filter == EVFILT_MARKER) {
1033 			/* Marker reached, we are done */
1034 			if (kn == marker)
1035 				break;
1036 
1037 			/* Move local marker past some other thread's marker */
1038 			kn = TAILQ_NEXT(kn, kn_tqe);
1039 			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1040 			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
1041 			continue;
1042 		}
1043 
1044 		/*
1045 		 * We can't skip a knote undergoing processing, otherwise
1046 		 * we risk not returning it when the user process expects
1047 		 * it to be returned.  Sleep and retry.
1048 		 */
1049 		if (knote_acquire(kn) == 0)
1050 			continue;
1051 
1052 		/*
1053 		 * Remove the event for processing.
1054 		 *
1055 		 * WARNING!  We must leave KN_QUEUED set to prevent the
1056 		 *	     event from being KNOTE_ACTIVATE()d while
1057 		 *	     the queue state is in limbo, in case we
1058 		 *	     block.
1059 		 *
1060 		 * WARNING!  We must set KN_PROCESSING to avoid races
1061 		 *	     against deletion or another thread's
1062 		 *	     processing.
1063 		 */
1064 		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1065 		kq->kq_count--;
1066 
1067 		/*
1068 		 * We have to deal with an extremely important race against
1069 		 * file descriptor close()s here.  The file descriptor can
1070 		 * disappear MPSAFE, and there is a small window of
1071 		 * opportunity between that and the call to knote_fdclose().
1072 		 *
1073 		 * If we hit that window here while doselect or dopoll is
1074 		 * trying to delete a spurious event they will not be able
1075 		 * to match up the event against a knote and will go haywire.
1076 		 */
1077 		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1078 		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
1079 			kn->kn_status |= KN_DELETING | KN_REPROCESS;
1080 		}
1081 
1082 		if (kn->kn_status & KN_DISABLED) {
1083 			/*
1084 			 * If disabled we ensure the event is not queued
1085 			 * but leave its active bit set.  On re-enablement
1086 			 * the event may be immediately triggered.
1087 			 */
1088 			kn->kn_status &= ~KN_QUEUED;
1089 		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
1090 			   (kn->kn_status & KN_DELETING) == 0 &&
1091 			   filter_event(kn, 0) == 0) {
1092 			/*
1093 			 * If not running in one-shot mode and the event
1094 			 * is no longer present we ensure it is removed
1095 			 * from the queue and ignore it.
1096 			 */
1097 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1098 		} else {
1099 			/*
1100 			 * Post the event
1101 			 */
1102 			*kevp++ = kn->kn_kevent;
1103 			++total;
1104 			--count;
1105 
1106 			if (kn->kn_flags & EV_ONESHOT) {
1107 				kn->kn_status &= ~KN_QUEUED;
1108 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
1109 			} else if (kn->kn_flags & EV_CLEAR) {
1110 				kn->kn_data = 0;
1111 				kn->kn_fflags = 0;
1112 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1113 			} else {
1114 				TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1115 				kq->kq_count++;
1116 			}
1117 		}
1118 
1119 		/*
1120 		 * Handle any post-processing states
1121 		 */
1122 		knote_release(kn);
1123 	}
1124 	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1125 
1126 	return (total);
1127 }
1128 
1129 /*
1130  * XXX
1131  * This could be expanded to call kqueue_scan, if desired.
1132  *
1133  * MPSAFE
1134  */
1135 static int
1136 kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1137 {
1138 	return (ENXIO);
1139 }
1140 
1141 /*
1142  * MPSAFE
1143  */
1144 static int
1145 kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1146 {
1147 	return (ENXIO);
1148 }
1149 
1150 /*
1151  * MPALMOSTSAFE
1152  */
1153 static int
1154 kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
1155 	     struct ucred *cred, struct sysmsg *msg)
1156 {
1157 	struct kqueue *kq;
1158 	int error;
1159 
1160 	lwkt_gettoken(&kq_token);
1161 	kq = (struct kqueue *)fp->f_data;
1162 
1163 	switch(com) {
1164 	case FIOASYNC:
1165 		if (*(int *)data)
1166 			kq->kq_state |= KQ_ASYNC;
1167 		else
1168 			kq->kq_state &= ~KQ_ASYNC;
1169 		error = 0;
1170 		break;
1171 	case FIOSETOWN:
1172 		error = fsetown(*(int *)data, &kq->kq_sigio);
1173 		break;
1174 	default:
1175 		error = ENOTTY;
1176 		break;
1177 	}
1178 	lwkt_reltoken(&kq_token);
1179 	return (error);
1180 }
1181 
1182 /*
1183  * MPSAFE
1184  */
1185 static int
1186 kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
1187 {
1188 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1189 
1190 	bzero((void *)st, sizeof(*st));
1191 	st->st_size = kq->kq_count;
1192 	st->st_blksize = sizeof(struct kevent);
1193 	st->st_mode = S_IFIFO;
1194 	return (0);
1195 }
1196 
1197 /*
1198  * MPSAFE
1199  */
1200 static int
1201 kqueue_close(struct file *fp)
1202 {
1203 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1204 
1205 	kqueue_terminate(kq);
1206 
1207 	fp->f_data = NULL;
1208 	funsetown(kq->kq_sigio);
1209 
1210 	kfree(kq, M_KQUEUE);
1211 	return (0);
1212 }
1213 
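/*
 * Wake up any thread sleeping in kqueue_sleep() and notify knotes
 * monitoring this kqueue (e.g. EVFILT_READ on the kqueue itself).
 */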
1214 static void
1215 kqueue_wakeup(struct kqueue *kq)
1216 {
1217 	if (kq->kq_state & KQ_SLEEP) {
1218 		kq->kq_state &= ~KQ_SLEEP;
1219 		wakeup(kq);
1220 	}
1221 	KNOTE(&kq->kq_kqinfo.ki_note, 0);
1222 }
1223 
1224 /*
1225  * Calls filterops f_attach function, acquiring mplock if filter is not
1226  * marked as FILTEROP_MPSAFE.
1227  */
1228 static int
1229 filter_attach(struct knote *kn)
1230 {
1231 	int ret;
1232 
1233 	if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
1234 		get_mplock();
1235 		ret = kn->kn_fop->f_attach(kn);
1236 		rel_mplock();
1237 	} else {
1238 		ret = kn->kn_fop->f_attach(kn);
1239 	}
1240 
1241 	return (ret);
1242 }
1243 
1244 /*
1245  * Detach the knote and drop it, destroying the knote.
1246  *
1247  * Calls filterops f_detach function, acquiring mplock if filter is not
1248  * marked as FILTEROP_MPSAFE.
1249  */
1250 static void
1251 knote_detach_and_drop(struct knote *kn)
1252 {
1253 	kn->kn_status |= KN_DELETING | KN_REPROCESS;
1254 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1255 		kn->kn_fop->f_detach(kn);
1256 	} else {
1257 		get_mplock();
1258 		kn->kn_fop->f_detach(kn);
1259 		rel_mplock();
1260 	}
1261 	knote_drop(kn);
1262 }
1263 
1264 /*
1265  * Calls filterops f_event function, acquiring mplock if filter is not
1266  * marked as FILTEROP_MPSAFE.
1267  *
1268  * If the knote is in the middle of being created or deleted we cannot
1269  * safely call the filter op.
1270  */
1271 static int
1272 filter_event(struct knote *kn, long hint)
1273 {
1274 	int ret;
1275 
1276 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1277 		ret = kn->kn_fop->f_event(kn, hint);
1278 	} else {
1279 		get_mplock();
1280 		ret = kn->kn_fop->f_event(kn, hint);
1281 		rel_mplock();
1282 	}
1283 	return (ret);
1284 }
1285 
1286 /*
1287  * Walk down a list of knotes, activating them if their event has triggered.
1288  *
1289  * If we encounter any knotes which are undergoing processing we just mark
1290  * them for reprocessing and do not try to [re]activate the knote.  However,
1291  * if a hint is being passed we have to wait and that makes things a bit
1292  * sticky.
1293  */
1294 void
1295 knote(struct klist *list, long hint)
1296 {
1297 	struct knote *kn;
1298 
1299 	lwkt_gettoken(&kq_token);
1300 restart:
1301 	SLIST_FOREACH(kn, list, kn_next) {
1302 		if (kn->kn_status & KN_PROCESSING) {
1303 			/*
1304 			 * Someone else is processing the knote, ask the
1305 			 * other thread to reprocess it and don't mess
1306 			 * with it otherwise.
1307 			 */
1308 			if (hint == 0) {
1309 				kn->kn_status |= KN_REPROCESS;
1310 				continue;
1311 			}
1312 
1313 			/*
1314 			 * If the hint is non-zero we have to wait or risk
1315 			 * losing the state the caller is trying to update.
1316 			 *
1317 			 * XXX This is a real problem, certain process
1318 			 *     and signal filters will bump kn_data for
1319 			 *     already-processed notes more than once if
1320 			 *     we restart the list scan.  FIXME.
1321 			 */
1322 			kn->kn_status |= KN_WAITING | KN_REPROCESS;
1323 			tsleep(kn, 0, "knotec", hz);
1324 			goto restart;
1325 		}
1326 
1327 		/*
1328 		 * Become the reprocessing master ourselves.
1329 		 *
1330 		 * If hint is non-zero, running the event is mandatory
1331 		 * when not deleting so do it whether reprocessing is
1332 		 * set or not.
1333 		 */
1334 		kn->kn_status |= KN_PROCESSING;
1335 		if ((kn->kn_status & KN_DELETING) == 0) {
1336 			if (filter_event(kn, hint))
1337 				KNOTE_ACTIVATE(kn);
1338 		}
1339 		if (knote_release(kn))
1340 			goto restart;
1341 	}
1342 	lwkt_reltoken(&kq_token);
1343 }
1344 
1345 /*
1346  * Insert knote at head of klist.
1347  *
1348  * This function may only be called via a filter function and thus
1349  * kq_token should already be held and marked for processing.
1350  */
1351 void
1352 knote_insert(struct klist *klist, struct knote *kn)
1353 {
1354 	KKASSERT(kn->kn_status & KN_PROCESSING);
1355 	ASSERT_LWKT_TOKEN_HELD(&kq_token);
1356 	SLIST_INSERT_HEAD(klist, kn, kn_next);
1357 }
1358 
1359 /*
1360  * Remove knote from a klist
1361  *
1362  * This function may only be called via a filter function and thus
1363  * kq_token should already be held and marked for processing.
1364  */
1365 void
1366 knote_remove(struct klist *klist, struct knote *kn)
1367 {
1368 	KKASSERT(kn->kn_status & KN_PROCESSING);
1369 	ASSERT_LWKT_TOKEN_HELD(&kq_token);
1370 	SLIST_REMOVE(klist, kn, knote, kn_next);
1371 }
1372 
1373 /*
1374  * Remove all knotes from a specified klist
1375  *
1376  * Only called from aio.
1377  */
1378 void
1379 knote_empty(struct klist *list)
1380 {
1381 	struct knote *kn;
1382 
1383 	lwkt_gettoken(&kq_token);
1384 	while ((kn = SLIST_FIRST(list)) != NULL) {
1385 		if (knote_acquire(kn))
1386 			knote_detach_and_drop(kn);
1387 	}
1388 	lwkt_reltoken(&kq_token);
1389 }
1390 
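/*
 * Move all knotes from the source kqinfo to the destination kqinfo,
 * switching them to the specified filterops and hook.
 */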
1391 void
1392 knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
1393 		    struct filterops *ops, void *hook)
1394 {
1395 	struct knote *kn;
1396 
1397 	lwkt_gettoken(&kq_token);
1398 	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
1399 		if (knote_acquire(kn)) {
1400 			knote_remove(&src->ki_note, kn);
1401 			kn->kn_fop = ops;
1402 			kn->kn_hook = hook;
1403 			knote_insert(&dst->ki_note, kn);
1404 			knote_release(kn);
1405 			/* kn may be invalid now */
1406 		}
1407 	}
1408 	lwkt_reltoken(&kq_token);
1409 }
1410 
1411 /*
1412  * Remove all knotes referencing a specified fd
1413  */
1414 void
1415 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
1416 {
1417 	struct knote *kn;
1418 
1419 	lwkt_gettoken(&kq_token);
1420 restart:
1421 	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
1422 		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
1423 			if (knote_acquire(kn))
1424 				knote_detach_and_drop(kn);
1425 			goto restart;
1426 		}
1427 	}
1428 	lwkt_reltoken(&kq_token);
1429 }
1430 
1431 /*
1432  * Low level attach function.
1433  *
1434  * The knote should already be marked for processing.
1435  */
1436 static void
1437 knote_attach(struct knote *kn)
1438 {
1439 	struct klist *list;
1440 	struct kqueue *kq = kn->kn_kq;
1441 
1442 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1443 		KKASSERT(kn->kn_fp);
1444 		list = &kn->kn_fp->f_klist;
1445 	} else {
1446 		if (kq->kq_knhashmask == 0)
1447 			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1448 						 &kq->kq_knhashmask);
1449 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1450 	}
1451 	SLIST_INSERT_HEAD(list, kn, kn_link);
1452 	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
1453 }
1454 
1455 /*
1456  * Low level drop function.
1457  *
1458  * The knote should already be marked for processing.
1459  */
1460 static void
1461 knote_drop(struct knote *kn)
1462 {
1463 	struct kqueue *kq;
1464 	struct klist *list;
1465 
1466 	kq = kn->kn_kq;
1467 
1468 	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1469 		list = &kn->kn_fp->f_klist;
1470 	else
1471 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1472 
1473 	SLIST_REMOVE(list, kn, knote, kn_link);
1474 	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
1475 	if (kn->kn_status & KN_QUEUED)
1476 		knote_dequeue(kn);
1477 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1478 		fdrop(kn->kn_fp);
1479 		kn->kn_fp = NULL;
1480 	}
1481 	knote_free(kn);
1482 }
1483 
1484 /*
1485  * Low level enqueue function.
1486  *
1487  * The knote should already be marked for processing.
1488  */
1489 static void
1490 knote_enqueue(struct knote *kn)
1491 {
1492 	struct kqueue *kq = kn->kn_kq;
1493 
1494 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1495 	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1496 	kn->kn_status |= KN_QUEUED;
1497 	++kq->kq_count;
1498 
1499 	/*
1500 	 * Send SIGIO on request (typically set up as a mailbox signal)
1501 	 */
1502 	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
1503 		pgsigio(kq->kq_sigio, SIGIO, 0);
1504 
1505 	kqueue_wakeup(kq);
1506 }
1507 
1508 /*
1509  * Low level dequeue function.
1510  *
1511  * The knote should already be marked for processing.
1512  */
1513 static void
1514 knote_dequeue(struct knote *kn)
1515 {
1516 	struct kqueue *kq = kn->kn_kq;
1517 
1518 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1519 	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1520 	kn->kn_status &= ~KN_QUEUED;
1521 	kq->kq_count--;
1522 }
1523 
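/*
 * Boot-time setup of the vm_zone used to allocate knote structures.
 */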
1524 static void
1525 knote_init(void)
1526 {
1527 	knote_zone = zinit("KNOTE", sizeof(struct knote), 0, 0, 1);
1528 }
1529 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
1530 
1531 static struct knote *
1532 knote_alloc(void)
1533 {
1534 	return ((struct knote *)zalloc(knote_zone));
1535 }
1536 
1537 static void
1538 knote_free(struct knote *kn)
1539 {
1540 	zfree(knote_zone, kn);
1541 }
1542