xref: /dflybsd-src/sys/kern/kern_event.c (revision dae741e33c840b92a8a53bf9f01157ede145e256)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
27  * $DragonFly: src/sys/kern/kern_event.c,v 1.33 2007/02/03 17:05:57 corecode Exp $
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/proc.h>
34 #include <sys/malloc.h>
35 #include <sys/unistd.h>
36 #include <sys/file.h>
37 #include <sys/lock.h>
38 #include <sys/fcntl.h>
39 #include <sys/queue.h>
40 #include <sys/event.h>
41 #include <sys/eventvar.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/stat.h>
46 #include <sys/sysctl.h>
47 #include <sys/sysproto.h>
48 #include <sys/thread.h>
49 #include <sys/uio.h>
50 #include <sys/signalvar.h>
51 #include <sys/filio.h>
52 #include <sys/ktr.h>
53 
54 #include <sys/thread2.h>
55 #include <sys/file2.h>
56 #include <sys/mplock2.h>
57 
58 /*
59  * Global token for kqueue subsystem
60  */
61 #if 0
62 struct lwkt_token kq_token = LWKT_TOKEN_INITIALIZER(kq_token);
63 SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions,
64     CTLFLAG_RW, &kq_token.t_collisions, 0,
65     "Collision counter of kq_token");
66 #endif
67 
68 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
69 
70 struct kevent_copyin_args {
71 	struct kevent_args	*ka;
72 	int			pchanges;
73 };
74 
75 static int	kqueue_sleep(struct kqueue *kq, struct timespec *tsp);
76 static int	kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
77 		    struct knote *marker);
78 static int 	kqueue_read(struct file *fp, struct uio *uio,
79 		    struct ucred *cred, int flags);
80 static int	kqueue_write(struct file *fp, struct uio *uio,
81 		    struct ucred *cred, int flags);
82 static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
83 		    struct ucred *cred, struct sysmsg *msg);
84 static int 	kqueue_kqfilter(struct file *fp, struct knote *kn);
85 static int 	kqueue_stat(struct file *fp, struct stat *st,
86 		    struct ucred *cred);
87 static int 	kqueue_close(struct file *fp);
88 static void	kqueue_wakeup(struct kqueue *kq);
89 static int	filter_attach(struct knote *kn);
90 static int	filter_event(struct knote *kn, long hint);
91 
92 /*
93  * MPSAFE
94  */
95 static struct fileops kqueueops = {
96 	.fo_read = kqueue_read,
97 	.fo_write = kqueue_write,
98 	.fo_ioctl = kqueue_ioctl,
99 	.fo_kqfilter = kqueue_kqfilter,
100 	.fo_stat = kqueue_stat,
101 	.fo_close = kqueue_close,
102 	.fo_shutdown = nofo_shutdown
103 };
104 
105 static void 	knote_attach(struct knote *kn);
106 static void 	knote_drop(struct knote *kn);
107 static void	knote_detach_and_drop(struct knote *kn);
108 static void 	knote_enqueue(struct knote *kn);
109 static void 	knote_dequeue(struct knote *kn);
110 static struct 	knote *knote_alloc(void);
111 static void 	knote_free(struct knote *kn);
112 
113 static void	filt_kqdetach(struct knote *kn);
114 static int	filt_kqueue(struct knote *kn, long hint);
115 static int	filt_procattach(struct knote *kn);
116 static void	filt_procdetach(struct knote *kn);
117 static int	filt_proc(struct knote *kn, long hint);
118 static int	filt_fileattach(struct knote *kn);
119 static void	filt_timerexpire(void *knx);
120 static int	filt_timerattach(struct knote *kn);
121 static void	filt_timerdetach(struct knote *kn);
122 static int	filt_timer(struct knote *kn, long hint);
123 
124 static struct filterops file_filtops =
125 	{ FILTEROP_ISFD, filt_fileattach, NULL, NULL };
126 static struct filterops kqread_filtops =
127 	{ FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue };
128 static struct filterops proc_filtops =
129 	{ 0, filt_procattach, filt_procdetach, filt_proc };
130 static struct filterops timer_filtops =
131 	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
132 
133 static int 		kq_ncallouts = 0;
134 static int 		kq_calloutmax = (4 * 1024);
135 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
136     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
137 static int		kq_checkloop = 1000000;
138 SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
139     &kq_checkloop, 0, "Maximum number of kevent collection loops before panic");
140 
141 #define KNOTE_ACTIVATE(kn) do { 					\
142 	kn->kn_status |= KN_ACTIVE;					\
143 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
144 		knote_enqueue(kn);					\
145 } while(0)
146 
147 #define	KN_HASHSIZE		64		/* XXX should be tunable */
148 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
149 
150 extern struct filterops aio_filtops;
151 extern struct filterops sig_filtops;
152 
153 /*
154  * Table for all system-defined filters.
155  */
156 static struct filterops *sysfilt_ops[] = {
157 	&file_filtops,			/* EVFILT_READ */
158 	&file_filtops,			/* EVFILT_WRITE */
159 	&aio_filtops,			/* EVFILT_AIO */
160 	&file_filtops,			/* EVFILT_VNODE */
161 	&proc_filtops,			/* EVFILT_PROC */
162 	&sig_filtops,			/* EVFILT_SIGNAL */
163 	&timer_filtops,			/* EVFILT_TIMER */
164 	&file_filtops,			/* EVFILT_EXCEPT */
165 };
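
/*
 * Illustration only: how kqueue_register() below maps a negative EVFILT_*
 * value onto this table.  System filters are negative, so the one's
 * complement yields a 0-based index; e.g. EVFILT_READ is -1 and ~(-1) == 0
 * selects file_filtops.  sysfilt_lookup() is hypothetical and not compiled.
 */
#if 0
static struct filterops *
sysfilt_lookup(short filter)
{
	if (filter >= 0 || filter + EVFILT_SYSCOUNT < 0)
		return (NULL);			/* not a system filter */
	return (sysfilt_ops[~filter]);		/* one's complement index */
}
#endif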
166 
167 static int
168 filt_fileattach(struct knote *kn)
169 {
170 	return (fo_kqfilter(kn->kn_fp, kn));
171 }
172 
173 /*
174  * MPSAFE
175  */
176 static int
177 kqueue_kqfilter(struct file *fp, struct knote *kn)
178 {
179 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
180 
181 	if (kn->kn_filter != EVFILT_READ)
182 		return (EOPNOTSUPP);
183 
184 	kn->kn_fop = &kqread_filtops;
185 	knote_insert(&kq->kq_kqinfo.ki_note, kn);
186 	return (0);
187 }
188 
189 static void
190 filt_kqdetach(struct knote *kn)
191 {
192 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
193 
194 	knote_remove(&kq->kq_kqinfo.ki_note, kn);
195 }
196 
197 /*ARGSUSED*/
198 static int
199 filt_kqueue(struct knote *kn, long hint)
200 {
201 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
202 
203 	kn->kn_data = kq->kq_count;
204 	return (kn->kn_data > 0);
205 }
206 
207 static int
208 filt_procattach(struct knote *kn)
209 {
210 	struct proc *p;
211 	int immediate;
212 
213 	immediate = 0;
214 	p = pfind(kn->kn_id);
215 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
216 		p = zpfind(kn->kn_id);
217 		immediate = 1;
218 	}
219 	if (p == NULL) {
220 		return (ESRCH);
221 	}
222 	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
223 		if (p)
224 			PRELE(p);
225 		return (EACCES);
226 	}
227 
228 	lwkt_gettoken(&p->p_token);
229 	kn->kn_ptr.p_proc = p;
230 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
231 
232 	/*
233 	 * internal flag indicating registration done by kernel
234 	 */
235 	if (kn->kn_flags & EV_FLAG1) {
236 		kn->kn_data = kn->kn_sdata;		/* ppid */
237 		kn->kn_fflags = NOTE_CHILD;
238 		kn->kn_flags &= ~EV_FLAG1;
239 	}
240 
241 	knote_insert(&p->p_klist, kn);
242 
243 	/*
244 	 * Immediately activate any exit notes if the target process is a
245 	 * zombie.  This is necessary to handle the case where the target
246 	 * process, e.g. a child, dies before the kevent is registered.
247 	 */
248 	if (immediate && filt_proc(kn, NOTE_EXIT))
249 		KNOTE_ACTIVATE(kn);
250 	lwkt_reltoken(&p->p_token);
251 	PRELE(p);
252 
253 	return (0);
254 }
255 
256 /*
257  * The knote may be attached to a different process, which may exit,
258  * leaving nothing for the knote to be attached to.  So when the process
259  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
260  * it will be deleted when read out.  However, as part of the knote deletion,
261  * this routine is called, so a check is needed to avoid actually performing
262  * a detach, because the original process does not exist any more.
263  */
264 static void
265 filt_procdetach(struct knote *kn)
266 {
267 	struct proc *p;
268 
269 	if (kn->kn_status & KN_DETACHED)
270 		return;
271 	/* XXX locking? take proc_token here? */
272 	p = kn->kn_ptr.p_proc;
273 	knote_remove(&p->p_klist, kn);
274 }
275 
276 static int
277 filt_proc(struct knote *kn, long hint)
278 {
279 	u_int event;
280 
281 	/*
282 	 * mask off extra data
283 	 */
284 	event = (u_int)hint & NOTE_PCTRLMASK;
285 
286 	/*
287 	 * if the user is interested in this event, record it.
288 	 */
289 	if (kn->kn_sfflags & event)
290 		kn->kn_fflags |= event;
291 
292 	/*
293 	 * Process is gone, so flag the event as finished.  Detach the
294 	 * knote from the process now because the process itself will be
295 	 * gone later on.
296 	 */
297 	if (event == NOTE_EXIT) {
298 		struct proc *p = kn->kn_ptr.p_proc;
299 		if ((kn->kn_status & KN_DETACHED) == 0) {
300 			knote_remove(&p->p_klist, kn);
301 			kn->kn_status |= KN_DETACHED;
302 			kn->kn_data = p->p_xstat;
303 			kn->kn_ptr.p_proc = NULL;
304 		}
305 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
306 		return (1);
307 	}
308 
309 	/*
310 	 * process forked, and user wants to track the new process,
311 	 * so attach a new knote to it, and immediately report an
312 	 * event with the parent's pid.
313 	 */
314 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
315 		struct kevent kev;
316 		int error;
317 
318 		/*
319 		 * register knote with new process.
320 		 */
321 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
322 		kev.filter = kn->kn_filter;
323 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
324 		kev.fflags = kn->kn_sfflags;
325 		kev.data = kn->kn_id;			/* parent */
326 		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
327 		error = kqueue_register(kn->kn_kq, &kev);
328 		if (error)
329 			kn->kn_fflags |= NOTE_TRACKERR;
330 	}
331 
332 	return (kn->kn_fflags != 0);
333 }
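
/*
 * Example (userland sketch, not part of this file): registering an
 * EVFILT_PROC filter for NOTE_EXIT against a child process, which the
 * zombie handling in filt_procattach() above supports even if the child
 * has already exited.  wait_for_exit() and child_pid are illustrative
 * names only.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

static int
wait_for_exit(pid_t child_pid)
{
	struct kevent kev, ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, child_pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
	       NOTE_EXIT, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	/* Blocks until the child exits; ev.data carries p_xstat. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent wait");
	close(kq);
	return ((int)ev.data);
}
#endif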
334 
335 /*
336  * The callout interlocks with callout_terminate() but can still
337  * race a deletion so if KN_DELETING is set we just don't touch
338  * the knote.
339  */
340 static void
341 filt_timerexpire(void *knx)
342 {
343 	struct lwkt_token *tok;
344 	struct knote *kn = knx;
345 	struct callout *calloutp;
346 	struct timeval tv;
347 	int tticks;
348 
349 	tok = lwkt_token_pool_lookup(kn->kn_kq);
350 	lwkt_gettoken(tok);
351 	if ((kn->kn_status & KN_DELETING) == 0) {
352 		kn->kn_data++;
353 		KNOTE_ACTIVATE(kn);
354 
355 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
356 			tv.tv_sec = kn->kn_sdata / 1000;
357 			tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
358 			tticks = tvtohz_high(&tv);
359 			calloutp = (struct callout *)kn->kn_hook;
360 			callout_reset(calloutp, tticks, filt_timerexpire, kn);
361 		}
362 	}
363 	lwkt_reltoken(tok);
364 }
365 
366 /*
367  * data contains the timer interval (period), in milliseconds
368  */
369 static int
370 filt_timerattach(struct knote *kn)
371 {
372 	struct callout *calloutp;
373 	struct timeval tv;
374 	int tticks;
375 
376 	if (kq_ncallouts >= kq_calloutmax) {
377 		kn->kn_hook = NULL;
378 		return (ENOMEM);
379 	}
380 	kq_ncallouts++;
381 
382 	tv.tv_sec = kn->kn_sdata / 1000;
383 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
384 	tticks = tvtohz_high(&tv);
385 
386 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
387 	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
388 	    M_KQUEUE, M_WAITOK);
389 	callout_init(calloutp);
390 	kn->kn_hook = (caddr_t)calloutp;
391 	callout_reset(calloutp, tticks, filt_timerexpire, kn);
392 
393 	return (0);
394 }
395 
396 /*
397  * This function is called with the knote flagged as locked, but it is
398  * still possible to race a callout event because the callback may block.
399  * We must call callout_terminate() instead of callout_stop() to deal
400  * with the race.
401  */
402 static void
403 filt_timerdetach(struct knote *kn)
404 {
405 	struct callout *calloutp;
406 
407 	calloutp = (struct callout *)kn->kn_hook;
408 	callout_terminate(calloutp);
409 	FREE(calloutp, M_KQUEUE);
410 	kq_ncallouts--;
411 }
412 
413 static int
414 filt_timer(struct knote *kn, long hint)
415 {
416 
417 	return (kn->kn_data != 0);
418 }
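
/*
 * Example (userland sketch, not part of this file): a periodic
 * EVFILT_TIMER whose period is given in milliseconds, matching the
 * tvtohz_high() conversion in filt_timerattach()/filt_timerexpire().
 * Because the filter forces EV_CLEAR, ev.data reports the number of
 * expirations since the previous kevent() call.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev, ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* ident 1, fires every 500 milliseconds */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent wait");
		printf("timer fired %ld time(s)\n", (long)ev.data);
	}
}
#endif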
419 
420 /*
421  * Acquire a knote, return non-zero on success, 0 on failure.
422  *
423  * If we cannot acquire the knote we sleep and return 0.  The knote
424  * may be stale on return in this case and the caller must restart
425  * whatever loop they are in.
426  *
427  * Related kq token must be held.
428  */
429 static __inline
430 int
431 knote_acquire(struct knote *kn)
432 {
433 	if (kn->kn_status & KN_PROCESSING) {
434 		kn->kn_status |= KN_WAITING | KN_REPROCESS;
435 		tsleep(kn, 0, "kqepts", hz);
436 		/* knote may be stale now */
437 		return(0);
438 	}
439 	kn->kn_status |= KN_PROCESSING;
440 	return(1);
441 }
442 
443 /*
444  * Release an acquired knote, clearing KN_PROCESSING and handling any
445  * KN_REPROCESS events.
446  *
447  * Caller must be holding the related kq token
448  *
449  * Non-zero is returned if the knote is destroyed.
450  */
451 static __inline
452 int
453 knote_release(struct knote *kn)
454 {
455 	while (kn->kn_status & KN_REPROCESS) {
456 		kn->kn_status &= ~KN_REPROCESS;
457 		if (kn->kn_status & KN_WAITING) {
458 			kn->kn_status &= ~KN_WAITING;
459 			wakeup(kn);
460 		}
461 		if (kn->kn_status & KN_DELETING) {
462 			knote_detach_and_drop(kn);
463 			return(1);
464 			/* NOT REACHED */
465 		}
466 		if (filter_event(kn, 0))
467 			KNOTE_ACTIVATE(kn);
468 	}
469 	kn->kn_status &= ~KN_PROCESSING;
470 	return(0);
471 }
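
/*
 * Illustration only: the acquire/process/release pattern expected of
 * callers of the two functions above (kqueue_terminate() and
 * knote_fdclose() below are real users).  example_scan() is a
 * hypothetical helper; the caller is assumed to hold the kq token.
 */
#if 0
static void
example_scan(struct kqueue *kq)
{
	struct knote *kn;

restart:
	TAILQ_FOREACH(kn, &kq->kq_knlist, kn_kqlink) {
		if (knote_acquire(kn) == 0)
			goto restart;	/* slept, kn may be stale */
		/* KN_PROCESSING is set; safe to inspect or modify kn here */
		if (knote_release(kn))
			goto restart;	/* kn was detached and destroyed */
	}
}
#endif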
472 
473 /*
474  * Initialize a kqueue.
475  *
476  * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
477  *
478  * MPSAFE
479  */
480 void
481 kqueue_init(struct kqueue *kq, struct filedesc *fdp)
482 {
483 	TAILQ_INIT(&kq->kq_knpend);
484 	TAILQ_INIT(&kq->kq_knlist);
485 	kq->kq_count = 0;
486 	kq->kq_fdp = fdp;
487 	SLIST_INIT(&kq->kq_kqinfo.ki_note);
488 }
489 
490 /*
491  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
492  * caller (it might be embedded in a lwp so we don't do it here).
493  *
494  * The kq's knlist must be completely eradicated so block on any
495  * processing races.
496  */
497 void
498 kqueue_terminate(struct kqueue *kq)
499 {
500 	struct lwkt_token *tok;
501 	struct knote *kn;
502 
503 	tok = lwkt_token_pool_lookup(kq);
504 	lwkt_gettoken(tok);
505 	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
506 		if (knote_acquire(kn))
507 			knote_detach_and_drop(kn);
508 	}
509 	if (kq->kq_knhash) {
510 		kfree(kq->kq_knhash, M_KQUEUE);
511 		kq->kq_knhash = NULL;
512 		kq->kq_knhashmask = 0;
513 	}
514 	lwkt_reltoken(tok);
515 }
516 
517 /*
518  * MPSAFE
519  */
520 int
521 sys_kqueue(struct kqueue_args *uap)
522 {
523 	struct thread *td = curthread;
524 	struct kqueue *kq;
525 	struct file *fp;
526 	int fd, error;
527 
528 	error = falloc(td->td_lwp, &fp, &fd);
529 	if (error)
530 		return (error);
531 	fp->f_flag = FREAD | FWRITE;
532 	fp->f_type = DTYPE_KQUEUE;
533 	fp->f_ops = &kqueueops;
534 
535 	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
536 	kqueue_init(kq, td->td_proc->p_fd);
537 	fp->f_data = kq;
538 
539 	fsetfd(kq->kq_fdp, fp, fd);
540 	uap->sysmsg_result = fd;
541 	fdrop(fp);
542 	return (error);
543 }
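
/*
 * Example (userland sketch, not part of this file): the descriptor
 * returned by the syscall above is used with kevent(2).  Here we wait
 * for a descriptor to become readable; wait_readable() is an
 * illustrative name only.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

static void
wait_readable(int fd)
{
	struct kevent kev, ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)	/* block */
		err(1, "kevent wait");
	/* ev.data reports the number of bytes available to read */
	close(kq);
}
#endif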
544 
545 /*
546  * Copy 'count' items into the destination list pointed to by uap->eventlist.
547  */
548 static int
549 kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
550 {
551 	struct kevent_copyin_args *kap;
552 	int error;
553 
554 	kap = (struct kevent_copyin_args *)arg;
555 
556 	error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
557 	if (error == 0) {
558 		kap->ka->eventlist += count;
559 		*res += count;
560 	} else {
561 		*res = -1;
562 	}
563 
564 	return (error);
565 }
566 
567 /*
568  * Copy at most 'max' items from the list pointed to by kap->changelist,
569  * return number of items in 'events'.
570  */
571 static int
572 kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
573 {
574 	struct kevent_copyin_args *kap;
575 	int error, count;
576 
577 	kap = (struct kevent_copyin_args *)arg;
578 
579 	count = min(kap->ka->nchanges - kap->pchanges, max);
580 	error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
581 	if (error == 0) {
582 		kap->ka->changelist += count;
583 		kap->pchanges += count;
584 		*events = count;
585 	}
586 
587 	return (error);
588 }
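
/*
 * Hedged sketch (hypothetical, not in this file): kern_kevent() takes
 * copyin/copyout callbacks so that callers other than sys_kevent() can
 * source and sink events in kernel memory rather than user memory.  The
 * struct and both functions below are invented for illustration and
 * follow the k_copyin_fn/k_copyout_fn signatures used by
 * kevent_copyin()/kevent_copyout() above.
 */
#if 0
struct kkev_args {
	struct kevent	*changes;	/* kernel-resident changelist */
	int		nchanges;
	int		pos;
	struct kevent	*results;	/* kernel-resident eventlist */
};

static int
kkev_copyin(void *arg, struct kevent *kevp, int max, int *events)
{
	struct kkev_args *ka = arg;
	int n = min(ka->nchanges - ka->pos, max);

	bcopy(ka->changes + ka->pos, kevp, n * sizeof(*kevp));
	ka->pos += n;
	*events = n;
	return (0);
}

static int
kkev_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct kkev_args *ka = arg;

	bcopy(kevp, ka->results + *res, count * sizeof(*kevp));
	*res += count;
	return (0);
}
#endif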
589 
590 /*
591  * MPSAFE
592  */
593 int
594 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
595 	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
596 	    struct timespec *tsp_in)
597 {
598 	struct kevent *kevp;
599 	struct timespec *tsp;
600 	int i, n, total, error, nerrors = 0;
601 	int lres;
602 	int limit = kq_checkloop;
603 	struct kevent kev[KQ_NEVENTS];
604 	struct knote marker;
605 	struct lwkt_token *tok;
606 
607 	tsp = tsp_in;
608 	*res = 0;
609 
610 	tok = lwkt_token_pool_lookup(kq);
611 	lwkt_gettoken(tok);
612 	for ( ;; ) {
613 		n = 0;
614 		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
615 		if (error)
616 			goto done;
617 		if (n == 0)
618 			break;
619 		for (i = 0; i < n; i++) {
620 			kevp = &kev[i];
621 			kevp->flags &= ~EV_SYSFLAGS;
622 			error = kqueue_register(kq, kevp);
623 
624 			/*
625 			 * If a registration returns an error we
626 			 * immediately post the error.  The kevent()
627 			 * call itself will fail with the error if
628 			 * no space is available for posting.
629 			 *
630 			 * Such errors normally bypass the timeout/blocking
631 			 * code.  However, if the copyoutfn function refuses
632 			 * to post the error (see sys_poll()), then we
633 			 * ignore it too.
634 			 */
635 			if (error) {
636 				kevp->flags = EV_ERROR;
637 				kevp->data = error;
638 				lres = *res;
639 				kevent_copyoutfn(uap, kevp, 1, res);
640 				if (*res < 0) {
641 					goto done;
642 				} else if (lres != *res) {
643 					nevents--;
644 					nerrors++;
645 				}
646 			}
647 		}
648 	}
649 	if (nerrors) {
650 		error = 0;
651 		goto done;
652 	}
653 
654 	/*
655 	 * Acquire/wait for events - setup timeout
656 	 */
657 	if (tsp != NULL) {
658 		struct timespec ats;
659 
660 		if (tsp->tv_sec || tsp->tv_nsec) {
661 			nanouptime(&ats);
662 			timespecadd(tsp, &ats);		/* tsp = target time */
663 		}
664 	}
665 
666 	/*
667 	 * Loop as required.
668 	 *
669 	 * Collect as many events as we can. Sleeping on successive
670 	 * loops is disabled if copyoutfn has incremented (*res).
671 	 *
672 	 * The loop stops if an error occurs, all events have been
673 	 * scanned (the marker has been reached), or fewer than the
674 	 * maximum number of events is found.
675 	 *
676 	 * The copyoutfn function does not have to increment (*res) in
677 	 * order for the loop to continue.
678 	 *
679 	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
680 	 */
681 	total = 0;
682 	error = 0;
683 	marker.kn_filter = EVFILT_MARKER;
684 	marker.kn_status = KN_PROCESSING;
685 	TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
686 	while ((n = nevents - total) > 0) {
687 		if (n > KQ_NEVENTS)
688 			n = KQ_NEVENTS;
689 
690 		/*
691 		 * If no events are pending sleep until timeout (if any)
692 		 * or an event occurs.
693 		 *
694 		 * After the sleep completes the marker is moved to the
695 		 * end of the list, making any received events available
696 		 * to our scan.
697 		 */
698 		if (kq->kq_count == 0 && *res == 0) {
699 			error = kqueue_sleep(kq, tsp);
700 			if (error)
701 				break;
702 
703 			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
704 			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
705 		}
706 
707 		/*
708 		 * Process all received events
709 		 * Account for all non-spurious events in our total
710 		 */
711 		i = kqueue_scan(kq, kev, n, &marker);
712 		if (i) {
713 			lres = *res;
714 			error = kevent_copyoutfn(uap, kev, i, res);
715 			total += *res - lres;
716 			if (error)
717 				break;
718 		}
719 		if (limit && --limit == 0)
720 			panic("kqueue: checkloop failed i=%d", i);
721 
722 		/*
723 		 * Normally when fewer events are returned than requested
724 		 * we can stop.  However, if only spurious events were
725 		 * collected the copyout will not bump (*res) and we have
726 		 * to continue.
727 		 */
728 		if (i < n && *res)
729 			break;
730 
731 		/*
732 		 * Deal with an edge case where spurious events can cause
733 		 * a loop to occur without moving the marker.  This can
734 		 * prevent kqueue_scan() from picking up new events which
735 		 * race us.  We must be sure to move the marker for this
736 		 * case.
737 		 *
738 		 * NOTE: We do not want to move the marker if events
739 		 *	 were scanned because normal kqueue operations
740 		 *	 may reactivate events.  Moving the marker in
741 		 *	 that case could result in duplicates for the
742 		 *	 same event.
743 		 */
744 		if (i == 0) {
745 			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
746 			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
747 		}
748 	}
749 	TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
750 
751 	/* Timeouts do not return EWOULDBLOCK. */
752 	if (error == EWOULDBLOCK)
753 		error = 0;
754 
755 done:
756 	lwkt_reltoken(tok);
757 	return (error);
758 }
759 
760 /*
761  * MPALMOSTSAFE
762  */
763 int
764 sys_kevent(struct kevent_args *uap)
765 {
766 	struct thread *td = curthread;
767 	struct proc *p = td->td_proc;
768 	struct timespec ts, *tsp;
769 	struct kqueue *kq;
770 	struct file *fp = NULL;
771 	struct kevent_copyin_args *kap, ka;
772 	int error;
773 
774 	if (uap->timeout) {
775 		error = copyin(uap->timeout, &ts, sizeof(ts));
776 		if (error)
777 			return (error);
778 		tsp = &ts;
779 	} else {
780 		tsp = NULL;
781 	}
782 
783 	fp = holdfp(p->p_fd, uap->fd, -1);
784 	if (fp == NULL)
785 		return (EBADF);
786 	if (fp->f_type != DTYPE_KQUEUE) {
787 		fdrop(fp);
788 		return (EBADF);
789 	}
790 
791 	kq = (struct kqueue *)fp->f_data;
792 
793 	kap = &ka;
794 	kap->ka = uap;
795 	kap->pchanges = 0;
796 
797 	error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
798 			    kevent_copyin, kevent_copyout, tsp);
799 
800 	fdrop(fp);
801 
802 	return (error);
803 }
804 
805 /*
806  * Caller must be holding the kq token
807  */
808 int
809 kqueue_register(struct kqueue *kq, struct kevent *kev)
810 {
811 	struct lwkt_token *tok;
812 	struct filedesc *fdp = kq->kq_fdp;
813 	struct filterops *fops;
814 	struct file *fp = NULL;
815 	struct knote *kn = NULL;
816 	int error = 0;
817 
818 	if (kev->filter < 0) {
819 		if (kev->filter + EVFILT_SYSCOUNT < 0)
820 			return (EINVAL);
821 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
822 	} else {
823 		/*
824 		 * XXX
825 		 * filter attach routine is responsible for ensuring that
826 		 * the identifier can be attached to it.
827 		 */
828 		kprintf("unknown filter: %d\n", kev->filter);
829 		return (EINVAL);
830 	}
831 
832 	tok = lwkt_token_pool_lookup(kq);
833 	lwkt_gettoken(tok);
834 	if (fops->f_flags & FILTEROP_ISFD) {
835 		/* validate descriptor */
836 		fp = holdfp(fdp, kev->ident, -1);
837 		if (fp == NULL) {
838 			lwkt_reltoken(tok);
839 			return (EBADF);
840 		}
841 		lwkt_getpooltoken(&fp->f_klist);
842 again1:
843 		SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
844 			if (kn->kn_kq == kq &&
845 			    kn->kn_filter == kev->filter &&
846 			    kn->kn_id == kev->ident) {
847 				if (knote_acquire(kn) == 0)
848 					goto again1;
849 				break;
850 			}
851 		}
852 		lwkt_relpooltoken(&fp->f_klist);
853 	} else {
854 		if (kq->kq_knhashmask) {
855 			struct klist *list;
856 
857 			list = &kq->kq_knhash[
858 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
859 			lwkt_getpooltoken(list);
860 again2:
861 			SLIST_FOREACH(kn, list, kn_link) {
862 				if (kn->kn_id == kev->ident &&
863 				    kn->kn_filter == kev->filter) {
864 					if (knote_acquire(kn) == 0)
865 						goto again2;
866 					break;
867 				}
868 			}
869 			lwkt_relpooltoken(list);
870 		}
871 	}
872 
873 	/*
874 	 * NOTE: At this point if kn is non-NULL we will have acquired
875 	 *	 it and set KN_PROCESSING.
876 	 */
877 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
878 		error = ENOENT;
879 		goto done;
880 	}
881 
882 	/*
883 	 * kn now contains the matching knote, or NULL if no match
884 	 */
885 	if (kev->flags & EV_ADD) {
886 		if (kn == NULL) {
887 			kn = knote_alloc();
888 			if (kn == NULL) {
889 				error = ENOMEM;
890 				goto done;
891 			}
892 			kn->kn_fp = fp;
893 			kn->kn_kq = kq;
894 			kn->kn_fop = fops;
895 
896 			/*
897 			 * apply reference count to knote structure, and
898 			 * do not release it at the end of this routine.
899 			 */
900 			fp = NULL;
901 
902 			kn->kn_sfflags = kev->fflags;
903 			kn->kn_sdata = kev->data;
904 			kev->fflags = 0;
905 			kev->data = 0;
906 			kn->kn_kevent = *kev;
907 
908 			/*
909 			 * KN_PROCESSING prevents the knote from getting
910 			 * ripped out from under us while we are trying
911 			 * to attach it, in case the attach blocks.
912 			 */
913 			kn->kn_status = KN_PROCESSING;
914 			knote_attach(kn);
915 			if ((error = filter_attach(kn)) != 0) {
916 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
917 				knote_drop(kn);
918 				goto done;
919 			}
920 
921 			/*
922 			 * Interlock against close races which either tried
923 			 * to remove our knote while we were blocked or missed
924 			 * it entirely prior to our attachment.  We do not
925 			 * want to end up with a knote on a closed descriptor.
926 			 */
927 			if ((fops->f_flags & FILTEROP_ISFD) &&
928 			    checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
929 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
930 			}
931 		} else {
932 			/*
933 			 * The user may change some filter values after the
934 			 * initial EV_ADD, but doing so will not reset any
935 			 * filters which have already been triggered.
936 			 */
937 			KKASSERT(kn->kn_status & KN_PROCESSING);
938 			kn->kn_sfflags = kev->fflags;
939 			kn->kn_sdata = kev->data;
940 			kn->kn_kevent.udata = kev->udata;
941 		}
942 
943 		/*
944 		 * Execute the filter event to immediately activate the
945 		 * knote if necessary.  If reprocessing events are pending
946 		 * due to blocking above we do not run the filter here
947 		 * but instead let knote_release() do it.  Otherwise we
948 		 * might run the filter on a deleted event.
949 		 */
950 		if ((kn->kn_status & KN_REPROCESS) == 0) {
951 			if (filter_event(kn, 0))
952 				KNOTE_ACTIVATE(kn);
953 		}
954 	} else if (kev->flags & EV_DELETE) {
955 		/*
956 		 * Delete the existing knote
957 		 */
958 		knote_detach_and_drop(kn);
959 		goto done;
960 	}
961 
962 	/*
963 	 * Disablement does not deactivate a knote here.
964 	 */
965 	if ((kev->flags & EV_DISABLE) &&
966 	    ((kn->kn_status & KN_DISABLED) == 0)) {
967 		kn->kn_status |= KN_DISABLED;
968 	}
969 
970 	/*
971 	 * Re-enablement may have to immediately enqueue an active knote.
972 	 */
973 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
974 		kn->kn_status &= ~KN_DISABLED;
975 		if ((kn->kn_status & KN_ACTIVE) &&
976 		    ((kn->kn_status & KN_QUEUED) == 0)) {
977 			knote_enqueue(kn);
978 		}
979 	}
980 
981 	/*
982 	 * Handle any required reprocessing
983 	 */
984 	knote_release(kn);
985 	/* kn may be invalid now */
986 
987 done:
988 	lwkt_reltoken(tok);
989 	if (fp != NULL)
990 		fdrop(fp);
991 	return (error);
992 }
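
/*
 * Example (userland sketch, not part of this file): the EV_DISABLE,
 * EV_ENABLE and EV_DELETE paths handled at the end of kqueue_register().
 * Disabling keeps the knote and its active state, re-enabling may
 * immediately re-queue it, and deleting detaches and drops it.  kq and
 * fd are assumed to already exist.
 */
#if 0
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_DISABLE, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* stop reporting, keep knote */

	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* pending activity reported again */

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* remove the knote entirely */
#endif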
993 
994 /*
995  * Block as necessary until the target time is reached.
996  * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
997  * 0 we do not block at all.
998  *
999  * Caller must be holding the kq token.
1000  */
1001 static int
1002 kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
1003 {
1004 	int error = 0;
1005 
1006 	if (tsp == NULL) {
1007 		kq->kq_state |= KQ_SLEEP;
1008 		error = tsleep(kq, PCATCH, "kqread", 0);
1009 	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
1010 		error = EWOULDBLOCK;
1011 	} else {
1012 		struct timespec ats;
1013 		struct timespec atx = *tsp;
1014 		int timeout;
1015 
1016 		nanouptime(&ats);
1017 		timespecsub(&atx, &ats);
1018 		if (atx.tv_sec < 0) {		/* target time already passed */
1019 			error = EWOULDBLOCK;
1020 		} else {
1021 			timeout = atx.tv_sec > 24 * 60 * 60 ?
1022 				24 * 60 * 60 * hz : tstohz_high(&atx);
1023 			kq->kq_state |= KQ_SLEEP;
1024 			error = tsleep(kq, PCATCH, "kqread", timeout);
1025 		}
1026 	}
1027 
1028 	/* don't restart after signals... */
1029 	if (error == ERESTART)
1030 		return (EINTR);
1031 
1032 	return (error);
1033 }
1034 
1035 /*
1036  * Scan the kqueue, return the number of active events placed in kevp up
1037  * to count.
1038  *
1039  * Continuous mode events may get recycled, do not continue scanning past
1040  * marker unless no events have been collected.
1041  *
1042  * Caller must be holding the kq token
1043  */
1044 static int
1045 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
1046             struct knote *marker)
1047 {
1048 	struct knote *kn, local_marker;
1049 	int total;
1050 
1051 	total = 0;
1052 	local_marker.kn_filter = EVFILT_MARKER;
1053 	local_marker.kn_status = KN_PROCESSING;
1054 
1055 	/*
1056 	 * Collect events.
1057 	 */
1058 	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
1059 	while (count) {
1060 		kn = TAILQ_NEXT(&local_marker, kn_tqe);
1061 		if (kn->kn_filter == EVFILT_MARKER) {
1062 			/* Marker reached, we are done */
1063 			if (kn == marker)
1064 				break;
1065 
1066 			/* Move local marker past some other thread's marker */
1067 			kn = TAILQ_NEXT(kn, kn_tqe);
1068 			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1069 			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
1070 			continue;
1071 		}
1072 
1073 		/*
1074 		 * We can't skip a knote undergoing processing, otherwise
1075 		 * we risk not returning it when the user process expects
1076 		 * it should be returned.  Sleep and retry.
1077 		 */
1078 		if (knote_acquire(kn) == 0)
1079 			continue;
1080 
1081 		/*
1082 		 * Remove the event for processing.
1083 		 *
1084 		 * WARNING!  We must leave KN_QUEUED set to prevent the
1085 		 *	     event from being KNOTE_ACTIVATE()d while
1086 		 *	     the queue state is in limbo, in case we
1087 		 *	     block.
1088 		 *
1089 		 * WARNING!  We must set KN_PROCESSING to avoid races
1090 		 *	     against deletion or another thread's
1091 		 *	     processing.
1092 		 */
1093 		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1094 		kq->kq_count--;
1095 
1096 		/*
1097 		 * We have to deal with an extremely important race against
1098 		 * file descriptor close()s here.  The file descriptor can
1099 		 * disappear MPSAFE, and there is a small window of
1100 		 * opportunity between that and the call to knote_fdclose().
1101 		 *
1102 		 * If we hit that window here while doselect or dopoll is
1103 		 * trying to delete a spurious event they will not be able
1104 		 * to match up the event against a knote and will go haywire.
1105 		 */
1106 		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1107 		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
1108 			kn->kn_status |= KN_DELETING | KN_REPROCESS;
1109 		}
1110 
1111 		if (kn->kn_status & KN_DISABLED) {
1112 			/*
1113 			 * If disabled we ensure the event is not queued
1114 			 * but leave its active bit set.  On re-enablement
1115 			 * the event may be immediately triggered.
1116 			 */
1117 			kn->kn_status &= ~KN_QUEUED;
1118 		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
1119 			   (kn->kn_status & KN_DELETING) == 0 &&
1120 			   filter_event(kn, 0) == 0) {
1121 			/*
1122 			 * If not running in one-shot mode and the event
1123 			 * is no longer present we ensure it is removed
1124 			 * from the queue and ignore it.
1125 			 */
1126 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1127 		} else {
1128 			/*
1129 			 * Post the event
1130 			 */
1131 			*kevp++ = kn->kn_kevent;
1132 			++total;
1133 			--count;
1134 
1135 			if (kn->kn_flags & EV_ONESHOT) {
1136 				kn->kn_status &= ~KN_QUEUED;
1137 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
1138 			} else if (kn->kn_flags & EV_CLEAR) {
1139 				kn->kn_data = 0;
1140 				kn->kn_fflags = 0;
1141 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1142 			} else {
1143 				TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1144 				kq->kq_count++;
1145 			}
1146 		}
1147 
1148 		/*
1149 		 * Handle any post-processing states
1150 		 */
1151 		knote_release(kn);
1152 	}
1153 	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1154 
1155 	return (total);
1156 }
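
/*
 * Example (userland sketch, not part of this file): the EV_ONESHOT and
 * EV_CLEAR handling above as seen from user code.  With EV_CLEAR the
 * knote's data/fflags are reset and it is not re-queued, giving
 * edge-triggered behaviour; without it the knote is re-queued and keeps
 * reporting while it stays active.  kq and fd are assumed to exist.
 */
#if 0
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);
#endif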
1157 
1158 /*
1159  * XXX
1160  * This could be expanded to call kqueue_scan, if desired.
1161  *
1162  * MPSAFE
1163  */
1164 static int
1165 kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1166 {
1167 	return (ENXIO);
1168 }
1169 
1170 /*
1171  * MPSAFE
1172  */
1173 static int
1174 kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1175 {
1176 	return (ENXIO);
1177 }
1178 
1179 /*
1180  * MPALMOSTSAFE
1181  */
1182 static int
1183 kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
1184 	     struct ucred *cred, struct sysmsg *msg)
1185 {
1186 	struct lwkt_token *tok;
1187 	struct kqueue *kq;
1188 	int error;
1189 
1190 	kq = (struct kqueue *)fp->f_data;
1191 	tok = lwkt_token_pool_lookup(kq);
1192 	lwkt_gettoken(tok);
1193 
1194 	switch(com) {
1195 	case FIOASYNC:
1196 		if (*(int *)data)
1197 			kq->kq_state |= KQ_ASYNC;
1198 		else
1199 			kq->kq_state &= ~KQ_ASYNC;
1200 		error = 0;
1201 		break;
1202 	case FIOSETOWN:
1203 		error = fsetown(*(int *)data, &kq->kq_sigio);
1204 		break;
1205 	default:
1206 		error = ENOTTY;
1207 		break;
1208 	}
1209 	lwkt_reltoken(tok);
1210 	return (error);
1211 }
1212 
1213 /*
1214  * MPSAFE
1215  */
1216 static int
1217 kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
1218 {
1219 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1220 
1221 	bzero((void *)st, sizeof(*st));
1222 	st->st_size = kq->kq_count;
1223 	st->st_blksize = sizeof(struct kevent);
1224 	st->st_mode = S_IFIFO;
1225 	return (0);
1226 }
1227 
1228 /*
1229  * MPSAFE
1230  */
1231 static int
1232 kqueue_close(struct file *fp)
1233 {
1234 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1235 
1236 	kqueue_terminate(kq);
1237 
1238 	fp->f_data = NULL;
1239 	funsetown(&kq->kq_sigio);
1240 
1241 	kfree(kq, M_KQUEUE);
1242 	return (0);
1243 }
1244 
1245 static void
1246 kqueue_wakeup(struct kqueue *kq)
1247 {
1248 	if (kq->kq_state & KQ_SLEEP) {
1249 		kq->kq_state &= ~KQ_SLEEP;
1250 		wakeup(kq);
1251 	}
1252 	KNOTE(&kq->kq_kqinfo.ki_note, 0);
1253 }
1254 
1255 /*
1256  * Calls filterops f_attach function, acquiring mplock if filter is not
1257  * marked as FILTEROP_MPSAFE.
1258  *
1259  * Caller must be holding the related kq token
1260  */
1261 static int
1262 filter_attach(struct knote *kn)
1263 {
1264 	int ret;
1265 
1266 	if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
1267 		get_mplock();
1268 		ret = kn->kn_fop->f_attach(kn);
1269 		rel_mplock();
1270 	} else {
1271 		ret = kn->kn_fop->f_attach(kn);
1272 	}
1273 
1274 	return (ret);
1275 }
1276 
1277 /*
1278  * Detach the knote and drop it, destroying the knote.
1279  *
1280  * Calls filterops f_detach function, acquiring mplock if filter is not
1281  * marked as FILTEROP_MPSAFE.
1282  *
1283  * Caller must be holding the related kq token
1284  */
1285 static void
1286 knote_detach_and_drop(struct knote *kn)
1287 {
1288 	kn->kn_status |= KN_DELETING | KN_REPROCESS;
1289 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1290 		kn->kn_fop->f_detach(kn);
1291 	} else {
1292 		get_mplock();
1293 		kn->kn_fop->f_detach(kn);
1294 		rel_mplock();
1295 	}
1296 	knote_drop(kn);
1297 }
1298 
1299 /*
1300  * Calls filterops f_event function, acquiring mplock if filter is not
1301  * marked as FILTEROP_MPSAFE.
1302  *
1303  * If the knote is in the middle of being created or deleted we cannot
1304  * safely call the filter op.
1305  *
1306  * Caller must be holding the related kq token
1307  */
1308 static int
1309 filter_event(struct knote *kn, long hint)
1310 {
1311 	int ret;
1312 
1313 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1314 		ret = kn->kn_fop->f_event(kn, hint);
1315 	} else {
1316 		get_mplock();
1317 		ret = kn->kn_fop->f_event(kn, hint);
1318 		rel_mplock();
1319 	}
1320 	return (ret);
1321 }
1322 
1323 /*
1324  * Walk down a list of knotes, activating them if their event has triggered.
1325  *
1326  * If we encounter any knotes which are undergoing processing we just mark
1327  * them for reprocessing and do not try to [re]activate the knote.  However,
1328  * if a hint is being passed we have to wait and that makes things a bit
1329  * sticky.
1330  */
1331 void
1332 knote(struct klist *list, long hint)
1333 {
1334 	struct kqueue *kq;
1335 	struct knote *kn;
1336 	struct knote *kntmp;
1337 
1338 	lwkt_getpooltoken(list);
1339 restart:
1340 	SLIST_FOREACH(kn, list, kn_next) {
1341 		kq = kn->kn_kq;
1342 		lwkt_getpooltoken(kq);
1343 
1344 		/* temporary verification hack */
1345 		SLIST_FOREACH(kntmp, list, kn_next) {
1346 			if (kn == kntmp)
1347 				break;
1348 		}
1349 		if (kn != kntmp || kn->kn_kq != kq) {
1350 			lwkt_relpooltoken(kq);
1351 			goto restart;
1352 		}
1353 
1354 		if (kn->kn_status & KN_PROCESSING) {
1355 			/*
1356 			 * Someone else is processing the knote, ask the
1357 			 * other thread to reprocess it and don't mess
1358 			 * with it otherwise.
1359 			 */
1360 			if (hint == 0) {
1361 				kn->kn_status |= KN_REPROCESS;
1362 				lwkt_relpooltoken(kq);
1363 				continue;
1364 			}
1365 
1366 			/*
1367 			 * If the hint is non-zero we have to wait or risk
1368 			 * losing the state the caller is trying to update.
1369 			 *
1370 			 * XXX This is a real problem, certain process
1371 			 *     and signal filters will bump kn_data for
1372 			 *     already-processed notes more than once if
1373 			 *     we restart the list scan.  FIXME.
1374 			 */
1375 			kn->kn_status |= KN_WAITING | KN_REPROCESS;
1376 			tsleep(kn, 0, "knotec", hz);
1377 			lwkt_relpooltoken(kq);
1378 			goto restart;
1379 		}
1380 
1381 		/*
1382 		 * Become the reprocessing master ourselves.
1383 		 *
1384 		 * If hint is non-zero, running the filter is mandatory
1385 		 * when not deleting, so do it whether reprocessing is
1386 		 * set or not.
1387 		 */
1388 		kn->kn_status |= KN_PROCESSING;
1389 		if ((kn->kn_status & KN_DELETING) == 0) {
1390 			if (filter_event(kn, hint))
1391 				KNOTE_ACTIVATE(kn);
1392 		}
1393 		if (knote_release(kn)) {
1394 			lwkt_relpooltoken(kq);
1395 			goto restart;
1396 		}
1397 		lwkt_relpooltoken(kq);
1398 	}
1399 	lwkt_relpooltoken(list);
1400 }
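
/*
 * Illustration only (hypothetical "mydev" driver): subsystems that own a
 * klist call knote() -- usually via the KNOTE() macro -- from their
 * state-change paths to activate attached knotes.  mydev_softc and its
 * fields are invented for this sketch.
 */
#if 0
static void
mydev_rxintr(struct mydev_softc *sc)
{
	/* data arrived; activate any knotes attached to the device */
	KNOTE(&sc->sc_kqinfo.ki_note, 0);
}
#endif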
1401 
1402 /*
1403  * Insert knote at head of klist.
1404  *
1405  * This function may only be called via a filter function and thus
1406  * kq_token should already be held and marked for processing.
1407  */
1408 void
1409 knote_insert(struct klist *klist, struct knote *kn)
1410 {
1411 	lwkt_getpooltoken(klist);
1412 	KKASSERT(kn->kn_status & KN_PROCESSING);
1413 	SLIST_INSERT_HEAD(klist, kn, kn_next);
1414 	lwkt_relpooltoken(klist);
1415 }
1416 
1417 /*
1418  * Remove knote from a klist
1419  *
1420  * This function may only be called via a filter function and thus
1421  * kq_token should already be held and marked for processing.
1422  */
1423 void
1424 knote_remove(struct klist *klist, struct knote *kn)
1425 {
1426 	lwkt_getpooltoken(klist);
1427 	KKASSERT(kn->kn_status & KN_PROCESSING);
1428 	SLIST_REMOVE(klist, kn, knote, kn_next);
1429 	lwkt_relpooltoken(klist);
1430 }
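
/*
 * Illustration only (hypothetical "mydev" driver): the usual shape of a
 * filterops implementation built on knote_insert()/knote_remove(),
 * mirroring kqueue_kqfilter()/kqread_filtops above.  The fo_kqfilter
 * hook links the knote onto the object's klist, f_detach unlinks it and
 * f_event reports readiness.  mydev_softc and sc_bytes_avail are
 * invented for this sketch.
 */
#if 0
static void
filt_mydevdetach(struct knote *kn)
{
	struct mydev_softc *sc = (struct mydev_softc *)kn->kn_hook;

	knote_remove(&sc->sc_kqinfo.ki_note, kn);
}

static int
filt_mydevread(struct knote *kn, long hint)
{
	struct mydev_softc *sc = (struct mydev_softc *)kn->kn_hook;

	kn->kn_data = sc->sc_bytes_avail;
	return (kn->kn_data > 0);
}

static struct filterops mydevread_filtops =
	{ FILTEROP_ISFD, NULL, filt_mydevdetach, filt_mydevread };

static int
mydev_kqfilter(struct file *fp, struct knote *kn)
{
	struct mydev_softc *sc = (struct mydev_softc *)fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EOPNOTSUPP);
	kn->kn_fop = &mydevread_filtops;
	kn->kn_hook = (caddr_t)sc;
	knote_insert(&sc->sc_kqinfo.ki_note, kn);
	return (0);
}
#endif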
1431 
1432 #if 0
1433 /*
1434  * Remove all knotes from a specified klist
1435  *
1436  * Only called from aio.
1437  */
1438 void
1439 knote_empty(struct klist *list)
1440 {
1441 	struct knote *kn;
1442 
1443 	lwkt_gettoken(&kq_token);
1444 	while ((kn = SLIST_FIRST(list)) != NULL) {
1445 		if (knote_acquire(kn))
1446 			knote_detach_and_drop(kn);
1447 	}
1448 	lwkt_reltoken(&kq_token);
1449 }
1450 #endif
1451 
1452 void
1453 knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
1454 		    struct filterops *ops, void *hook)
1455 {
1456 	struct kqueue *kq;
1457 	struct knote *kn;
1458 
1459 	lwkt_getpooltoken(&src->ki_note);
1460 	lwkt_getpooltoken(&dst->ki_note);
1461 	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
1462 		kq = kn->kn_kq;
1463 		lwkt_getpooltoken(kq);
1464 		if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
1465 			lwkt_relpooltoken(kq);
1466 			continue;
1467 		}
1468 		if (knote_acquire(kn)) {
1469 			knote_remove(&src->ki_note, kn);
1470 			kn->kn_fop = ops;
1471 			kn->kn_hook = hook;
1472 			knote_insert(&dst->ki_note, kn);
1473 			knote_release(kn);
1474 			/* kn may be invalid now */
1475 		}
1476 		lwkt_relpooltoken(kq);
1477 	}
1478 	lwkt_relpooltoken(&dst->ki_note);
1479 	lwkt_relpooltoken(&src->ki_note);
1480 }
1481 
1482 /*
1483  * Remove all knotes referencing a specified fd
1484  */
1485 void
1486 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
1487 {
1488 	struct kqueue *kq;
1489 	struct knote *kn;
1490 	struct knote *kntmp;
1491 
1492 	lwkt_getpooltoken(&fp->f_klist);
1493 restart:
1494 	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
1495 		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
1496 			kq = kn->kn_kq;
1497 			lwkt_getpooltoken(kq);
1498 
1499 			/* temporary verification hack */
1500 			SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
1501 				if (kn == kntmp)
1502 					break;
1503 			}
1504 			if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
1505 			    kn->kn_id != fd || kn->kn_kq != kq) {
1506 				lwkt_relpooltoken(kq);
1507 				goto restart;
1508 			}
1509 			if (knote_acquire(kn))
1510 				knote_detach_and_drop(kn);
1511 			lwkt_relpooltoken(kq);
1512 			goto restart;
1513 		}
1514 	}
1515 	lwkt_relpooltoken(&fp->f_klist);
1516 }
1517 
1518 /*
1519  * Low level attach function.
1520  *
1521  * The knote should already be marked for processing.
1522  * Caller must hold the related kq token.
1523  */
1524 static void
1525 knote_attach(struct knote *kn)
1526 {
1527 	struct klist *list;
1528 	struct kqueue *kq = kn->kn_kq;
1529 
1530 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1531 		KKASSERT(kn->kn_fp);
1532 		list = &kn->kn_fp->f_klist;
1533 	} else {
1534 		if (kq->kq_knhashmask == 0)
1535 			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1536 						 &kq->kq_knhashmask);
1537 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1538 	}
1539 	lwkt_getpooltoken(list);
1540 	SLIST_INSERT_HEAD(list, kn, kn_link);
1541 	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
1542 	lwkt_relpooltoken(list);
1543 }
1544 
1545 /*
1546  * Low level drop function.
1547  *
1548  * The knote should already be marked for processing.
1549  * Caller must hold the related kq token.
1550  */
1551 static void
1552 knote_drop(struct knote *kn)
1553 {
1554 	struct kqueue *kq;
1555 	struct klist *list;
1556 
1557 	kq = kn->kn_kq;
1558 
1559 	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1560 		list = &kn->kn_fp->f_klist;
1561 	else
1562 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1563 
1564 	lwkt_getpooltoken(list);
1565 	SLIST_REMOVE(list, kn, knote, kn_link);
1566 	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
1567 	if (kn->kn_status & KN_QUEUED)
1568 		knote_dequeue(kn);
1569 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1570 		fdrop(kn->kn_fp);
1571 		kn->kn_fp = NULL;
1572 	}
1573 	knote_free(kn);
1574 	lwkt_relpooltoken(list);
1575 }
1576 
1577 /*
1578  * Low level enqueue function.
1579  *
1580  * The knote should already be marked for processing.
1581  * Caller must be holding the kq token
1582  */
1583 static void
1584 knote_enqueue(struct knote *kn)
1585 {
1586 	struct kqueue *kq = kn->kn_kq;
1587 
1588 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1589 	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1590 	kn->kn_status |= KN_QUEUED;
1591 	++kq->kq_count;
1592 
1593 	/*
1594 	 * Send SIGIO on request (typically set up as a mailbox signal)
1595 	 */
1596 	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
1597 		pgsigio(kq->kq_sigio, SIGIO, 0);
1598 
1599 	kqueue_wakeup(kq);
1600 }
1601 
1602 /*
1603  * Low level dequeue function.
1604  *
1605  * The knote should already be marked for processing.
1606  * Caller must be holding the kq token
1607  */
1608 static void
1609 knote_dequeue(struct knote *kn)
1610 {
1611 	struct kqueue *kq = kn->kn_kq;
1612 
1613 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1614 	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1615 	kn->kn_status &= ~KN_QUEUED;
1616 	kq->kq_count--;
1617 }
1618 
1619 static struct knote *
1620 knote_alloc(void)
1621 {
1622 	return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
1623 }
1624 
1625 static void
1626 knote_free(struct knote *kn)
1627 {
1628 	kfree(kn, M_KQUEUE);
1629 }
1630