/*	$NetBSD: kern_event.c,v 1.118 2021/05/02 19:13:43 jdolecek Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * Copyright (c) 2009 Apple, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.118 2021/05/02 19:13:43 jdolecek Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>

static int	kqueue_scan(file_t *, size_t, struct kevent *,
		    const struct timespec *, register_t *,
		    const struct kevent_ops *, struct kevent *,
		    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static void	kqueue_restart(file_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);

static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fs(struct knote *kn, long hint);
static int	filt_userattach(struct knote *);
static void	filt_userdetach(struct knote *);
static int	filt_user(struct knote *, long hint);
static void	filt_usertouch(struct knote *, struct kevent *, long type);

static const struct fileops kqueueops = {
	.fo_name = "kqueue",
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = kqueue_restart,
};

static const struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};

static const struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};

static const struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
	.f_detach = NULL,
	.f_event = NULL,
};

static const struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};

static const struct filterops fs_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fs,
};

static const struct filterops user_filtops = {
	.f_isfd = 0,
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64	/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;

#define	KQ_FLUX_WAKEUP(kq)	cv_broadcast(&kq->kq_cv)

/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0, },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ "EVFILT_FS",		EVFILT_FS,	0, &fs_filtops, 0 },
	{ "EVFILT_USER",	EVFILT_USER,	0, &user_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};

/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

/*
 * Global Locks.
 *
 * Lock order:
 *
 *	kqueue_filter_lock
 *	-> kn_kq->kq_fdp->fd_lock
 *	-> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
 *	-> kn_kq->kq_lock
 *
 * Locking rules:
 *
 *	f_attach: fdp->fd_lock, KERNEL_LOCK
 *	f_detach: fdp->fd_lock, KERNEL_LOCK
 *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
 *	f_event via knote: whatever caller guarantees
 *		Typically,	f_event(NOTE_SUBMIT) via knote: object lock
 *				f_event(!NOTE_SUBMIT) via knote: nothing,
 *					acquires/releases object lock inside.
 */
static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
static kmutex_t		kqueue_misc_lock;	/* miscellaneous */

static kauth_listener_t	kqueue_listener;

static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	if (action != KAUTH_PROCESS_KEVENT_FILTER)
		return result;

	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
	    ISSET(p->p_flag, PK_SUGID)))
		return result;

	result = KAUTH_RESULT_ALLOW;

	return result;
}

/*
 * Initialize the kqueue subsystem.
 */
void
kqueue_init(void)
{

	rw_init(&kqueue_filter_lock);
	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);

	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    kqueue_listener_cb, NULL);
}

/*
 * Find kfilter entry by name, or NULL if not found.
 */
static struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return &sys_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	/* user filter slots have a NULL name if previously deregistered */
	for (i = 0; i < user_kfilterc ; i++) {
		if (user_kfilters[i].name != NULL &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return &user_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname(const char *name)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return kfilter;

	return kfilter_byname_user(name);
}

/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}

/*
 * Register a new kfilter. Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot.  Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
reuse:
	kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);

	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}
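
/*
 * Illustrative sketch (hypothetical, not part of this file) of how a
 * module might register a custom filter via kfilter_register() above,
 * and later drop it with kfilter_unregister() below.  All names here
 * are invented for the example:
 *
 *	static const struct filterops example_filtops = {
 *		.f_isfd = 0,
 *		.f_attach = example_attach,
 *		.f_detach = example_detach,
 *		.f_event = example_event,
 *	};
 *
 *	int filter;
 *	int error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
 *	    &filter);
 *	...
 *	error = kfilter_unregister("EVFILT_EXAMPLE");
 */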

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}
	rw_exit(&kqueue_filter_lock);

	return (0);
}


/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors. Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	mutex_spin_enter(&kq->kq_lock);
	selremove_knote(&kq->kq_sel, kn);
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = KQ_COUNT(kq);
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	struct lwp *curl;

	curl = curlwp;

	mutex_enter(&proc_lock);
	if (kn->kn_flags & EV_FLAG1) {
		/*
		 * NOTE_TRACK attaches to the child process too early
		 * for proc_find, so do a raw look up and check the state
		 * explicitly.
		 */
		p = proc_find_raw(kn->kn_id);
		if (p != NULL && p->p_stat != SIDL)
			p = NULL;
	} else {
		p = proc_find(kn->kn_id);
	}

	if (p == NULL) {
		mutex_exit(&proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	mutex_enter(p->p_lock);
	mutex_exit(&proc_lock);
	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
	    p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
	mutex_exit(p->p_lock);

	return 0;
}

/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out. However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_obj;

	mutex_enter(p->p_lock);
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
	mutex_exit(p->p_lock);
}

/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event, fflag;
	struct kevent kev;
	struct kqueue *kq;
	int error;

	event = (u_int)hint & NOTE_PCTRLMASK;
	kq = kn->kn_kq;
	fflag = 0;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		fflag |= event;

	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_obj;

		if (p != NULL)
			kn->kn_data = P_WAITSTATUS(p);
		/*
		 * Process is gone, so flag the event as finished.
		 *
		 * Detach the knote from watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since psignal KNOTE() is called
		 * also for zombies and we might end up reading freed
		 * memory if the kevent would already be picked up
		 * and knote g/c'ed.
		 */
		filt_procdetach(kn);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= fflag;
		mutex_spin_exit(&kq->kq_lock);

		return 1;
	}

	mutex_spin_enter(&kq->kq_lock);
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		/*
		 * Process forked, and user wants to track the new process,
		 * so attach a new knote to it, and immediately report an
		 * event with the parent's pid. Register knote with new
		 * process.
		 */
		memset(&kev, 0, sizeof(kev));
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		mutex_spin_exit(&kq->kq_lock);
		error = kqueue_register(kq, &kev);
		mutex_spin_enter(&kq->kq_lock);
		if (error != 0)
			kn->kn_fflags |= NOTE_TRACKERR;
	}
	kn->kn_fflags |= fflag;
	fflag = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflag != 0;
}

static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	int tticks;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_data++;
	knote_activate(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tticks = mstohz(kn->kn_sdata);
		if (tticks <= 0)
			tticks = 1;
		callout_schedule((callout_t *)kn->kn_hook, tticks);
	}
	mutex_exit(&kqueue_misc_lock);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	int tticks;

	tticks = mstohz(kn->kn_sdata);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kn->kn_sdata == 0)
			return EINVAL;
		tticks = 1;
	}

	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	kn->kn_hook = calloutp;
	mutex_spin_exit(&kq->kq_lock);

	callout_reset(calloutp, tticks, filt_timerexpire, kn);

	return (0);
}

static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	/* prevent rescheduling when we expire */
	kn->kn_flags |= EV_ONESHOT;
	mutex_spin_exit(&kq->kq_lock);

	calloutp = (callout_t *)kn->kn_hook;
	callout_halt(calloutp, NULL);
	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}

static int
filt_timer(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	rv = (kn->kn_data != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}

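/*
 * Illustrative userland use of EVFILT_TIMER (a hypothetical example,
 * not part of this file): arm a periodic 500 ms timer on a kqueue kq,
 * where 'data' carries the period in milliseconds as described above.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 */
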
/*
 * Filter event method for EVFILT_FS.
 */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	kn->kn_flags |= EV_CLEAR;
	SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
	mutex_exit(&kqueue_misc_lock);

	return 0;
}

static void
filt_fsdetach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
	mutex_exit(&kqueue_misc_lock);
}

static int
filt_fs(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_fflags |= hint;
	rv = (kn->kn_fflags != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}

static int
filt_userattach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	/*
	 * EVFILT_USER knotes are not attached to anything in the kernel.
	 */
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_hook = NULL;
	if (kn->kn_fflags & NOTE_TRIGGER)
		kn->kn_hookid = 1;
	else
		kn->kn_hookid = 0;
	mutex_spin_exit(&kq->kq_lock);
	return (0);
}

static void
filt_userdetach(struct knote *kn)
{

	/*
	 * EVFILT_USER knotes are not attached to anything in the kernel.
	 */
}

static int
filt_user(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_kq;
	int hookid;

	mutex_spin_enter(&kq->kq_lock);
	hookid = kn->kn_hookid;
	mutex_spin_exit(&kq->kq_lock);

	return hookid;
}

static void
filt_usertouch(struct knote *kn, struct kevent *kev, long type)
{
	int ffctrl;

	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));

	switch (type) {
	case EVENT_REGISTER:
		if (kev->fflags & NOTE_TRIGGER)
			kn->kn_hookid = 1;

		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;
		switch (ffctrl) {
		case NOTE_FFNOP:
			break;

		case NOTE_FFAND:
			kn->kn_sfflags &= kev->fflags;
			break;

		case NOTE_FFOR:
			kn->kn_sfflags |= kev->fflags;
			break;

		case NOTE_FFCOPY:
			kn->kn_sfflags = kev->fflags;
			break;

		default:
			/* XXX Return error? */
			break;
		}
		kn->kn_sdata = kev->data;
		if (kev->flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	default:
		panic("filt_usertouch() - invalid type (%ld)", type);
		break;
	}
}

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

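/*
 * Illustrative userland use of EVFILT_USER (a hypothetical example, not
 * part of this file): register a user event on kqueue kq, then fire it
 * with NOTE_TRIGGER so that filt_usertouch()/filt_user() above report it.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 *	...
 *	EV_SET(&ev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 */
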
/*
 * This provides full kqfilter entry for device switch tables, which
 * has same effect as filter using filt_seltrue() as filter method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_seltruedetach,
	.f_event = filt_seltrue,
};

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}

/*
 * kqueue(2) system call.
 */
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_kqueue = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
	fd_affix(curproc, fp, fd);
	return error;
}

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	return kqueue1(l, 0, retval);
}

int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	return kqueue1(l, SCARG(uap, flags), retval);
}

/*
 * kevent(2) system call.
 */
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

int
kevent_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};

int
sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}

int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec	ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[KQ_NEVENTS];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = fp->f_kqueue;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (!error && !(kevp->flags & EV_RECEIPT))
				continue;
			if (nevents == 0)
				goto done;
			kevp->flags = EV_ERROR;
			kevp->data = error;
			error = (*keops->keo_put_events)
			    (keops->keo_private, kevp,
			    eventlist, nerrors, 1);
			if (error)
				goto done;
			nevents--;
			nerrors++;
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
done:
	fd_putfile(fd);
	return (error);
}

/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		kmem_free(newkn, sizeof(*newkn));
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		/* validate descriptor */
		if (kev->ident > INT_MAX
		    || (fp = fd_getfile(fd = kev->ident)) == NULL) {
			rw_exit(&kqueue_filter_lock);
			kmem_free(newkn, sizeof(*newkn));
			return EBADF;
		}
		mutex_enter(&fdp->fd_lock);
		ff = fdp->fd_dt->dt_ff[fd];
		if (ff->ff_refcnt & FR_CLOSING) {
			error = EBADF;
			goto doneunlock;
		}
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		mutex_enter(&fdp->fd_lock);
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kn == NULL) {
		if (kev->flags & EV_ADD) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_id = kev->ident;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			KASSERT(kn->kn_fop != NULL);
			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!kn->kn_fop->f_isfd) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			KERNEL_LOCK(1, NULL);		/* XXXSMP */
			error = (*kfilter->filtops->f_attach)(kn);
			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
			if (error != 0) {
#ifdef DEBUG
				struct proc *p = curlwp->l_proc;
				const file_t *ft = kn->kn_obj;
				printf("%s: %s[%d]: event type %d not "
				    "supported for file type %d/%s "
				    "(error %d)\n", __func__,
				    p->p_comm, p->p_pid,
				    kn->kn_filter, ft ? ft->f_type : -1,
				    ft ? ft->f_ops->fo_name : "?", error);
#endif

				/* knote_detach() drops fdp->fd_lock */
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
			goto done_ev_add;
		} else {
			/* No matching knote and the EV_ADD flag is not set. */
			error = ENOENT;
			goto doneunlock;
		}
	}

	if (kev->flags & EV_DELETE) {
		/* knote_detach() drops fdp->fd_lock */
		knote_detach(kn, fdp, true);
		goto done;
	}

	/*
	 * The user may change some filter values after the
	 * initial EV_ADD, but doing so will not reset any
	 * filters that have already been triggered.
	 */
	kn->kn_kevent.udata = kev->udata;
	KASSERT(kn->kn_fop != NULL);
	if (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL) {
		mutex_spin_enter(&kq->kq_lock);
		(*kn->kn_fop->f_touch)(kn, kev, EVENT_REGISTER);
		mutex_spin_exit(&kq->kq_lock);
	} else {
		kn->kn_sfflags = kev->fflags;
		kn->kn_sdata = kev->data;
	}

	/*
	 * We can get here if we are trying to attach
	 * an event to a file descriptor that does not
	 * support events, and the attach routine is
	 * broken and does not return an error.
	 */
done_ev_add:
	KASSERT(kn->kn_fop != NULL);
	KASSERT(kn->kn_fop->f_event != NULL);
	KERNEL_LOCK(1, NULL);			/* XXXSMP */
	rv = (*kn->kn_fop->f_event)(kn, 0);
	KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
	if (rv)
		knote_activate(kn);

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
doneunlock:
	mutex_exit(&fdp->fd_lock);
done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		kmem_free(newkn, sizeof(*newkn));
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}

#if defined(DEBUG)
#define	KN_FMT(buf, kn) \
    (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)

static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
	const struct knote *kn;
	u_int count;
	int nmarker;
	char buf[128];

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(KQ_COUNT(kq) < UINT_MAX / 2);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
				    func, line, kq, kn, kn->kn_kq,
				    KN_FMT(buf, kn));
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
				    func, line, kq, kn, KN_FMT(buf, kn));
			}
			count++;
			if (count > KQ_COUNT(kq)) {
				panic("%s,%zu: kq=%p kq->kq_count(%d) != "
				    "count(%d), nmarker=%d",
				    func, line, kq, KQ_COUNT(kq), count,
				    nmarker);
			}
		} else {
			nmarker++;
		}
	}
}
#define	kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */

static void
kqueue_restart(file_t *fp)
{
	struct kqueue *kq = fp->f_kqueue;
	KASSERT(kq != NULL);

	mutex_spin_enter(&kq->kq_lock);
	kq->kq_count |= KQ_RESTART;
	cv_broadcast(&kq->kq_cv);
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results into ulistp. Timeout is determined by tsp; if
 * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
 * as appropriate.
 */
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
    const struct timespec *tsp, register_t *retval,
    const struct kevent_ops *keops, struct kevent *kevbuf,
    size_t kevcnt)
{
	struct kqueue	*kq;
	struct kevent	*kevp;
	struct timespec	ats, sleepts;
	struct knote	*kn, *marker, morker;
	size_t		count, nkev, nevents;
	int		timeout, error, touch, rv, influx;
	filedesc_t	*fdp;

	fdp = curlwp->l_fd;
	kq = fp->f_kqueue;
	count = maxevents;
	nkev = nevents = error = 0;
	if (count == 0) {
		*retval = 0;
		return 0;
	}

	if (tsp) {				/* timeout supplied */
		ats = *tsp;
		if (inittimeleft(&ats, &sleepts) == -1) {
			*retval = maxevents;
			return EINVAL;
		}
		timeout = tstohz(&ats);
		if (timeout <= 0)
			timeout = -1;		/* do poll */
	} else {
		/* no timeout, wait forever */
		timeout = 0;
	}

	memset(&morker, 0, sizeof(morker));
	marker = &morker;
	marker->kn_status = KN_MARKER;
	mutex_spin_enter(&kq->kq_lock);
retry:
	kevp = kevbuf;
	if (KQ_COUNT(kq) == 0) {
		if (timeout >= 0) {
			error = cv_timedwait_sig(&kq->kq_cv,
			    &kq->kq_lock, timeout);
			if (error == 0) {
				if (KQ_COUNT(kq) == 0 &&
				    (kq->kq_count & KQ_RESTART)) {
					/* return to clear file reference */
					error = ERESTART;
				} else if (tsp == NULL || (timeout =
				    gettimeleft(&ats, &sleepts)) > 0) {
					goto retry;
				}
			} else {
				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_spin_exit(&kq->kq_lock);
		goto done;
	}

	/* mark end of knote list */
	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	influx = 0;

	/*
	 * Acquire the fdp->fd_lock interlock to avoid races with
	 * file creation/destruction from other threads.
	 */
relock:
	mutex_spin_exit(&kq->kq_lock);
	mutex_enter(&fdp->fd_lock);
	mutex_spin_enter(&kq->kq_lock);

	while (count != 0) {
		kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */

		if ((kn->kn_status & KN_MARKER) != 0 && kn != marker) {
			if (influx) {
				influx = 0;
				KQ_FLUX_WAKEUP(kq);
			}
			mutex_exit(&fdp->fd_lock);
			(void)cv_wait(&kq->kq_cv, &kq->kq_lock);
			goto relock;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if (kn == marker) {
			/* it's our marker, stop */
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents) {
				mutex_exit(&fdp->fd_lock);
				goto retry;
			}
			break;
		}
		KASSERT((kn->kn_status & KN_BUSY) == 0);

		kq_check(kq);
		kn->kn_status &= ~KN_QUEUED;
		kn->kn_status |= KN_BUSY;
		kq_check(kq);
		if (kn->kn_status & KN_DISABLED) {
			kn->kn_status &= ~KN_BUSY;
			kq->kq_count--;
			/* don't want disabled events */
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			mutex_spin_exit(&kq->kq_lock);
			KASSERT(kn->kn_fop != NULL);
			KASSERT(kn->kn_fop->f_event != NULL);
			KERNEL_LOCK(1, NULL);		/* XXXSMP */
			KASSERT(mutex_owned(&fdp->fd_lock));
			rv = (*kn->kn_fop->f_event)(kn, 0);
			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
			mutex_spin_enter(&kq->kq_lock);
			/* Re-poll if note was re-enqueued. */
			if ((kn->kn_status & KN_QUEUED) != 0) {
				kn->kn_status &= ~KN_BUSY;
				/* Re-enqueue raised kq_count, lower it again */
				kq->kq_count--;
				influx = 1;
				continue;
			}
			if (rv == 0) {
				/*
				 * non-ONESHOT event that hasn't
				 * triggered again, so de-queue.
				 */
				kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
				kq->kq_count--;
				influx = 1;
				continue;
			}
		}
		KASSERT(kn->kn_fop != NULL);
		touch = (!kn->kn_fop->f_isfd &&
		    kn->kn_fop->f_touch != NULL);
		/* XXXAD should be got from f_event if !oneshot. */
		if (touch) {
			(*kn->kn_fop->f_touch)(kn, kevp, EVENT_PROCESS);
		} else {
			*kevp = kn->kn_kevent;
		}
		kevp++;
		nkev++;
		influx = 1;
		if (kn->kn_flags & EV_ONESHOT) {
			/* delete ONESHOT events after retrieval */
			kn->kn_status &= ~KN_BUSY;
			kq->kq_count--;
			mutex_spin_exit(&kq->kq_lock);
			knote_detach(kn, fdp, true);
			mutex_enter(&fdp->fd_lock);
			mutex_spin_enter(&kq->kq_lock);
		} else if (kn->kn_flags & EV_CLEAR) {
			/* clear state after retrieval */
			kn->kn_data = 0;
			kn->kn_fflags = 0;
			/*
			 * Manually clear knotes who weren't
			 * 'touch'ed.
			 */
			if (touch == 0) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
			}
			kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
			kq->kq_count--;
		} else if (kn->kn_flags & EV_DISPATCH) {
			kn->kn_status |= KN_DISABLED;
			kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
			kq->kq_count--;
		} else {
			/* add event back on list */
			kq_check(kq);
			kn->kn_status |= KN_QUEUED;
			kn->kn_status &= ~KN_BUSY;
			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			kq_check(kq);
		}

		if (nkev == kevcnt) {
			/* do copyouts in kevcnt chunks */
			influx = 0;
			KQ_FLUX_WAKEUP(kq);
			mutex_spin_exit(&kq->kq_lock);
			mutex_exit(&fdp->fd_lock);
			error = (*keops->keo_put_events)
			    (keops->keo_private,
			    kevbuf, ulistp, nevents, nkev);
			mutex_enter(&fdp->fd_lock);
			mutex_spin_enter(&kq->kq_lock);
			nevents += nkev;
			nkev = 0;
			kevp = kevbuf;
		}
		count--;
		if (error != 0 || count == 0) {
			/* remove marker */
			TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
			break;
		}
	}
	KQ_FLUX_WAKEUP(kq);
	mutex_spin_exit(&kq->kq_lock);
	mutex_exit(&fdp->fd_lock);

done:
	if (nkev != 0) {
		/* copyout remaining events */
		error = (*keops->keo_put_events)(keops->keo_private,
		    kevbuf, ulistp, nevents, nkev);
	}
	*retval = maxevents - count;

	return error;
}

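/*
 * Illustrative userland use of the kfilter_mapping ioctls handled below
 * (a hypothetical example, not part of this file): look up the filter id
 * for a filter name on a kqueue descriptor kq.
 *
 *	struct kfilter_mapping km;
 *	char name[] = "EVFILT_READ";
 *	km.name = name;
 *	ioctl(kq, KFILTER_BYNAME, &km);		// km.filter now holds the id
 */
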
/*
 * fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYNAME		find filter id for the given name; len is
 *				ignored.
 *	KFILTER_BYFILTER	find name for the given filter id, and return
 *				the result in name, which is of size len.
 */
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
	struct kfilter_mapping	*km;
	const struct kfilter	*kfilter;
	char			*name;
	int			error;

	km = data;
	error = 0;
	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL) {
			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
			rw_exit(&kqueue_filter_lock);
			error = copyoutstr(name, km->name, km->len, NULL);
		} else {
			rw_exit(&kqueue_filter_lock);
			error = ENOENT;
		}
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error) {
			break;
		}
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		rw_exit(&kqueue_filter_lock);
		break;

	default:
		error = ENOTTY;
		break;

	}
	kmem_free(name, KFILTER_MAXNAME);
	return (error);
}

/*
 * fileops fcntl method for a kqueue descriptor.
 */
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{

	return (ENOTTY);
}

/*
 * fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(file_t *fp, int events)
{
	struct kqueue	*kq;
	int		revents;

	kq = fp->f_kqueue;

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		mutex_spin_enter(&kq->kq_lock);
		if (KQ_COUNT(kq) != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &kq->kq_sel);
		}
		kq_check(kq);
		mutex_spin_exit(&kq->kq_lock);
	}

	return revents;
}

/*
 * fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(file_t *fp, struct stat *st)
{
	struct kqueue *kq;

	kq = fp->f_kqueue;

	memset(st, 0, sizeof(*st));
	st->st_size = KQ_COUNT(kq);
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;

	return 0;
}

static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
	struct knote *kn;
	filedesc_t *fdp;

	fdp = kq->kq_fdp;

	KASSERT(mutex_owned(&fdp->fd_lock));

	for (kn = SLIST_FIRST(list); kn != NULL;) {
		if (kq != kn->kn_kq) {
			kn = SLIST_NEXT(kn, kn_link);
			continue;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
		kn = SLIST_FIRST(list);
	}
}


/*
 * fileops close method for a kqueue descriptor.
 */
static int
kqueue_close(file_t *fp)
{
	struct kqueue *kq;
	filedesc_t *fdp;
	fdfile_t *ff;
	int i;

	kq = fp->f_kqueue;
	fp->f_kqueue = NULL;
	fp->f_type = 0;
	fdp = curlwp->l_fd;

	mutex_enter(&fdp->fd_lock);
	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
			continue;
		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
		}
	}
	mutex_exit(&fdp->fd_lock);

	KASSERT(KQ_COUNT(kq) == 0);
	mutex_destroy(&kq->kq_lock);
	cv_destroy(&kq->kq_cv);
	seldestroy(&kq->kq_sel);
	kmem_free(kq, sizeof(*kq));

	return (0);
}

/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	KASSERT(fp == kn->kn_obj);

	if (kn->kn_filter != EVFILT_READ)
		return 1;

	kn->kn_fop = &kqread_filtops;
	mutex_enter(&kq->kq_lock);
	selrecord_knote(&kq->kq_sel, kn);
	mutex_exit(&kq->kq_lock);

	return 0;
}


/*
 * Walk down a list of knotes, activating them if their event has
 * triggered. The caller's object lock (e.g. device driver lock)
 * must be held.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *tmpkn;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
		KASSERT(kn->kn_fop != NULL);
		KASSERT(kn->kn_fop->f_event != NULL);
		if ((*kn->kn_fop->f_event)(kn, hint))
			knote_activate(kn);
	}
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(int fd)
{
	struct klist *list;
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	mutex_enter(&fdp->fd_lock);
	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
	while ((kn = SLIST_FIRST(list)) != NULL) {
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
	}
	mutex_exit(&fdp->fd_lock);
}

/*
 * Drop knote.  Called with fdp->fd_lock held, and will drop before
 * returning.
 */
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
	struct klist *list;
	struct kqueue *kq;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);
	KASSERT(mutex_owned(&fdp->fd_lock));

	KASSERT(kn->kn_fop != NULL);
	/* Remove from monitored object. */
	if (dofop) {
		KASSERT(kn->kn_fop->f_detach != NULL);
		KERNEL_LOCK(1, NULL);		/* XXXSMP */
		(*kn->kn_fop->f_detach)(kn);
		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	}

	/* Remove from descriptor table. */
	if (kn->kn_fop->f_isfd)
		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);

	/* Remove from kqueue. */
again:
	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_QUEUED) != 0) {
		kq_check(kq);
		kq->kq_count--;
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq_check(kq);
	} else if (kn->kn_status & KN_BUSY) {
		mutex_spin_exit(&kq->kq_lock);
		goto again;
	}
	mutex_spin_exit(&kq->kq_lock);

	mutex_exit(&fdp->fd_lock);
	if (kn->kn_fop->f_isfd)
		fd_putfile(kn->kn_id);
	atomic_dec_uint(&kn->kn_kfilter->refcnt);
	kmem_free(kn, sizeof(*kn));
}

/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_DISABLED) != 0) {
		kn->kn_status &= ~KN_DISABLED;
	}
	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Mark knote active, and queue it onto its kqueue if appropriate.
 */
static void
knote_activate(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}
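
/*
 * Illustrative userland use of the kqueue(2)/kevent(2) interfaces
 * implemented above (a hypothetical example, not part of this file):
 * wait for a descriptor 'sock' to become readable.
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &ev, 1, NULL, 0, NULL);
 *	(void)kevent(kq, NULL, 0, &ev, 1, NULL);	// blocks until ready
 */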