1 /* $NetBSD: kern_event.c,v 1.103 2018/01/12 17:58:51 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 
56 * 57 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp 58 */ 59 60 #include <sys/cdefs.h> 61 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.103 2018/01/12 17:58:51 christos Exp $"); 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/kernel.h> 66 #include <sys/wait.h> 67 #include <sys/proc.h> 68 #include <sys/file.h> 69 #include <sys/select.h> 70 #include <sys/queue.h> 71 #include <sys/event.h> 72 #include <sys/eventvar.h> 73 #include <sys/poll.h> 74 #include <sys/kmem.h> 75 #include <sys/stat.h> 76 #include <sys/filedesc.h> 77 #include <sys/syscallargs.h> 78 #include <sys/kauth.h> 79 #include <sys/conf.h> 80 #include <sys/atomic.h> 81 82 static int kqueue_scan(file_t *, size_t, struct kevent *, 83 const struct timespec *, register_t *, 84 const struct kevent_ops *, struct kevent *, 85 size_t); 86 static int kqueue_ioctl(file_t *, u_long, void *); 87 static int kqueue_fcntl(file_t *, u_int, void *); 88 static int kqueue_poll(file_t *, int); 89 static int kqueue_kqfilter(file_t *, struct knote *); 90 static int kqueue_stat(file_t *, struct stat *); 91 static int kqueue_close(file_t *); 92 static int kqueue_register(struct kqueue *, struct kevent *); 93 static void kqueue_doclose(struct kqueue *, struct klist *, int); 94 95 static void knote_detach(struct knote *, filedesc_t *fdp, bool); 96 static void knote_enqueue(struct knote *); 97 static void knote_activate(struct knote *); 98 99 static void filt_kqdetach(struct knote *); 100 static int filt_kqueue(struct knote *, long hint); 101 static int filt_procattach(struct knote *); 102 static void filt_procdetach(struct knote *); 103 static int filt_proc(struct knote *, long hint); 104 static int filt_fileattach(struct knote *); 105 static void filt_timerexpire(void *x); 106 static int filt_timerattach(struct knote *); 107 static void filt_timerdetach(struct knote *); 108 static int filt_timer(struct knote *, long hint); 109 static int filt_fsattach(struct knote *kn); 110 
static void	filt_fsdetach(struct knote *kn);
static int	filt_fs(struct knote *kn, long hint);

/*
 * File operations vector for descriptors created by kqueue(2).
 * read(2)/write(2) on a kqueue fd fail with ENXIO.
 */
static const struct fileops kqueueops = {
	.fo_name = "kqueue",
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = fnullop_restart,
};

/* EVFILT_READ on a kqueue descriptor itself (attached via kqueue_kqfilter). */
static const struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};

/* EVFILT_PROC: watch another process (exit/fork/etc.). */
static const struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};

/*
 * EVFILT_READ/EVFILT_WRITE/EVFILT_VNODE on ordinary descriptors:
 * attach dispatches to the file's own fo_kqfilter method.
 */
static const struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
	.f_detach = NULL,
	.f_event = NULL,
};

/* EVFILT_TIMER: periodic/one-shot timer backed by a callout. */
static const struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};

/* EVFILT_FS: file system wide events delivered through fs_klist. */
static const struct filterops fs_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fs,
};

/* Count/limit of callouts allocated for EVFILT_TIMER knotes. */
static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0, },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ "EVFILT_FS",		EVFILT_FS,	0, &fs_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};

/*
 * User defined kfilters, registered at runtime via kfilter_register().
 * All four variables are protected by kqueue_filter_lock.
 */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

/*
 * Global Locks.
 *
 * Lock order:
 *
 *	kqueue_filter_lock
 *	-> kn_kq->kq_fdp->fd_lock
 *	-> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
 *	-> kn_kq->kq_lock
 *
 * Locking rules:
 *
 *	f_attach: fdp->fd_lock, KERNEL_LOCK
 *	f_detach: fdp->fd_lock, KERNEL_LOCK
 *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
 *	f_event via knote: whatever caller guarantees
 *	Typically,	f_event(NOTE_SUBMIT) via knote: object lock
 *			f_event(!NOTE_SUBMIT) via knote: nothing,
 *			acquires/releases object lock inside.
223 */ 224 static krwlock_t kqueue_filter_lock; /* lock on filter lists */ 225 static kmutex_t kqueue_misc_lock; /* miscellaneous */ 226 227 static kauth_listener_t kqueue_listener; 228 229 static int 230 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, 231 void *arg0, void *arg1, void *arg2, void *arg3) 232 { 233 struct proc *p; 234 int result; 235 236 result = KAUTH_RESULT_DEFER; 237 p = arg0; 238 239 if (action != KAUTH_PROCESS_KEVENT_FILTER) 240 return result; 241 242 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) || 243 ISSET(p->p_flag, PK_SUGID))) 244 return result; 245 246 result = KAUTH_RESULT_ALLOW; 247 248 return result; 249 } 250 251 /* 252 * Initialize the kqueue subsystem. 253 */ 254 void 255 kqueue_init(void) 256 { 257 258 rw_init(&kqueue_filter_lock); 259 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE); 260 261 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, 262 kqueue_listener_cb, NULL); 263 } 264 265 /* 266 * Find kfilter entry by name, or NULL if not found. 
267 */ 268 static struct kfilter * 269 kfilter_byname_sys(const char *name) 270 { 271 int i; 272 273 KASSERT(rw_lock_held(&kqueue_filter_lock)); 274 275 for (i = 0; sys_kfilters[i].name != NULL; i++) { 276 if (strcmp(name, sys_kfilters[i].name) == 0) 277 return &sys_kfilters[i]; 278 } 279 return NULL; 280 } 281 282 static struct kfilter * 283 kfilter_byname_user(const char *name) 284 { 285 int i; 286 287 KASSERT(rw_lock_held(&kqueue_filter_lock)); 288 289 /* user filter slots have a NULL name if previously deregistered */ 290 for (i = 0; i < user_kfilterc ; i++) { 291 if (user_kfilters[i].name != NULL && 292 strcmp(name, user_kfilters[i].name) == 0) 293 return &user_kfilters[i]; 294 } 295 return NULL; 296 } 297 298 static struct kfilter * 299 kfilter_byname(const char *name) 300 { 301 struct kfilter *kfilter; 302 303 KASSERT(rw_lock_held(&kqueue_filter_lock)); 304 305 if ((kfilter = kfilter_byname_sys(name)) != NULL) 306 return kfilter; 307 308 return kfilter_byname_user(name); 309 } 310 311 /* 312 * Find kfilter entry by filter id, or NULL if not found. 313 * Assumes entries are indexed in filter id order, for speed. 314 */ 315 static struct kfilter * 316 kfilter_byfilter(uint32_t filter) 317 { 318 struct kfilter *kfilter; 319 320 KASSERT(rw_lock_held(&kqueue_filter_lock)); 321 322 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */ 323 kfilter = &sys_kfilters[filter]; 324 else if (user_kfilters != NULL && 325 filter < EVFILT_SYSCOUNT + user_kfilterc) 326 /* it's a user filter */ 327 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT]; 328 else 329 return (NULL); /* out of range */ 330 KASSERT(kfilter->filter == filter); /* sanity check! */ 331 return (kfilter); 332 } 333 334 /* 335 * Register a new kfilter. Stores the entry in user_kfilters. 336 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise. 337 * If retfilter != NULL, the new filterid is returned in it. 
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	/* Prefer to reuse a slot freed by kfilter_unregister(). */
	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot.  Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		/* Zero only the new tail; the old portion is copied below. */
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
reuse:
	/* Duplicate the name; namelen receives the allocated size. */
	kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);

	/* Filter id is the slot index offset past the system filters. */
	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	/* Keep a private copy of the ops so the caller's copy may go away. */
	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);		/* still referenced by knotes */
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}
	rw_exit(&kqueue_filter_lock);

	return (0);
}


/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors.  Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	/* kq_lock is a spin mutex; it also protects the selinfo klist. */
	mutex_spin_enter(&kq->kq_lock);
	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	/* With NOTE_SUBMIT the caller already holds kq_lock. */
	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = kq->kq_count;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	struct lwp *curl;

	curl = curlwp;

	mutex_enter(proc_lock);
	if (kn->kn_flags & EV_FLAG1) {
		/*
		 * NOTE_TRACK attaches to the child process too early
		 * for proc_find, so do a raw look up and check the state
		 * explicitly.
		 */
		p = proc_find_raw(kn->kn_id);
		if (p != NULL && p->p_stat != SIDL)
			p = NULL;
	} else {
		p = proc_find(kn->kn_id);
	}

	if (p == NULL) {
		mutex_exit(proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	/* Take p_lock before dropping proc_lock so 'p' cannot go away. */
	mutex_enter(p->p_lock);
	mutex_exit(proc_lock);
	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
	    p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
	mutex_exit(p->p_lock);

	return 0;
}

/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_obj;

	mutex_enter(p->p_lock);
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
	mutex_exit(p->p_lock);
}

/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event, fflag;
	struct kevent kev;
	struct kqueue *kq;
	int error;

	/* hint encodes the event in NOTE_PCTRLMASK, data in NOTE_PDATAMASK */
	event = (u_int)hint & NOTE_PCTRLMASK;
	kq = kn->kn_kq;
	fflag = 0;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		fflag |= event;

	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_obj;

		if (p != NULL)
			kn->kn_data = P_WAITSTATUS(p);
		/*
		 * Process is gone, so flag the event as finished.
		 *
		 * Detach the knote from watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since psignal KNOTE() is called
		 * also for zombies and we might end up reading freed
		 * memory if the kevent would already be picked up
		 * and knote g/c'ed.
		 */
		filt_procdetach(kn);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= fflag;
		mutex_spin_exit(&kq->kq_lock);

		return 1;
	}

	mutex_spin_enter(&kq->kq_lock);
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		/*
		 * Process forked, and user wants to track the new process,
		 * so attach a new knote to it, and immediately report an
		 * event with the parent's pid.  Register knote with new
		 * process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		/* Drop the spin lock across kqueue_register(), which sleeps. */
		mutex_spin_exit(&kq->kq_lock);
		error = kqueue_register(kq, &kev);
		mutex_spin_enter(&kq->kq_lock);
		if (error != 0)
			kn->kn_fflags |= NOTE_TRACKERR;
	}
	kn->kn_fflags |= fflag;
	fflag = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflag != 0;
}

/*
 * Callout handler for EVFILT_TIMER: count the expiry, activate the
 * knote, and re-arm the callout unless the event is one-shot.
 */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	int tticks;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_data++;
	knote_activate(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tticks = mstohz(kn->kn_sdata);
		if (tticks <= 0)
			tticks = 1;
		callout_schedule((callout_t *)kn->kn_hook, tticks);
	}
	mutex_exit(&kqueue_misc_lock);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	int tticks;

	tticks = mstohz(kn->kn_sdata);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kn->kn_sdata == 0)
			return EINVAL;
		tticks = 1;
	}

	/* Enforce the global callout limit; undo the increment on failure. */
	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	kn->kn_hook = calloutp;
	mutex_spin_exit(&kq->kq_lock);

	callout_reset(calloutp, tticks, filt_timerexpire, kn);

	return (0);
}

/*
 * Filter detach method for EVFILT_TIMER: stop the callout (waiting for
 * an in-flight filt_timerexpire to finish) and release its storage.
 */
static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	/* prevent rescheduling when we expire */
	kn->kn_flags |= EV_ONESHOT;
	mutex_spin_exit(&kq->kq_lock);

	calloutp = (callout_t *)kn->kn_hook;
	callout_halt(calloutp, NULL);
	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}

/*
 * Filter event method for EVFILT_TIMER: active whenever at least one
 * expiry has been recorded in kn_data.
 */
static int
filt_timer(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	rv = (kn->kn_data != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}

/*
 * Filter event method for EVFILT_FS.
 */
/* Global list of EVFILT_FS knotes, protected by kqueue_misc_lock. */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	kn->kn_flags |= EV_CLEAR;
	SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
	mutex_exit(&kqueue_misc_lock);

	return 0;
}

static void
filt_fsdetach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
	mutex_exit(&kqueue_misc_lock);
}

static int
filt_fs(struct knote *kn, long hint)
{
	int rv;

	/* Accumulate the hint bits; active while any bit remains set. */
	mutex_enter(&kqueue_misc_lock);
	kn->kn_fflags |= hint;
	rv = (kn->kn_fflags != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides full kqfilter entry for device switch tables, which
 * has same effect as filter using filt_seltrue() as filter method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_seltruedetach,
	.f_event = filt_seltrue,
};

/*
 * Generic kqfilter entry for drivers that are always ready for I/O:
 * accepts EVFILT_READ/EVFILT_WRITE and wires up seltrue_filtops.
 */
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}

/*
 * kqueue(2) system call.
 */
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	/* kq_lock is taken from interrupt context; hence IPL_SCHED. */
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_kqueue = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
	/* fd_affix() publishes the descriptor; must be fully set up first. */
	fd_affix(curproc, fp, fd);
	return error;
}

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	return kqueue1(l, 0, retval);
}

int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	return kqueue1(l, SCARG(uap, flags), retval);
}

/*
 * kevent(2) system call.
 */
/*
 * Fetch 'n' changes from a user-space changelist starting at 'index'.
 * Used as keo_fetch_changes for the native ABI.
 */
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

/*
 * Store 'n' events into a user-space eventlist starting at 'index'.
 * Used as keo_put_events for the native ABI.
 */
int
kevent_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

/* Copy operations for the native (non-compat) kevent ABI. */
static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};

int
sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}

/*
 * Common kevent(2) implementation: register all changes in the
 * changelist, then scan for pending events.  'keops' abstracts the
 * user/kernel copy operations so that compat code can reuse this.
 */
int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec	ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[KQ_NEVENTS];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = fp->f_kqueue;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		/* Process the changelist in kevbuf-sized batches. */
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (!error && !(kevp->flags & EV_RECEIPT))
				continue;
			if (nevents == 0)
				goto done;
			/* Report the failure (or receipt) in the eventlist. */
			kevp->flags = EV_ERROR;
			kevp->data = error;
			error = (*keops->keo_put_events)
			    (keops->keo_private, kevp,
			    eventlist, nerrors, 1);
			if (error)
				goto done;
			nevents--;
			nerrors++;
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
 done:
	fd_putfile(fd);
	return (error);
}

/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	/* Pre-allocate before taking any locks; may not be needed. */
	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		kmem_free(newkn, sizeof(*newkn));
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		/* validate descriptor */
		if (kev->ident > INT_MAX
		    || (fp = fd_getfile(fd = kev->ident)) == NULL) {
			rw_exit(&kqueue_filter_lock);
			kmem_free(newkn, sizeof(*newkn));
			return EBADF;
		}
		mutex_enter(&fdp->fd_lock);
		ff = fdp->fd_dt->dt_ff[fd];
		if (ff->ff_refcnt & FR_CLOSING) {
			error = EBADF;
			goto doneunlock;
		}
		/* fd_lastkqfile bounds the fds that may carry knotes. */
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		mutex_enter(&fdp->fd_lock);
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_id = kev->ident;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			KASSERT(kn->kn_fop != NULL);
			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!kn->kn_fop->f_isfd) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			KERNEL_LOCK(1, NULL);		/* XXXSMP */
			error = (*kfilter->filtops->f_attach)(kn);
			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
			if (error != 0) {
#ifdef DEBUG
				const file_t *ft = kn->kn_obj;
				uprintf("%s: event type %d not supported for "
				    "file type %d/%s (error %d)\n", __func__,
				    kn->kn_filter, ft ? ft->f_type : -1,
				    ft ? ft->f_ops->fo_name : "?", error);
#endif

				/* knote_detach() drops fdp->fd_lock */
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}
		/*
		 * We can get here if we are trying to attach
		 * an event to a file descriptor that does not
		 * support events, and the attach routine is
		 * broken and does not return an error.
		 */
		KASSERT(kn->kn_fop != NULL);
		KASSERT(kn->kn_fop->f_event != NULL);
		KERNEL_LOCK(1, NULL);			/* XXXSMP */
		rv = (*kn->kn_fop->f_event)(kn, 0);
		KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
		if (rv)
			knote_activate(kn);
	} else {
		if (kn == NULL) {
			error = ENOENT;
			goto doneunlock;
		}
		if (kev->flags & EV_DELETE) {
			/* knote_detach() drops fdp->fd_lock */
			knote_detach(kn, fdp, true);
			goto done;
		}
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
 doneunlock:
	mutex_exit(&fdp->fd_lock);
 done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		kmem_free(newkn, sizeof(*newkn));
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}

#if defined(DEBUG)
#define	KN_FMT(buf, kn) \
    (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)

static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
	const struct knote *kn;
	int count;
	int nmarker;
	char buf[128];

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(kq->kq_count >= 0);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
				    func, line, kq, kn, kn->kn_kq,
				    KN_FMT(buf, kn));
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
				    func, line, kq, kn, KN_FMT(buf,
kn)); 1225 } 1226 count++; 1227 if (count > kq->kq_count) { 1228 goto bad; 1229 } 1230 } else { 1231 nmarker++; 1232 #if 0 1233 if (nmarker > 10000) { 1234 panic("%s,%zu: kq=%p too many markers: " 1235 "%d != %d, nmarker=%d", 1236 func, line, kq, kq->kq_count, count, 1237 nmarker); 1238 } 1239 #endif 1240 } 1241 } 1242 if (kq->kq_count != count) { 1243 bad: 1244 panic("%s,%zu: kq=%p kq->kq_count(%d) != count(%d), nmarker=%d", 1245 func, line, kq, kq->kq_count, count, nmarker); 1246 } 1247 } 1248 #define kq_check(a) kqueue_check(__func__, __LINE__, (a)) 1249 #else /* defined(DEBUG) */ 1250 #define kq_check(a) /* nothing */ 1251 #endif /* defined(DEBUG) */ 1252 1253 /* 1254 * Scan through the list of events on fp (for a maximum of maxevents), 1255 * returning the results in to ulistp. Timeout is determined by tsp; if 1256 * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait 1257 * as appropriate. 1258 */ 1259 static int 1260 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp, 1261 const struct timespec *tsp, register_t *retval, 1262 const struct kevent_ops *keops, struct kevent *kevbuf, 1263 size_t kevcnt) 1264 { 1265 struct kqueue *kq; 1266 struct kevent *kevp; 1267 struct timespec ats, sleepts; 1268 struct knote *kn, *marker, morker; 1269 size_t count, nkev, nevents; 1270 int timeout, error, rv; 1271 filedesc_t *fdp; 1272 1273 fdp = curlwp->l_fd; 1274 kq = fp->f_kqueue; 1275 count = maxevents; 1276 nkev = nevents = error = 0; 1277 if (count == 0) { 1278 *retval = 0; 1279 return 0; 1280 } 1281 1282 if (tsp) { /* timeout supplied */ 1283 ats = *tsp; 1284 if (inittimeleft(&ats, &sleepts) == -1) { 1285 *retval = maxevents; 1286 return EINVAL; 1287 } 1288 timeout = tstohz(&ats); 1289 if (timeout <= 0) 1290 timeout = -1; /* do poll */ 1291 } else { 1292 /* no timeout, wait forever */ 1293 timeout = 0; 1294 } 1295 1296 memset(&morker, 0, sizeof(morker)); 1297 marker = &morker; 1298 marker->kn_status = KN_MARKER; 1299 
mutex_spin_enter(&kq->kq_lock); 1300 retry: 1301 kevp = kevbuf; 1302 if (kq->kq_count == 0) { 1303 if (timeout >= 0) { 1304 error = cv_timedwait_sig(&kq->kq_cv, 1305 &kq->kq_lock, timeout); 1306 if (error == 0) { 1307 if (tsp == NULL || (timeout = 1308 gettimeleft(&ats, &sleepts)) > 0) 1309 goto retry; 1310 } else { 1311 /* don't restart after signals... */ 1312 if (error == ERESTART) 1313 error = EINTR; 1314 if (error == EWOULDBLOCK) 1315 error = 0; 1316 } 1317 } 1318 mutex_spin_exit(&kq->kq_lock); 1319 } else { 1320 /* mark end of knote list */ 1321 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); 1322 1323 /* 1324 * Acquire the fdp->fd_lock interlock to avoid races with 1325 * file creation/destruction from other threads. 1326 */ 1327 mutex_spin_exit(&kq->kq_lock); 1328 mutex_enter(&fdp->fd_lock); 1329 mutex_spin_enter(&kq->kq_lock); 1330 1331 while (count != 0) { 1332 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */ 1333 while ((kn->kn_status & KN_MARKER) != 0) { 1334 if (kn == marker) { 1335 /* it's our marker, stop */ 1336 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1337 if (count < maxevents || (tsp != NULL && 1338 (timeout = gettimeleft(&ats, 1339 &sleepts)) <= 0)) 1340 goto done; 1341 mutex_exit(&fdp->fd_lock); 1342 goto retry; 1343 } 1344 /* someone else's marker. 
*/ 1345 kn = TAILQ_NEXT(kn, kn_tqe); 1346 } 1347 kq_check(kq); 1348 kq->kq_count--; 1349 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1350 kn->kn_status &= ~KN_QUEUED; 1351 kn->kn_status |= KN_BUSY; 1352 kq_check(kq); 1353 if (kn->kn_status & KN_DISABLED) { 1354 kn->kn_status &= ~KN_BUSY; 1355 /* don't want disabled events */ 1356 continue; 1357 } 1358 if ((kn->kn_flags & EV_ONESHOT) == 0) { 1359 mutex_spin_exit(&kq->kq_lock); 1360 KASSERT(kn->kn_fop != NULL); 1361 KASSERT(kn->kn_fop->f_event != NULL); 1362 KERNEL_LOCK(1, NULL); /* XXXSMP */ 1363 KASSERT(mutex_owned(&fdp->fd_lock)); 1364 rv = (*kn->kn_fop->f_event)(kn, 0); 1365 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ 1366 mutex_spin_enter(&kq->kq_lock); 1367 /* Re-poll if note was re-enqueued. */ 1368 if ((kn->kn_status & KN_QUEUED) != 0) { 1369 kn->kn_status &= ~KN_BUSY; 1370 continue; 1371 } 1372 if (rv == 0) { 1373 /* 1374 * non-ONESHOT event that hasn't 1375 * triggered again, so de-queue. 1376 */ 1377 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY); 1378 continue; 1379 } 1380 } 1381 /* XXXAD should be got from f_event if !oneshot. 
*/ 1382 *kevp++ = kn->kn_kevent; 1383 nkev++; 1384 if (kn->kn_flags & EV_ONESHOT) { 1385 /* delete ONESHOT events after retrieval */ 1386 kn->kn_status &= ~KN_BUSY; 1387 mutex_spin_exit(&kq->kq_lock); 1388 knote_detach(kn, fdp, true); 1389 mutex_enter(&fdp->fd_lock); 1390 mutex_spin_enter(&kq->kq_lock); 1391 } else if (kn->kn_flags & EV_CLEAR) { 1392 /* clear state after retrieval */ 1393 kn->kn_data = 0; 1394 kn->kn_fflags = 0; 1395 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY); 1396 } else if (kn->kn_flags & EV_DISPATCH) { 1397 kn->kn_status |= KN_DISABLED; 1398 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY); 1399 } else { 1400 /* add event back on list */ 1401 kq_check(kq); 1402 kn->kn_status |= KN_QUEUED; 1403 kn->kn_status &= ~KN_BUSY; 1404 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1405 kq->kq_count++; 1406 kq_check(kq); 1407 } 1408 if (nkev == kevcnt) { 1409 /* do copyouts in kevcnt chunks */ 1410 mutex_spin_exit(&kq->kq_lock); 1411 mutex_exit(&fdp->fd_lock); 1412 error = (*keops->keo_put_events) 1413 (keops->keo_private, 1414 kevbuf, ulistp, nevents, nkev); 1415 mutex_enter(&fdp->fd_lock); 1416 mutex_spin_enter(&kq->kq_lock); 1417 nevents += nkev; 1418 nkev = 0; 1419 kevp = kevbuf; 1420 } 1421 count--; 1422 if (error != 0 || count == 0) { 1423 /* remove marker */ 1424 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); 1425 break; 1426 } 1427 } 1428 done: 1429 mutex_spin_exit(&kq->kq_lock); 1430 mutex_exit(&fdp->fd_lock); 1431 } 1432 if (nkev != 0) { 1433 /* copyout remaining events */ 1434 error = (*keops->keo_put_events)(keops->keo_private, 1435 kevbuf, ulistp, nevents, nkev); 1436 } 1437 *retval = maxevents - count; 1438 1439 return error; 1440 } 1441 1442 /* 1443 * fileops ioctl method for a kqueue descriptor. 1444 * 1445 * Two ioctls are currently supported. They both use struct kfilter_mapping: 1446 * KFILTER_BYNAME find name for filter, and return result in 1447 * name, which is of size len. 1448 * KFILTER_BYFILTER find filter for name. len is ignored. 
1449 */ 1450 /*ARGSUSED*/ 1451 static int 1452 kqueue_ioctl(file_t *fp, u_long com, void *data) 1453 { 1454 struct kfilter_mapping *km; 1455 const struct kfilter *kfilter; 1456 char *name; 1457 int error; 1458 1459 km = data; 1460 error = 0; 1461 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP); 1462 1463 switch (com) { 1464 case KFILTER_BYFILTER: /* convert filter -> name */ 1465 rw_enter(&kqueue_filter_lock, RW_READER); 1466 kfilter = kfilter_byfilter(km->filter); 1467 if (kfilter != NULL) { 1468 strlcpy(name, kfilter->name, KFILTER_MAXNAME); 1469 rw_exit(&kqueue_filter_lock); 1470 error = copyoutstr(name, km->name, km->len, NULL); 1471 } else { 1472 rw_exit(&kqueue_filter_lock); 1473 error = ENOENT; 1474 } 1475 break; 1476 1477 case KFILTER_BYNAME: /* convert name -> filter */ 1478 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL); 1479 if (error) { 1480 break; 1481 } 1482 rw_enter(&kqueue_filter_lock, RW_READER); 1483 kfilter = kfilter_byname(name); 1484 if (kfilter != NULL) 1485 km->filter = kfilter->filter; 1486 else 1487 error = ENOENT; 1488 rw_exit(&kqueue_filter_lock); 1489 break; 1490 1491 default: 1492 error = ENOTTY; 1493 break; 1494 1495 } 1496 kmem_free(name, KFILTER_MAXNAME); 1497 return (error); 1498 } 1499 1500 /* 1501 * fileops fcntl method for a kqueue descriptor. 1502 */ 1503 static int 1504 kqueue_fcntl(file_t *fp, u_int com, void *data) 1505 { 1506 1507 return (ENOTTY); 1508 } 1509 1510 /* 1511 * fileops poll method for a kqueue descriptor. 1512 * Determine if kqueue has events pending. 
1513 */ 1514 static int 1515 kqueue_poll(file_t *fp, int events) 1516 { 1517 struct kqueue *kq; 1518 int revents; 1519 1520 kq = fp->f_kqueue; 1521 1522 revents = 0; 1523 if (events & (POLLIN | POLLRDNORM)) { 1524 mutex_spin_enter(&kq->kq_lock); 1525 if (kq->kq_count != 0) { 1526 revents |= events & (POLLIN | POLLRDNORM); 1527 } else { 1528 selrecord(curlwp, &kq->kq_sel); 1529 } 1530 kq_check(kq); 1531 mutex_spin_exit(&kq->kq_lock); 1532 } 1533 1534 return revents; 1535 } 1536 1537 /* 1538 * fileops stat method for a kqueue descriptor. 1539 * Returns dummy info, with st_size being number of events pending. 1540 */ 1541 static int 1542 kqueue_stat(file_t *fp, struct stat *st) 1543 { 1544 struct kqueue *kq; 1545 1546 kq = fp->f_kqueue; 1547 1548 memset(st, 0, sizeof(*st)); 1549 st->st_size = kq->kq_count; 1550 st->st_blksize = sizeof(struct kevent); 1551 st->st_mode = S_IFIFO; 1552 1553 return 0; 1554 } 1555 1556 static void 1557 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd) 1558 { 1559 struct knote *kn; 1560 filedesc_t *fdp; 1561 1562 fdp = kq->kq_fdp; 1563 1564 KASSERT(mutex_owned(&fdp->fd_lock)); 1565 1566 for (kn = SLIST_FIRST(list); kn != NULL;) { 1567 if (kq != kn->kn_kq) { 1568 kn = SLIST_NEXT(kn, kn_link); 1569 continue; 1570 } 1571 knote_detach(kn, fdp, true); 1572 mutex_enter(&fdp->fd_lock); 1573 kn = SLIST_FIRST(list); 1574 } 1575 } 1576 1577 1578 /* 1579 * fileops close method for a kqueue descriptor. 
1580 */ 1581 static int 1582 kqueue_close(file_t *fp) 1583 { 1584 struct kqueue *kq; 1585 filedesc_t *fdp; 1586 fdfile_t *ff; 1587 int i; 1588 1589 kq = fp->f_kqueue; 1590 fp->f_kqueue = NULL; 1591 fp->f_type = 0; 1592 fdp = curlwp->l_fd; 1593 1594 mutex_enter(&fdp->fd_lock); 1595 for (i = 0; i <= fdp->fd_lastkqfile; i++) { 1596 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL) 1597 continue; 1598 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i); 1599 } 1600 if (fdp->fd_knhashmask != 0) { 1601 for (i = 0; i < fdp->fd_knhashmask + 1; i++) { 1602 kqueue_doclose(kq, &fdp->fd_knhash[i], -1); 1603 } 1604 } 1605 mutex_exit(&fdp->fd_lock); 1606 1607 KASSERT(kq->kq_count == 0); 1608 mutex_destroy(&kq->kq_lock); 1609 cv_destroy(&kq->kq_cv); 1610 seldestroy(&kq->kq_sel); 1611 kmem_free(kq, sizeof(*kq)); 1612 1613 return (0); 1614 } 1615 1616 /* 1617 * struct fileops kqfilter method for a kqueue descriptor. 1618 * Event triggered when monitored kqueue changes. 1619 */ 1620 static int 1621 kqueue_kqfilter(file_t *fp, struct knote *kn) 1622 { 1623 struct kqueue *kq; 1624 1625 kq = ((file_t *)kn->kn_obj)->f_kqueue; 1626 1627 KASSERT(fp == kn->kn_obj); 1628 1629 if (kn->kn_filter != EVFILT_READ) 1630 return 1; 1631 1632 kn->kn_fop = &kqread_filtops; 1633 mutex_enter(&kq->kq_lock); 1634 SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext); 1635 mutex_exit(&kq->kq_lock); 1636 1637 return 0; 1638 } 1639 1640 1641 /* 1642 * Walk down a list of knotes, activating them if their event has 1643 * triggered. The caller's object lock (e.g. device driver lock) 1644 * must be held. 
1645 */ 1646 void 1647 knote(struct klist *list, long hint) 1648 { 1649 struct knote *kn, *tmpkn; 1650 1651 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) { 1652 KASSERT(kn->kn_fop != NULL); 1653 KASSERT(kn->kn_fop->f_event != NULL); 1654 if ((*kn->kn_fop->f_event)(kn, hint)) 1655 knote_activate(kn); 1656 } 1657 } 1658 1659 /* 1660 * Remove all knotes referencing a specified fd 1661 */ 1662 void 1663 knote_fdclose(int fd) 1664 { 1665 struct klist *list; 1666 struct knote *kn; 1667 filedesc_t *fdp; 1668 1669 fdp = curlwp->l_fd; 1670 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist; 1671 mutex_enter(&fdp->fd_lock); 1672 while ((kn = SLIST_FIRST(list)) != NULL) { 1673 knote_detach(kn, fdp, true); 1674 mutex_enter(&fdp->fd_lock); 1675 } 1676 mutex_exit(&fdp->fd_lock); 1677 } 1678 1679 /* 1680 * Drop knote. Called with fdp->fd_lock held, and will drop before 1681 * returning. 1682 */ 1683 static void 1684 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop) 1685 { 1686 struct klist *list; 1687 struct kqueue *kq; 1688 1689 kq = kn->kn_kq; 1690 1691 KASSERT((kn->kn_status & KN_MARKER) == 0); 1692 KASSERT(mutex_owned(&fdp->fd_lock)); 1693 1694 KASSERT(kn->kn_fop != NULL); 1695 /* Remove from monitored object. */ 1696 if (dofop) { 1697 KASSERT(kn->kn_fop->f_detach != NULL); 1698 KERNEL_LOCK(1, NULL); /* XXXSMP */ 1699 (*kn->kn_fop->f_detach)(kn); 1700 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ 1701 } 1702 1703 /* Remove from descriptor table. */ 1704 if (kn->kn_fop->f_isfd) 1705 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist; 1706 else 1707 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; 1708 1709 SLIST_REMOVE(list, kn, knote, kn_link); 1710 1711 /* Remove from kqueue. 
*/ 1712 again: 1713 mutex_spin_enter(&kq->kq_lock); 1714 if ((kn->kn_status & KN_QUEUED) != 0) { 1715 kq_check(kq); 1716 kq->kq_count--; 1717 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1718 kn->kn_status &= ~KN_QUEUED; 1719 kq_check(kq); 1720 } else if (kn->kn_status & KN_BUSY) { 1721 mutex_spin_exit(&kq->kq_lock); 1722 goto again; 1723 } 1724 mutex_spin_exit(&kq->kq_lock); 1725 1726 mutex_exit(&fdp->fd_lock); 1727 if (kn->kn_fop->f_isfd) 1728 fd_putfile(kn->kn_id); 1729 atomic_dec_uint(&kn->kn_kfilter->refcnt); 1730 kmem_free(kn, sizeof(*kn)); 1731 } 1732 1733 /* 1734 * Queue new event for knote. 1735 */ 1736 static void 1737 knote_enqueue(struct knote *kn) 1738 { 1739 struct kqueue *kq; 1740 1741 KASSERT((kn->kn_status & KN_MARKER) == 0); 1742 1743 kq = kn->kn_kq; 1744 1745 mutex_spin_enter(&kq->kq_lock); 1746 if ((kn->kn_status & KN_DISABLED) != 0) { 1747 kn->kn_status &= ~KN_DISABLED; 1748 } 1749 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) { 1750 kq_check(kq); 1751 kn->kn_status |= KN_QUEUED; 1752 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1753 kq->kq_count++; 1754 kq_check(kq); 1755 cv_broadcast(&kq->kq_cv); 1756 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT); 1757 } 1758 mutex_spin_exit(&kq->kq_lock); 1759 } 1760 /* 1761 * Queue new event for knote. 1762 */ 1763 static void 1764 knote_activate(struct knote *kn) 1765 { 1766 struct kqueue *kq; 1767 1768 KASSERT((kn->kn_status & KN_MARKER) == 0); 1769 1770 kq = kn->kn_kq; 1771 1772 mutex_spin_enter(&kq->kq_lock); 1773 kn->kn_status |= KN_ACTIVE; 1774 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) { 1775 kq_check(kq); 1776 kn->kn_status |= KN_QUEUED; 1777 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1778 kq->kq_count++; 1779 kq_check(kq); 1780 cv_broadcast(&kq->kq_cv); 1781 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT); 1782 } 1783 mutex_spin_exit(&kq->kq_lock); 1784 } 1785