1 /* $OpenBSD: kern_event.c,v 1.200 2024/08/06 08:44:54 claudio Exp $ */ 2 3 /*- 4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $ 29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/proc.h> 34 #include <sys/pledge.h> 35 #include <sys/malloc.h> 36 #include <sys/file.h> 37 #include <sys/filedesc.h> 38 #include <sys/fcntl.h> 39 #include <sys/queue.h> 40 #include <sys/event.h> 41 #include <sys/eventvar.h> 42 #include <sys/ktrace.h> 43 #include <sys/pool.h> 44 #include <sys/stat.h> 45 #include <sys/mount.h> 46 #include <sys/syscallargs.h> 47 #include <sys/time.h> 48 #include <sys/timeout.h> 49 #include <sys/vnode.h> 50 #include <sys/wait.h> 51 52 #ifdef DIAGNOSTIC 53 #define KLIST_ASSERT_LOCKED(kl) do { \ 54 if ((kl)->kl_ops != NULL) \ 55 (kl)->kl_ops->klo_assertlk((kl)->kl_arg); \ 56 else \ 57 KERNEL_ASSERT_LOCKED(); \ 58 } while (0) 59 #else 60 #define KLIST_ASSERT_LOCKED(kl) ((void)(kl)) 61 #endif 62 63 int dokqueue(struct proc *, int, register_t *); 64 struct kqueue *kqueue_alloc(struct filedesc *); 65 void kqueue_terminate(struct proc *p, struct kqueue *); 66 void KQREF(struct kqueue *); 67 void KQRELE(struct kqueue *); 68 69 void kqueue_purge(struct proc *, struct kqueue *); 70 int kqueue_sleep(struct kqueue *, struct timespec *); 71 72 int kqueue_read(struct file *, struct uio *, int); 73 int kqueue_write(struct file *, struct uio *, int); 74 int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, 75 struct proc *p); 76 int kqueue_kqfilter(struct file *fp, struct knote *kn); 77 int kqueue_stat(struct file *fp, struct stat *st, struct proc *p); 78 int kqueue_close(struct file *fp, struct proc *p); 79 void kqueue_wakeup(struct kqueue *kq); 80 81 #ifdef KQUEUE_DEBUG 82 void kqueue_do_check(struct kqueue *kq, const char *func, int line); 83 #define kqueue_check(kq) kqueue_do_check((kq), __func__, __LINE__) 84 #else 85 #define kqueue_check(kq) do {} while (0) 86 #endif 87 88 static int filter_attach(struct knote *kn); 89 static void filter_detach(struct knote *kn); 90 static int filter_event(struct knote *kn, long hint); 91 static int 
filter_modify(struct kevent *kev, struct knote *kn); 92 static int filter_process(struct knote *kn, struct kevent *kev); 93 static void kqueue_expand_hash(struct kqueue *kq); 94 static void kqueue_expand_list(struct kqueue *kq, int fd); 95 static void kqueue_task(void *); 96 static int klist_lock(struct klist *); 97 static void klist_unlock(struct klist *, int); 98 99 const struct fileops kqueueops = { 100 .fo_read = kqueue_read, 101 .fo_write = kqueue_write, 102 .fo_ioctl = kqueue_ioctl, 103 .fo_kqfilter = kqueue_kqfilter, 104 .fo_stat = kqueue_stat, 105 .fo_close = kqueue_close 106 }; 107 108 void knote_attach(struct knote *kn); 109 void knote_detach(struct knote *kn); 110 void knote_drop(struct knote *kn, struct proc *p); 111 void knote_enqueue(struct knote *kn); 112 void knote_dequeue(struct knote *kn); 113 int knote_acquire(struct knote *kn, struct klist *, int); 114 void knote_release(struct knote *kn); 115 void knote_activate(struct knote *kn); 116 void knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, 117 int idx, int purge); 118 119 void filt_kqdetach(struct knote *kn); 120 int filt_kqueue(struct knote *kn, long hint); 121 int filt_kqueuemodify(struct kevent *kev, struct knote *kn); 122 int filt_kqueueprocess(struct knote *kn, struct kevent *kev); 123 int filt_kqueue_common(struct knote *kn, struct kqueue *kq); 124 int filt_procattach(struct knote *kn); 125 void filt_procdetach(struct knote *kn); 126 int filt_proc(struct knote *kn, long hint); 127 int filt_procmodify(struct kevent *kev, struct knote *kn); 128 int filt_procprocess(struct knote *kn, struct kevent *kev); 129 int filt_sigattach(struct knote *kn); 130 void filt_sigdetach(struct knote *kn); 131 int filt_signal(struct knote *kn, long hint); 132 int filt_fileattach(struct knote *kn); 133 void filt_timerexpire(void *knx); 134 int filt_timerattach(struct knote *kn); 135 void filt_timerdetach(struct knote *kn); 136 int filt_timermodify(struct kevent *kev, struct knote *kn); 137 int filt_timerprocess(struct knote *kn, struct kevent *kev); 138 void filt_seltruedetach(struct knote *kn); 139 140 const struct filterops kqread_filtops = { 141 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 142 .f_attach = NULL, 143 .f_detach = filt_kqdetach, 144 .f_event = filt_kqueue, 145 .f_modify = filt_kqueuemodify, 146 .f_process = filt_kqueueprocess, 147 }; 148 149 const struct filterops proc_filtops = { 150 .f_flags = FILTEROP_MPSAFE, 151 .f_attach = filt_procattach, 152 .f_detach = filt_procdetach, 153 .f_event = filt_proc, 154 .f_modify = filt_procmodify, 155 .f_process = filt_procprocess, 156 }; 157 158 const struct filterops sig_filtops = { 159 .f_flags = FILTEROP_MPSAFE, 160 .f_attach = filt_sigattach, 161 .f_detach = filt_sigdetach, 162 .f_event = filt_signal, 163 .f_modify = filt_procmodify, 164 .f_process = filt_procprocess, 165 }; 166 167 const struct filterops file_filtops = { 168 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 169 .f_attach = filt_fileattach, 170 .f_detach = NULL, 171 .f_event = NULL, 172 }; 173 174 const struct filterops timer_filtops = { 175 .f_flags = 0, 176 .f_attach = filt_timerattach, 177 .f_detach = filt_timerdetach, 178 .f_event = NULL, 179 .f_modify = filt_timermodify, 180 .f_process = filt_timerprocess, 181 }; 182 183 struct pool knote_pool; 184 struct pool kqueue_pool; 185 struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); 186 struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl"); 187 int kq_ntimeouts = 0; 188 int kq_timeoutmax = (4 * 1024); 189 190 #define 
KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) 191 192 /* 193 * Table for all system-defined filters. 194 */ 195 const struct filterops *const sysfilt_ops[] = { 196 &file_filtops, /* EVFILT_READ */ 197 &file_filtops, /* EVFILT_WRITE */ 198 NULL, /*&aio_filtops,*/ /* EVFILT_AIO */ 199 &file_filtops, /* EVFILT_VNODE */ 200 &proc_filtops, /* EVFILT_PROC */ 201 &sig_filtops, /* EVFILT_SIGNAL */ 202 &timer_filtops, /* EVFILT_TIMER */ 203 &file_filtops, /* EVFILT_DEVICE */ 204 &file_filtops, /* EVFILT_EXCEPT */ 205 }; 206 207 void 208 KQREF(struct kqueue *kq) 209 { 210 refcnt_take(&kq->kq_refcnt); 211 } 212 213 void 214 KQRELE(struct kqueue *kq) 215 { 216 struct filedesc *fdp; 217 218 if (refcnt_rele(&kq->kq_refcnt) == 0) 219 return; 220 221 fdp = kq->kq_fdp; 222 if (rw_status(&fdp->fd_lock) == RW_WRITE) { 223 LIST_REMOVE(kq, kq_next); 224 } else { 225 fdplock(fdp); 226 LIST_REMOVE(kq, kq_next); 227 fdpunlock(fdp); 228 } 229 230 KASSERT(TAILQ_EMPTY(&kq->kq_head)); 231 KASSERT(kq->kq_nknotes == 0); 232 233 free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * 234 sizeof(struct knlist)); 235 hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT); 236 klist_free(&kq->kq_klist); 237 pool_put(&kqueue_pool, kq); 238 } 239 240 void 241 kqueue_init(void) 242 { 243 pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR, 244 PR_WAITOK, "kqueuepl", NULL); 245 pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR, 246 PR_WAITOK, "knotepl", NULL); 247 } 248 249 void 250 kqueue_init_percpu(void) 251 { 252 pool_cache_init(&knote_pool); 253 } 254 255 int 256 filt_fileattach(struct knote *kn) 257 { 258 struct file *fp = kn->kn_fp; 259 260 return fp->f_ops->fo_kqfilter(fp, kn); 261 } 262 263 int 264 kqueue_kqfilter(struct file *fp, struct knote *kn) 265 { 266 struct kqueue *kq = kn->kn_fp->f_data; 267 268 if (kn->kn_filter != EVFILT_READ) 269 return (EINVAL); 270 271 kn->kn_fop = &kqread_filtops; 272 klist_insert(&kq->kq_klist, kn); 273 return (0); 274 } 275 276 void 277 filt_kqdetach(struct knote *kn) 278 { 279 struct kqueue *kq = kn->kn_fp->f_data; 280 281 klist_remove(&kq->kq_klist, kn); 282 } 283 284 int 285 filt_kqueue_common(struct knote *kn, struct kqueue *kq) 286 { 287 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 288 289 kn->kn_data = kq->kq_count; 290 291 return (kn->kn_data > 0); 292 } 293 294 int 295 filt_kqueue(struct knote *kn, long hint) 296 { 297 struct kqueue *kq = kn->kn_fp->f_data; 298 int active; 299 300 mtx_enter(&kq->kq_lock); 301 active = filt_kqueue_common(kn, kq); 302 mtx_leave(&kq->kq_lock); 303 304 return (active); 305 } 306 307 int 308 filt_kqueuemodify(struct kevent *kev, struct knote *kn) 309 { 310 struct kqueue *kq = kn->kn_fp->f_data; 311 int active; 312 313 mtx_enter(&kq->kq_lock); 314 knote_assign(kev, kn); 315 active = filt_kqueue_common(kn, kq); 316 mtx_leave(&kq->kq_lock); 317 318 return (active); 319 } 320 321 int 322 filt_kqueueprocess(struct knote *kn, struct kevent *kev) 323 { 324 struct kqueue *kq = kn->kn_fp->f_data; 325 int active; 326 327 mtx_enter(&kq->kq_lock); 328 if (kev != NULL && (kn->kn_flags & EV_ONESHOT)) 329 active = 1; 330 else 331 active = filt_kqueue_common(kn, kq); 332 if (active) 333 knote_submit(kn, kev); 334 mtx_leave(&kq->kq_lock); 335 336 return (active); 337 } 338 339 int 340 filt_procattach(struct knote *kn) 341 { 342 struct process *pr; 343 int nolock; 344 345 if ((curproc->p_p->ps_flags & PS_PLEDGE) && 346 (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0) 347 return pledge_fail(curproc, EPERM, PLEDGE_PROC); 348 349 if (kn->kn_id > PID_MAX) 350 return 
ESRCH; 351 352 KERNEL_LOCK(); 353 pr = prfind(kn->kn_id); 354 if (pr == NULL) 355 goto fail; 356 357 /* exiting processes can't be specified */ 358 if (pr->ps_flags & PS_EXITING) 359 goto fail; 360 361 kn->kn_ptr.p_process = pr; 362 kn->kn_flags |= EV_CLEAR; /* automatically set */ 363 364 /* 365 * internal flag indicating registration done by kernel 366 */ 367 if (kn->kn_flags & EV_FLAG1) { 368 kn->kn_data = kn->kn_sdata; /* ppid */ 369 kn->kn_fflags = NOTE_CHILD; 370 kn->kn_flags &= ~EV_FLAG1; 371 rw_assert_wrlock(&kqueue_ps_list_lock); 372 } 373 374 /* this needs both the ps_mtx and exclusive kqueue_ps_list_lock. */ 375 nolock = (rw_status(&kqueue_ps_list_lock) == RW_WRITE); 376 if (!nolock) 377 rw_enter_write(&kqueue_ps_list_lock); 378 mtx_enter(&pr->ps_mtx); 379 klist_insert_locked(&pr->ps_klist, kn); 380 mtx_leave(&pr->ps_mtx); 381 if (!nolock) 382 rw_exit_write(&kqueue_ps_list_lock); 383 384 KERNEL_UNLOCK(); 385 386 return (0); 387 388 fail: 389 KERNEL_UNLOCK(); 390 return (ESRCH); 391 } 392 393 /* 394 * The knote may be attached to a different process, which may exit, 395 * leaving nothing for the knote to be attached to. So when the process 396 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so 397 * it will be deleted when read out. However, as part of the knote deletion, 398 * this routine is called, so a check is needed to avoid actually performing 399 * a detach, because the original process does not exist any more. 400 */ 401 void 402 filt_procdetach(struct knote *kn) 403 { 404 struct process *pr = kn->kn_ptr.p_process; 405 int status; 406 407 /* this needs both the ps_mtx and exclusive kqueue_ps_list_lock. */ 408 rw_enter_write(&kqueue_ps_list_lock); 409 mtx_enter(&pr->ps_mtx); 410 status = kn->kn_status; 411 412 if ((status & KN_DETACHED) == 0) 413 klist_remove_locked(&pr->ps_klist, kn); 414 415 mtx_leave(&pr->ps_mtx); 416 rw_exit_write(&kqueue_ps_list_lock); 417 } 418 419 int 420 filt_proc(struct knote *kn, long hint) 421 { 422 struct process *pr = kn->kn_ptr.p_process; 423 struct kqueue *kq = kn->kn_kq; 424 u_int event; 425 426 /* 427 * mask off extra data 428 */ 429 event = (u_int)hint & NOTE_PCTRLMASK; 430 431 /* 432 * if the user is interested in this event, record it. 433 */ 434 if (kn->kn_sfflags & event) 435 kn->kn_fflags |= event; 436 437 /* 438 * process is gone, so flag the event as finished and remove it 439 * from the process's klist 440 */ 441 if (event == NOTE_EXIT) { 442 struct process *pr = kn->kn_ptr.p_process; 443 444 mtx_enter(&kq->kq_lock); 445 kn->kn_status |= KN_DETACHED; 446 mtx_leave(&kq->kq_lock); 447 448 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 449 kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig); 450 klist_remove_locked(&pr->ps_klist, kn); 451 return (1); 452 } 453 454 /* 455 * process forked, and user wants to track the new process, 456 * so attach a new knote to it, and immediately report an 457 * event with the parent's pid. 458 */ 459 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { 460 struct kevent kev; 461 int error; 462 463 /* 464 * register knote with new process. 
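* The kernel-only EV_FLAG1 flag marks this registration; filt_procattach() converts it into an immediate NOTE_CHILD event that carries the parent pid in kn_data.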
465 */ 466 memset(&kev, 0, sizeof(kev)); 467 kev.ident = hint & NOTE_PDATAMASK; /* pid */ 468 kev.filter = kn->kn_filter; 469 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 470 kev.fflags = kn->kn_sfflags; 471 kev.data = kn->kn_id; /* parent */ 472 kev.udata = kn->kn_udata; /* preserve udata */ 473 474 rw_assert_wrlock(&kqueue_ps_list_lock); 475 mtx_leave(&pr->ps_mtx); 476 error = kqueue_register(kq, &kev, 0, NULL); 477 mtx_enter(&pr->ps_mtx); 478 479 if (error) 480 kn->kn_fflags |= NOTE_TRACKERR; 481 } 482 483 return (kn->kn_fflags != 0); 484 } 485 486 int 487 filt_procmodify(struct kevent *kev, struct knote *kn) 488 { 489 struct process *pr = kn->kn_ptr.p_process; 490 int active; 491 492 mtx_enter(&pr->ps_mtx); 493 active = knote_modify(kev, kn); 494 mtx_leave(&pr->ps_mtx); 495 496 return (active); 497 } 498 499 /* 500 * By default only grab the mutex here. If the event requires extra protection 501 * because it alters the klist (NOTE_EXIT, NOTE_FORK), the caller of knote() 502 * needs to grab the rwlock first. 503 */ 504 int 505 filt_procprocess(struct knote *kn, struct kevent *kev) 506 { 507 struct process *pr = kn->kn_ptr.p_process; 508 int active; 509 510 mtx_enter(&pr->ps_mtx); 511 active = knote_process(kn, kev); 512 mtx_leave(&pr->ps_mtx); 513 514 return (active); 515 } 516 517 /* 518 * signal knotes are shared with proc knotes, so we apply a mask to 519 * the hint in order to differentiate them from process hints. This 520 * could be avoided by using a signal-specific knote list, but probably 521 * isn't worth the trouble. 522 */ 523 int 524 filt_sigattach(struct knote *kn) 525 { 526 struct process *pr = curproc->p_p; 527 528 if (kn->kn_id >= NSIG) 529 return EINVAL; 530 531 kn->kn_ptr.p_process = pr; 532 kn->kn_flags |= EV_CLEAR; /* automatically set */ 533 534 /* this needs both the ps_mtx and exclusive kqueue_ps_list_lock. 
*/ 535 rw_enter_write(&kqueue_ps_list_lock); 536 mtx_enter(&pr->ps_mtx); 537 klist_insert_locked(&pr->ps_klist, kn); 538 mtx_leave(&pr->ps_mtx); 539 rw_exit_write(&kqueue_ps_list_lock); 540 541 return (0); 542 } 543 544 void 545 filt_sigdetach(struct knote *kn) 546 { 547 struct process *pr = kn->kn_ptr.p_process; 548 549 rw_enter_write(&kqueue_ps_list_lock); 550 mtx_enter(&pr->ps_mtx); 551 klist_remove_locked(&pr->ps_klist, kn); 552 mtx_leave(&pr->ps_mtx); 553 rw_exit_write(&kqueue_ps_list_lock); 554 } 555 556 int 557 filt_signal(struct knote *kn, long hint) 558 { 559 if (hint & NOTE_SIGNAL) { 560 hint &= ~NOTE_SIGNAL; 561 562 if (kn->kn_id == hint) 563 kn->kn_data++; 564 } 565 return (kn->kn_data != 0); 566 } 567 568 #define NOTE_TIMER_UNITMASK \ 569 (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS|NOTE_NSECONDS) 570 571 static int 572 filt_timervalidate(int sfflags, int64_t sdata, struct timespec *ts) 573 { 574 if (sfflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) 575 return (EINVAL); 576 577 switch (sfflags & NOTE_TIMER_UNITMASK) { 578 case NOTE_SECONDS: 579 ts->tv_sec = sdata; 580 ts->tv_nsec = 0; 581 break; 582 case NOTE_MSECONDS: 583 ts->tv_sec = sdata / 1000; 584 ts->tv_nsec = (sdata % 1000) * 1000000; 585 break; 586 case NOTE_USECONDS: 587 ts->tv_sec = sdata / 1000000; 588 ts->tv_nsec = (sdata % 1000000) * 1000; 589 break; 590 case NOTE_NSECONDS: 591 ts->tv_sec = sdata / 1000000000; 592 ts->tv_nsec = sdata % 1000000000; 593 break; 594 default: 595 return (EINVAL); 596 } 597 598 return (0); 599 } 600 601 static void 602 filt_timeradd(struct knote *kn, struct timespec *ts) 603 { 604 struct timespec expiry, now; 605 struct timeout *to = kn->kn_hook; 606 int tticks; 607 608 if (kn->kn_sfflags & NOTE_ABSTIME) { 609 nanotime(&now); 610 if (timespeccmp(ts, &now, >)) { 611 timespecsub(ts, &now, &expiry); 612 /* XXX timeout_abs_ts with CLOCK_REALTIME */ 613 timeout_add(to, tstohz(&expiry)); 614 } else { 615 /* Expire immediately. */ 616 filt_timerexpire(kn); 617 } 618 return; 619 } 620 621 tticks = tstohz(ts); 622 /* Remove extra tick from tstohz() if timeout has fired before. */ 623 if (timeout_triggered(to)) 624 tticks--; 625 timeout_add(to, (tticks > 0) ? 
tticks : 1); 626 } 627 628 void 629 filt_timerexpire(void *knx) 630 { 631 struct timespec ts; 632 struct knote *kn = knx; 633 struct kqueue *kq = kn->kn_kq; 634 635 kn->kn_data++; 636 mtx_enter(&kq->kq_lock); 637 knote_activate(kn); 638 mtx_leave(&kq->kq_lock); 639 640 if ((kn->kn_flags & EV_ONESHOT) == 0 && 641 (kn->kn_sfflags & NOTE_ABSTIME) == 0) { 642 (void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); 643 filt_timeradd(kn, &ts); 644 } 645 } 646 647 /* 648 * data contains amount of time to sleep 649 */ 650 int 651 filt_timerattach(struct knote *kn) 652 { 653 struct timespec ts; 654 struct timeout *to; 655 int error; 656 657 error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); 658 if (error != 0) 659 return (error); 660 661 if (kq_ntimeouts > kq_timeoutmax) 662 return (ENOMEM); 663 kq_ntimeouts++; 664 665 if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) 666 kn->kn_flags |= EV_CLEAR; /* automatically set */ 667 to = malloc(sizeof(*to), M_KEVENT, M_WAITOK); 668 timeout_set(to, filt_timerexpire, kn); 669 kn->kn_hook = to; 670 filt_timeradd(kn, &ts); 671 672 return (0); 673 } 674 675 void 676 filt_timerdetach(struct knote *kn) 677 { 678 struct timeout *to; 679 680 to = (struct timeout *)kn->kn_hook; 681 timeout_del_barrier(to); 682 free(to, M_KEVENT, sizeof(*to)); 683 kq_ntimeouts--; 684 } 685 686 int 687 filt_timermodify(struct kevent *kev, struct knote *kn) 688 { 689 struct timespec ts; 690 struct kqueue *kq = kn->kn_kq; 691 struct timeout *to = kn->kn_hook; 692 int error; 693 694 error = filt_timervalidate(kev->fflags, kev->data, &ts); 695 if (error != 0) { 696 kev->flags |= EV_ERROR; 697 kev->data = error; 698 return (0); 699 } 700 701 /* Reset the timer. Any pending events are discarded. */ 702 703 timeout_del_barrier(to); 704 705 mtx_enter(&kq->kq_lock); 706 if (kn->kn_status & KN_QUEUED) 707 knote_dequeue(kn); 708 kn->kn_status &= ~KN_ACTIVE; 709 mtx_leave(&kq->kq_lock); 710 711 kn->kn_data = 0; 712 knote_assign(kev, kn); 713 /* Reinit timeout to invoke tick adjustment again. */ 714 timeout_set(to, filt_timerexpire, kn); 715 filt_timeradd(kn, &ts); 716 717 return (0); 718 } 719 720 int 721 filt_timerprocess(struct knote *kn, struct kevent *kev) 722 { 723 int active, s; 724 725 s = splsoftclock(); 726 active = (kn->kn_data != 0); 727 if (active) 728 knote_submit(kn, kev); 729 splx(s); 730 731 return (active); 732 } 733 734 735 /* 736 * filt_seltrue: 737 * 738 * This filter "event" routine simulates seltrue(). 739 */ 740 int 741 filt_seltrue(struct knote *kn, long hint) 742 { 743 744 /* 745 * We don't know how much data can be read/written, 746 * but we know that it *can* be. This is about as 747 * good as select/poll does as well. 748 */ 749 kn->kn_data = 0; 750 return (1); 751 } 752 753 int 754 filt_seltruemodify(struct kevent *kev, struct knote *kn) 755 { 756 knote_assign(kev, kn); 757 return (kn->kn_fop->f_event(kn, 0)); 758 } 759 760 int 761 filt_seltrueprocess(struct knote *kn, struct kevent *kev) 762 { 763 int active; 764 765 active = kn->kn_fop->f_event(kn, 0); 766 if (active) 767 knote_submit(kn, kev); 768 return (active); 769 } 770 771 /* 772 * This provides full kqfilter entry for device switch tables, which 773 * has same effect as filter using filt_seltrue() as filter method. 
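* seltrue_kqfilter() below accepts EVFILT_READ and EVFILT_WRITE and always reports the descriptor as ready, mirroring seltrue().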
774 */ 775 void 776 filt_seltruedetach(struct knote *kn) 777 { 778 /* Nothing to do */ 779 } 780 781 const struct filterops seltrue_filtops = { 782 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 783 .f_attach = NULL, 784 .f_detach = filt_seltruedetach, 785 .f_event = filt_seltrue, 786 .f_modify = filt_seltruemodify, 787 .f_process = filt_seltrueprocess, 788 }; 789 790 int 791 seltrue_kqfilter(dev_t dev, struct knote *kn) 792 { 793 switch (kn->kn_filter) { 794 case EVFILT_READ: 795 case EVFILT_WRITE: 796 kn->kn_fop = &seltrue_filtops; 797 break; 798 default: 799 return (EINVAL); 800 } 801 802 /* Nothing more to do */ 803 return (0); 804 } 805 806 static int 807 filt_dead(struct knote *kn, long hint) 808 { 809 if (kn->kn_filter == EVFILT_EXCEPT) { 810 /* 811 * Do not deliver event because there is no out-of-band data. 812 * However, let HUP condition pass for poll(2). 813 */ 814 if ((kn->kn_flags & __EV_POLL) == 0) { 815 kn->kn_flags |= EV_DISABLE; 816 return (0); 817 } 818 } 819 820 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 821 if (kn->kn_flags & __EV_POLL) 822 kn->kn_flags |= __EV_HUP; 823 kn->kn_data = 0; 824 return (1); 825 } 826 827 static void 828 filt_deaddetach(struct knote *kn) 829 { 830 /* Nothing to do */ 831 } 832 833 const struct filterops dead_filtops = { 834 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 835 .f_attach = NULL, 836 .f_detach = filt_deaddetach, 837 .f_event = filt_dead, 838 .f_modify = filt_seltruemodify, 839 .f_process = filt_seltrueprocess, 840 }; 841 842 static int 843 filt_badfd(struct knote *kn, long hint) 844 { 845 kn->kn_flags |= (EV_ERROR | EV_ONESHOT); 846 kn->kn_data = EBADF; 847 return (1); 848 } 849 850 /* For use with kqpoll. */ 851 const struct filterops badfd_filtops = { 852 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 853 .f_attach = NULL, 854 .f_detach = filt_deaddetach, 855 .f_event = filt_badfd, 856 .f_modify = filt_seltruemodify, 857 .f_process = filt_seltrueprocess, 858 }; 859 860 static int 861 filter_attach(struct knote *kn) 862 { 863 int error; 864 865 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 866 error = kn->kn_fop->f_attach(kn); 867 } else { 868 KERNEL_LOCK(); 869 error = kn->kn_fop->f_attach(kn); 870 KERNEL_UNLOCK(); 871 } 872 return (error); 873 } 874 875 static void 876 filter_detach(struct knote *kn) 877 { 878 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 879 kn->kn_fop->f_detach(kn); 880 } else { 881 KERNEL_LOCK(); 882 kn->kn_fop->f_detach(kn); 883 KERNEL_UNLOCK(); 884 } 885 } 886 887 static int 888 filter_event(struct knote *kn, long hint) 889 { 890 if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) 891 KERNEL_ASSERT_LOCKED(); 892 893 return (kn->kn_fop->f_event(kn, hint)); 894 } 895 896 static int 897 filter_modify(struct kevent *kev, struct knote *kn) 898 { 899 int active, s; 900 901 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 902 active = kn->kn_fop->f_modify(kev, kn); 903 } else { 904 KERNEL_LOCK(); 905 if (kn->kn_fop->f_modify != NULL) { 906 active = kn->kn_fop->f_modify(kev, kn); 907 } else { 908 s = splhigh(); 909 active = knote_modify(kev, kn); 910 splx(s); 911 } 912 KERNEL_UNLOCK(); 913 } 914 return (active); 915 } 916 917 static int 918 filter_process(struct knote *kn, struct kevent *kev) 919 { 920 int active, s; 921 922 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 923 active = kn->kn_fop->f_process(kn, kev); 924 } else { 925 KERNEL_LOCK(); 926 if (kn->kn_fop->f_process != NULL) { 927 active = kn->kn_fop->f_process(kn, kev); 928 } else { 929 s = splhigh(); 930 active = knote_process(kn, kev); 931 splx(s); 932 } 933 
KERNEL_UNLOCK(); 934 } 935 return (active); 936 } 937 938 /* 939 * Initialize the current thread for poll/select system call. 940 * num indicates the number of serials that the system call may utilize. 941 * After this function, the valid range of serials is 942 * p_kq_serial <= x < p_kq_serial + num. 943 */ 944 void 945 kqpoll_init(unsigned int num) 946 { 947 struct proc *p = curproc; 948 struct filedesc *fdp; 949 950 if (p->p_kq == NULL) { 951 p->p_kq = kqueue_alloc(p->p_fd); 952 p->p_kq_serial = arc4random(); 953 fdp = p->p_fd; 954 fdplock(fdp); 955 LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next); 956 fdpunlock(fdp); 957 } 958 959 if (p->p_kq_serial + num < p->p_kq_serial) { 960 /* Serial is about to wrap. Clear all attached knotes. */ 961 kqueue_purge(p, p->p_kq); 962 p->p_kq_serial = 0; 963 } 964 } 965 966 /* 967 * Finish poll/select system call. 968 * num must have the same value that was used with kqpoll_init(). 969 */ 970 void 971 kqpoll_done(unsigned int num) 972 { 973 struct proc *p = curproc; 974 struct kqueue *kq = p->p_kq; 975 976 KASSERT(p->p_kq != NULL); 977 KASSERT(p->p_kq_serial + num >= p->p_kq_serial); 978 979 p->p_kq_serial += num; 980 981 /* 982 * Because of kn_pollid key, a thread can in principle allocate 983 * up to O(maxfiles^2) knotes by calling poll(2) repeatedly 984 * with suitably varying pollfd arrays. 985 * Prevent such a large allocation by clearing knotes eagerly 986 * if there are too many of them. 987 * 988 * A small multiple of kq_knlistsize should give enough margin 989 * that eager clearing is infrequent, or does not happen at all, 990 * with normal programs. 991 * A single pollfd entry can use up to three knotes. 992 * Typically there is no significant overlap of fd and events 993 * between different entries in the pollfd array. 994 */ 995 if (kq->kq_nknotes > 4 * kq->kq_knlistsize) 996 kqueue_purge(p, kq); 997 } 998 999 void 1000 kqpoll_exit(void) 1001 { 1002 struct proc *p = curproc; 1003 1004 if (p->p_kq == NULL) 1005 return; 1006 1007 kqueue_purge(p, p->p_kq); 1008 kqueue_terminate(p, p->p_kq); 1009 KASSERT(p->p_kq->kq_refcnt.r_refs == 1); 1010 KQRELE(p->p_kq); 1011 p->p_kq = NULL; 1012 } 1013 1014 struct kqueue * 1015 kqueue_alloc(struct filedesc *fdp) 1016 { 1017 struct kqueue *kq; 1018 1019 kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO); 1020 refcnt_init(&kq->kq_refcnt); 1021 kq->kq_fdp = fdp; 1022 TAILQ_INIT(&kq->kq_head); 1023 mtx_init(&kq->kq_lock, IPL_HIGH); 1024 task_set(&kq->kq_task, kqueue_task, kq); 1025 klist_init_mutex(&kq->kq_klist, &kqueue_klist_lock); 1026 1027 return (kq); 1028 } 1029 1030 int 1031 dokqueue(struct proc *p, int flags, register_t *retval) 1032 { 1033 struct filedesc *fdp = p->p_fd; 1034 struct kqueue *kq; 1035 struct file *fp; 1036 int cloexec, error, fd; 1037 1038 cloexec = (flags & O_CLOEXEC) ? 
UF_EXCLOSE : 0; 1039 1040 kq = kqueue_alloc(fdp); 1041 1042 fdplock(fdp); 1043 error = falloc(p, &fp, &fd); 1044 if (error) 1045 goto out; 1046 fp->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); 1047 fp->f_type = DTYPE_KQUEUE; 1048 fp->f_ops = &kqueueops; 1049 fp->f_data = kq; 1050 *retval = fd; 1051 LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next); 1052 kq = NULL; 1053 fdinsert(fdp, fd, cloexec, fp); 1054 FRELE(fp, p); 1055 out: 1056 fdpunlock(fdp); 1057 if (kq != NULL) 1058 pool_put(&kqueue_pool, kq); 1059 return (error); 1060 } 1061 1062 int 1063 sys_kqueue(struct proc *p, void *v, register_t *retval) 1064 { 1065 return (dokqueue(p, 0, retval)); 1066 } 1067 1068 int 1069 sys_kqueue1(struct proc *p, void *v, register_t *retval) 1070 { 1071 struct sys_kqueue1_args /* { 1072 syscallarg(int) flags; 1073 } */ *uap = v; 1074 1075 if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK)) 1076 return (EINVAL); 1077 return (dokqueue(p, SCARG(uap, flags), retval)); 1078 } 1079 1080 int 1081 sys_kevent(struct proc *p, void *v, register_t *retval) 1082 { 1083 struct kqueue_scan_state scan; 1084 struct filedesc* fdp = p->p_fd; 1085 struct sys_kevent_args /* { 1086 syscallarg(int) fd; 1087 syscallarg(const struct kevent *) changelist; 1088 syscallarg(int) nchanges; 1089 syscallarg(struct kevent *) eventlist; 1090 syscallarg(int) nevents; 1091 syscallarg(const struct timespec *) timeout; 1092 } */ *uap = v; 1093 struct kevent *kevp; 1094 struct kqueue *kq; 1095 struct file *fp; 1096 struct timespec ts; 1097 struct timespec *tsp = NULL; 1098 int i, n, nerrors, error; 1099 int ready, total; 1100 struct kevent kev[KQ_NEVENTS]; 1101 1102 if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL) 1103 return (EBADF); 1104 1105 if (fp->f_type != DTYPE_KQUEUE) { 1106 error = EBADF; 1107 goto done; 1108 } 1109 1110 if (SCARG(uap, timeout) != NULL) { 1111 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1112 if (error) 1113 goto done; 1114 #ifdef KTRACE 1115 if (KTRPOINT(p, KTR_STRUCT)) 1116 ktrreltimespec(p, &ts); 1117 #endif 1118 if (ts.tv_sec < 0 || !timespecisvalid(&ts)) { 1119 error = EINVAL; 1120 goto done; 1121 } 1122 tsp = &ts; 1123 } 1124 1125 kq = fp->f_data; 1126 nerrors = 0; 1127 1128 while ((n = SCARG(uap, nchanges)) > 0) { 1129 if (n > nitems(kev)) 1130 n = nitems(kev); 1131 error = copyin(SCARG(uap, changelist), kev, 1132 n * sizeof(struct kevent)); 1133 if (error) 1134 goto done; 1135 #ifdef KTRACE 1136 if (KTRPOINT(p, KTR_STRUCT)) 1137 ktrevent(p, kev, n); 1138 #endif 1139 for (i = 0; i < n; i++) { 1140 kevp = &kev[i]; 1141 kevp->flags &= ~EV_SYSFLAGS; 1142 error = kqueue_register(kq, kevp, 0, p); 1143 if (error || (kevp->flags & EV_RECEIPT)) { 1144 if (SCARG(uap, nevents) != 0) { 1145 kevp->flags = EV_ERROR; 1146 kevp->data = error; 1147 copyout(kevp, SCARG(uap, eventlist), 1148 sizeof(*kevp)); 1149 SCARG(uap, eventlist)++; 1150 SCARG(uap, nevents)--; 1151 nerrors++; 1152 } else { 1153 goto done; 1154 } 1155 } 1156 } 1157 SCARG(uap, nchanges) -= n; 1158 SCARG(uap, changelist) += n; 1159 } 1160 if (nerrors) { 1161 *retval = nerrors; 1162 error = 0; 1163 goto done; 1164 } 1165 1166 kqueue_scan_setup(&scan, kq); 1167 FRELE(fp, p); 1168 /* 1169 * Collect as many events as we can. The timeout on successive 1170 * loops is disabled (kqueue_scan() becomes non-blocking). 
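* The loop below stops once kqueue_scan() returns no more ready events, the user's eventlist fills up, or an error occurs.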
1171 */ 1172 total = 0; 1173 error = 0; 1174 while ((n = SCARG(uap, nevents) - total) > 0) { 1175 if (n > nitems(kev)) 1176 n = nitems(kev); 1177 ready = kqueue_scan(&scan, n, kev, tsp, p, &error); 1178 if (ready == 0) 1179 break; 1180 error = copyout(kev, SCARG(uap, eventlist) + total, 1181 sizeof(struct kevent) * ready); 1182 #ifdef KTRACE 1183 if (KTRPOINT(p, KTR_STRUCT)) 1184 ktrevent(p, kev, ready); 1185 #endif 1186 total += ready; 1187 if (error || ready < n) 1188 break; 1189 } 1190 kqueue_scan_finish(&scan); 1191 *retval = total; 1192 return (error); 1193 1194 done: 1195 FRELE(fp, p); 1196 return (error); 1197 } 1198 1199 #ifdef KQUEUE_DEBUG 1200 void 1201 kqueue_do_check(struct kqueue *kq, const char *func, int line) 1202 { 1203 struct knote *kn; 1204 int count = 0, nmarker = 0; 1205 1206 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1207 1208 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { 1209 if (kn->kn_filter == EVFILT_MARKER) { 1210 if ((kn->kn_status & KN_QUEUED) != 0) 1211 panic("%s:%d: kq=%p kn=%p marker QUEUED", 1212 func, line, kq, kn); 1213 nmarker++; 1214 } else { 1215 if ((kn->kn_status & KN_ACTIVE) == 0) 1216 panic("%s:%d: kq=%p kn=%p knote !ACTIVE", 1217 func, line, kq, kn); 1218 if ((kn->kn_status & KN_QUEUED) == 0) 1219 panic("%s:%d: kq=%p kn=%p knote !QUEUED", 1220 func, line, kq, kn); 1221 if (kn->kn_kq != kq) 1222 panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq", 1223 func, line, kq, kn, kn->kn_kq); 1224 count++; 1225 if (count > kq->kq_count) 1226 goto bad; 1227 } 1228 } 1229 if (count != kq->kq_count) { 1230 bad: 1231 panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d", 1232 func, line, kq, kq->kq_count, count, nmarker); 1233 } 1234 } 1235 #endif 1236 1237 int 1238 kqueue_register(struct kqueue *kq, struct kevent *kev, unsigned int pollid, 1239 struct proc *p) 1240 { 1241 struct filedesc *fdp = kq->kq_fdp; 1242 const struct filterops *fops = NULL; 1243 struct file *fp = NULL; 1244 struct knote *kn = NULL, *newkn = NULL; 1245 struct knlist *list = NULL; 1246 int active, error = 0; 1247 1248 KASSERT(pollid == 0 || (p != NULL && p->p_kq == kq)); 1249 1250 if (kev->filter < 0) { 1251 if (kev->filter + EVFILT_SYSCOUNT < 0) 1252 return (EINVAL); 1253 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ 1254 } 1255 1256 if (fops == NULL) { 1257 /* 1258 * XXX 1259 * filter attach routine is responsible for ensuring that 1260 * the identifier can be attached to it. 1261 */ 1262 return (EINVAL); 1263 } 1264 1265 if (fops->f_flags & FILTEROP_ISFD) { 1266 /* validate descriptor */ 1267 if (kev->ident > INT_MAX) 1268 return (EBADF); 1269 } 1270 1271 if (kev->flags & EV_ADD) 1272 newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO); 1273 1274 again: 1275 if (fops->f_flags & FILTEROP_ISFD) { 1276 if ((fp = fd_getfile(fdp, kev->ident)) == NULL) { 1277 error = EBADF; 1278 goto done; 1279 } 1280 mtx_enter(&kq->kq_lock); 1281 if (kev->flags & EV_ADD) 1282 kqueue_expand_list(kq, kev->ident); 1283 if (kev->ident < kq->kq_knlistsize) 1284 list = &kq->kq_knlist[kev->ident]; 1285 } else { 1286 mtx_enter(&kq->kq_lock); 1287 if (kev->flags & EV_ADD) 1288 kqueue_expand_hash(kq); 1289 if (kq->kq_knhashmask != 0) { 1290 list = &kq->kq_knhash[ 1291 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1292 } 1293 } 1294 if (list != NULL) { 1295 SLIST_FOREACH(kn, list, kn_link) { 1296 if (kev->filter == kn->kn_filter && 1297 kev->ident == kn->kn_id && 1298 pollid == kn->kn_pollid) { 1299 if (!knote_acquire(kn, NULL, 0)) { 1300 /* knote_acquire() has released 1301 * kq_lock. 
*/ 1302 if (fp != NULL) { 1303 FRELE(fp, p); 1304 fp = NULL; 1305 } 1306 goto again; 1307 } 1308 break; 1309 } 1310 } 1311 } 1312 KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0); 1313 1314 if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { 1315 mtx_leave(&kq->kq_lock); 1316 error = ENOENT; 1317 goto done; 1318 } 1319 1320 /* 1321 * kn now contains the matching knote, or NULL if no match. 1322 */ 1323 if (kev->flags & EV_ADD) { 1324 if (kn == NULL) { 1325 kn = newkn; 1326 newkn = NULL; 1327 kn->kn_status = KN_PROCESSING; 1328 kn->kn_fp = fp; 1329 kn->kn_kq = kq; 1330 kn->kn_fop = fops; 1331 1332 /* 1333 * apply reference count to knote structure, and 1334 * do not release it at the end of this routine. 1335 */ 1336 fp = NULL; 1337 1338 kn->kn_sfflags = kev->fflags; 1339 kn->kn_sdata = kev->data; 1340 kev->fflags = 0; 1341 kev->data = 0; 1342 kn->kn_kevent = *kev; 1343 kn->kn_pollid = pollid; 1344 1345 knote_attach(kn); 1346 mtx_leave(&kq->kq_lock); 1347 1348 error = filter_attach(kn); 1349 if (error != 0) { 1350 knote_drop(kn, p); 1351 goto done; 1352 } 1353 1354 /* 1355 * If this is a file descriptor filter, check if 1356 * fd was closed while the knote was being added. 1357 * knote_fdclose() has missed kn if the function 1358 * ran before kn appeared in kq_knlist. 1359 */ 1360 if ((fops->f_flags & FILTEROP_ISFD) && 1361 fd_checkclosed(fdp, kev->ident, kn->kn_fp)) { 1362 /* 1363 * Drop the knote silently without error 1364 * because another thread might already have 1365 * seen it. This corresponds to the insert 1366 * happening in full before the close. 1367 */ 1368 filter_detach(kn); 1369 knote_drop(kn, p); 1370 goto done; 1371 } 1372 1373 /* Check if there is a pending event. */ 1374 active = filter_process(kn, NULL); 1375 mtx_enter(&kq->kq_lock); 1376 if (active) 1377 knote_activate(kn); 1378 } else if (kn->kn_fop == &badfd_filtops) { 1379 /* 1380 * Nothing expects this badfd knote any longer. 1381 * Drop it to make room for the new knote and retry. 1382 */ 1383 KASSERT(kq == p->p_kq); 1384 mtx_leave(&kq->kq_lock); 1385 filter_detach(kn); 1386 knote_drop(kn, p); 1387 1388 KASSERT(fp != NULL); 1389 FRELE(fp, p); 1390 fp = NULL; 1391 1392 goto again; 1393 } else { 1394 /* 1395 * The user may change some filter values after the 1396 * initial EV_ADD, but doing so will not reset any 1397 * filters which have already been triggered. 1398 */ 1399 mtx_leave(&kq->kq_lock); 1400 active = filter_modify(kev, kn); 1401 mtx_enter(&kq->kq_lock); 1402 if (active) 1403 knote_activate(kn); 1404 if (kev->flags & EV_ERROR) { 1405 error = kev->data; 1406 goto release; 1407 } 1408 } 1409 } else if (kev->flags & EV_DELETE) { 1410 mtx_leave(&kq->kq_lock); 1411 filter_detach(kn); 1412 knote_drop(kn, p); 1413 goto done; 1414 } 1415 1416 if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) 1417 kn->kn_status |= KN_DISABLED; 1418 1419 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { 1420 kn->kn_status &= ~KN_DISABLED; 1421 mtx_leave(&kq->kq_lock); 1422 /* Check if there is a pending event. 
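filter_process() runs without kq_lock; the knote's KN_PROCESSING state keeps it from going away in the meantime.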
*/ 1423 active = filter_process(kn, NULL); 1424 mtx_enter(&kq->kq_lock); 1425 if (active) 1426 knote_activate(kn); 1427 } 1428 1429 release: 1430 knote_release(kn); 1431 mtx_leave(&kq->kq_lock); 1432 done: 1433 if (fp != NULL) 1434 FRELE(fp, p); 1435 if (newkn != NULL) 1436 pool_put(&knote_pool, newkn); 1437 return (error); 1438 } 1439 1440 int 1441 kqueue_sleep(struct kqueue *kq, struct timespec *tsp) 1442 { 1443 struct timespec elapsed, start, stop; 1444 uint64_t nsecs; 1445 int error; 1446 1447 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1448 1449 if (tsp != NULL) { 1450 getnanouptime(&start); 1451 nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP); 1452 } else 1453 nsecs = INFSLP; 1454 error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK, 1455 "kqread", nsecs); 1456 if (tsp != NULL) { 1457 getnanouptime(&stop); 1458 timespecsub(&stop, &start, &elapsed); 1459 timespecsub(tsp, &elapsed, tsp); 1460 if (tsp->tv_sec < 0) 1461 timespecclear(tsp); 1462 } 1463 1464 return (error); 1465 } 1466 1467 /* 1468 * Scan the kqueue, blocking if necessary until the target time is reached. 1469 * If tsp is NULL we block indefinitely. If tsp->ts_secs/nsecs are both 1470 * 0 we do not block at all. 1471 */ 1472 int 1473 kqueue_scan(struct kqueue_scan_state *scan, int maxevents, 1474 struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp) 1475 { 1476 struct kqueue *kq = scan->kqs_kq; 1477 struct knote *kn; 1478 int error = 0, nkev = 0; 1479 int reinserted; 1480 1481 if (maxevents == 0) 1482 goto done; 1483 retry: 1484 KASSERT(nkev == 0); 1485 1486 error = 0; 1487 reinserted = 0; 1488 1489 mtx_enter(&kq->kq_lock); 1490 1491 if (kq->kq_state & KQ_DYING) { 1492 mtx_leave(&kq->kq_lock); 1493 error = EBADF; 1494 goto done; 1495 } 1496 1497 if (kq->kq_count == 0) { 1498 /* 1499 * Successive loops are only necessary if there are more 1500 * ready events to gather, so they don't need to block. 1501 */ 1502 if ((tsp != NULL && !timespecisset(tsp)) || 1503 scan->kqs_nevent != 0) { 1504 mtx_leave(&kq->kq_lock); 1505 error = 0; 1506 goto done; 1507 } 1508 kq->kq_state |= KQ_SLEEP; 1509 error = kqueue_sleep(kq, tsp); 1510 /* kqueue_sleep() has released kq_lock. */ 1511 if (error == 0 || error == EWOULDBLOCK) 1512 goto retry; 1513 /* don't restart after signals... */ 1514 if (error == ERESTART) 1515 error = EINTR; 1516 goto done; 1517 } 1518 1519 /* 1520 * Put the end marker in the queue to limit the scan to the events 1521 * that are currently active. This prevents events from being 1522 * recollected if they reactivate during scan. 1523 * 1524 * If a partial scan has been performed already but no events have 1525 * been collected, reposition the end marker to make any new events 1526 * reachable. 1527 */ 1528 if (!scan->kqs_queued) { 1529 TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe); 1530 scan->kqs_queued = 1; 1531 } else if (scan->kqs_nevent == 0) { 1532 TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe); 1533 TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe); 1534 } 1535 1536 TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe); 1537 while (nkev < maxevents) { 1538 kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe); 1539 if (kn->kn_filter == EVFILT_MARKER) { 1540 if (kn == &scan->kqs_end) 1541 break; 1542 1543 /* Move start marker past another thread's marker. 
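Markers carry EVFILT_MARKER, so concurrent scans step over each other's markers without treating them as events.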
*/ 1544 TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe); 1545 TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start, 1546 kn_tqe); 1547 continue; 1548 } 1549 1550 if (!knote_acquire(kn, NULL, 0)) { 1551 /* knote_acquire() has released kq_lock. */ 1552 mtx_enter(&kq->kq_lock); 1553 continue; 1554 } 1555 1556 kqueue_check(kq); 1557 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1558 kn->kn_status &= ~KN_QUEUED; 1559 kq->kq_count--; 1560 kqueue_check(kq); 1561 1562 if (kn->kn_status & KN_DISABLED) { 1563 knote_release(kn); 1564 continue; 1565 } 1566 1567 mtx_leave(&kq->kq_lock); 1568 1569 /* Drop expired kqpoll knotes. */ 1570 if (p->p_kq == kq && 1571 p->p_kq_serial > (unsigned long)kn->kn_udata) { 1572 filter_detach(kn); 1573 knote_drop(kn, p); 1574 mtx_enter(&kq->kq_lock); 1575 continue; 1576 } 1577 1578 /* 1579 * Invalidate knotes whose vnodes have been revoked. 1580 * This is a workaround; it is tricky to clear existing 1581 * knotes and prevent new ones from being registered 1582 * with the current revocation mechanism. 1583 */ 1584 if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && 1585 kn->kn_fp != NULL && 1586 kn->kn_fp->f_type == DTYPE_VNODE) { 1587 struct vnode *vp = kn->kn_fp->f_data; 1588 1589 if (__predict_false(vp->v_op == &dead_vops && 1590 kn->kn_fop != &dead_filtops)) { 1591 filter_detach(kn); 1592 kn->kn_fop = &dead_filtops; 1593 1594 /* 1595 * Check if the event should be delivered. 1596 * Use f_event directly because this is 1597 * a special situation. 1598 */ 1599 if (kn->kn_fop->f_event(kn, 0) == 0) { 1600 filter_detach(kn); 1601 knote_drop(kn, p); 1602 mtx_enter(&kq->kq_lock); 1603 continue; 1604 } 1605 } 1606 } 1607 1608 memset(kevp, 0, sizeof(*kevp)); 1609 if (filter_process(kn, kevp) == 0) { 1610 mtx_enter(&kq->kq_lock); 1611 if ((kn->kn_status & KN_QUEUED) == 0) 1612 kn->kn_status &= ~KN_ACTIVE; 1613 knote_release(kn); 1614 kqueue_check(kq); 1615 continue; 1616 } 1617 1618 /* 1619 * Post-event action on the note 1620 */ 1621 if (kevp->flags & EV_ONESHOT) { 1622 filter_detach(kn); 1623 knote_drop(kn, p); 1624 mtx_enter(&kq->kq_lock); 1625 } else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) { 1626 mtx_enter(&kq->kq_lock); 1627 if (kevp->flags & EV_DISPATCH) 1628 kn->kn_status |= KN_DISABLED; 1629 if ((kn->kn_status & KN_QUEUED) == 0) 1630 kn->kn_status &= ~KN_ACTIVE; 1631 knote_release(kn); 1632 } else { 1633 mtx_enter(&kq->kq_lock); 1634 if ((kn->kn_status & KN_QUEUED) == 0) { 1635 kqueue_check(kq); 1636 kq->kq_count++; 1637 kn->kn_status |= KN_QUEUED; 1638 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1639 /* Wakeup is done after loop. 
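A single kqueue_wakeup() after the scan covers every knote requeued here.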
*/ 1640 reinserted = 1; 1641 } 1642 knote_release(kn); 1643 } 1644 kqueue_check(kq); 1645 1646 kevp++; 1647 nkev++; 1648 scan->kqs_nevent++; 1649 } 1650 TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe); 1651 if (reinserted && kq->kq_count != 0) 1652 kqueue_wakeup(kq); 1653 mtx_leave(&kq->kq_lock); 1654 if (scan->kqs_nevent == 0) 1655 goto retry; 1656 done: 1657 *errorp = error; 1658 return (nkev); 1659 } 1660 1661 void 1662 kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq) 1663 { 1664 memset(scan, 0, sizeof(*scan)); 1665 1666 KQREF(kq); 1667 scan->kqs_kq = kq; 1668 scan->kqs_start.kn_filter = EVFILT_MARKER; 1669 scan->kqs_start.kn_status = KN_PROCESSING; 1670 scan->kqs_end.kn_filter = EVFILT_MARKER; 1671 scan->kqs_end.kn_status = KN_PROCESSING; 1672 } 1673 1674 void 1675 kqueue_scan_finish(struct kqueue_scan_state *scan) 1676 { 1677 struct kqueue *kq = scan->kqs_kq; 1678 1679 KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER); 1680 KASSERT(scan->kqs_start.kn_status == KN_PROCESSING); 1681 KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER); 1682 KASSERT(scan->kqs_end.kn_status == KN_PROCESSING); 1683 1684 if (scan->kqs_queued) { 1685 scan->kqs_queued = 0; 1686 mtx_enter(&kq->kq_lock); 1687 TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe); 1688 mtx_leave(&kq->kq_lock); 1689 } 1690 KQRELE(kq); 1691 } 1692 1693 /* 1694 * XXX 1695 * This could be expanded to call kqueue_scan, if desired. 1696 */ 1697 int 1698 kqueue_read(struct file *fp, struct uio *uio, int fflags) 1699 { 1700 return (ENXIO); 1701 } 1702 1703 int 1704 kqueue_write(struct file *fp, struct uio *uio, int fflags) 1705 { 1706 return (ENXIO); 1707 } 1708 1709 int 1710 kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) 1711 { 1712 return (ENOTTY); 1713 } 1714 1715 int 1716 kqueue_stat(struct file *fp, struct stat *st, struct proc *p) 1717 { 1718 struct kqueue *kq = fp->f_data; 1719 1720 memset(st, 0, sizeof(*st)); 1721 st->st_size = kq->kq_count; /* unlocked read */ 1722 st->st_blksize = sizeof(struct kevent); 1723 st->st_mode = S_IFIFO; 1724 return (0); 1725 } 1726 1727 void 1728 kqueue_purge(struct proc *p, struct kqueue *kq) 1729 { 1730 int i; 1731 1732 mtx_enter(&kq->kq_lock); 1733 for (i = 0; i < kq->kq_knlistsize; i++) 1734 knote_remove(p, kq, &kq->kq_knlist, i, 1); 1735 if (kq->kq_knhashmask != 0) { 1736 for (i = 0; i < kq->kq_knhashmask + 1; i++) 1737 knote_remove(p, kq, &kq->kq_knhash, i, 1); 1738 } 1739 mtx_leave(&kq->kq_lock); 1740 } 1741 1742 void 1743 kqueue_terminate(struct proc *p, struct kqueue *kq) 1744 { 1745 struct knote *kn; 1746 int state; 1747 1748 mtx_enter(&kq->kq_lock); 1749 1750 /* 1751 * Any remaining entries should be scan markers. 1752 * They are removed when the ongoing scans finish. 1753 */ 1754 KASSERT(kq->kq_count == 0); 1755 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) 1756 KASSERT(kn->kn_filter == EVFILT_MARKER); 1757 1758 kq->kq_state |= KQ_DYING; 1759 state = kq->kq_state; 1760 kqueue_wakeup(kq); 1761 mtx_leave(&kq->kq_lock); 1762 1763 /* 1764 * Any knotes that were attached to this kqueue were deleted 1765 * by knote_fdclose() when this kqueue's file descriptor was closed. 
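* What remains is to make sure the deferred wakeup task is gone; the kq_klist of other kqueues watching this one must already be empty.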
1766 */ 1767 KASSERT(klist_empty(&kq->kq_klist)); 1768 if (state & KQ_TASK) 1769 taskq_del_barrier(systqmp, &kq->kq_task); 1770 } 1771 1772 int 1773 kqueue_close(struct file *fp, struct proc *p) 1774 { 1775 struct kqueue *kq = fp->f_data; 1776 1777 fp->f_data = NULL; 1778 1779 kqueue_purge(p, kq); 1780 kqueue_terminate(p, kq); 1781 1782 KQRELE(kq); 1783 1784 return (0); 1785 } 1786 1787 static void 1788 kqueue_task(void *arg) 1789 { 1790 struct kqueue *kq = arg; 1791 1792 knote(&kq->kq_klist, 0); 1793 } 1794 1795 void 1796 kqueue_wakeup(struct kqueue *kq) 1797 { 1798 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1799 1800 if (kq->kq_state & KQ_SLEEP) { 1801 kq->kq_state &= ~KQ_SLEEP; 1802 wakeup(kq); 1803 } 1804 if (!klist_empty(&kq->kq_klist)) { 1805 /* Defer activation to avoid recursion. */ 1806 kq->kq_state |= KQ_TASK; 1807 task_add(systqmp, &kq->kq_task); 1808 } 1809 } 1810 1811 static void 1812 kqueue_expand_hash(struct kqueue *kq) 1813 { 1814 struct knlist *hash; 1815 u_long hashmask; 1816 1817 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1818 1819 if (kq->kq_knhashmask == 0) { 1820 mtx_leave(&kq->kq_lock); 1821 hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask); 1822 mtx_enter(&kq->kq_lock); 1823 if (kq->kq_knhashmask == 0) { 1824 kq->kq_knhash = hash; 1825 kq->kq_knhashmask = hashmask; 1826 } else { 1827 /* Another thread has allocated the hash. */ 1828 mtx_leave(&kq->kq_lock); 1829 hashfree(hash, KN_HASHSIZE, M_KEVENT); 1830 mtx_enter(&kq->kq_lock); 1831 } 1832 } 1833 } 1834 1835 static void 1836 kqueue_expand_list(struct kqueue *kq, int fd) 1837 { 1838 struct knlist *list, *olist; 1839 int size, osize; 1840 1841 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1842 1843 if (kq->kq_knlistsize <= fd) { 1844 size = kq->kq_knlistsize; 1845 mtx_leave(&kq->kq_lock); 1846 while (size <= fd) 1847 size += KQEXTENT; 1848 list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK); 1849 mtx_enter(&kq->kq_lock); 1850 if (kq->kq_knlistsize <= fd) { 1851 memcpy(list, kq->kq_knlist, 1852 kq->kq_knlistsize * sizeof(*list)); 1853 memset(&list[kq->kq_knlistsize], 0, 1854 (size - kq->kq_knlistsize) * sizeof(*list)); 1855 olist = kq->kq_knlist; 1856 osize = kq->kq_knlistsize; 1857 kq->kq_knlist = list; 1858 kq->kq_knlistsize = size; 1859 mtx_leave(&kq->kq_lock); 1860 free(olist, M_KEVENT, osize * sizeof(*list)); 1861 mtx_enter(&kq->kq_lock); 1862 } else { 1863 /* Another thread has expanded the list. */ 1864 mtx_leave(&kq->kq_lock); 1865 free(list, M_KEVENT, size * sizeof(*list)); 1866 mtx_enter(&kq->kq_lock); 1867 } 1868 } 1869 } 1870 1871 /* 1872 * Acquire a knote, return non-zero on success, 0 on failure. 1873 * 1874 * If we cannot acquire the knote we sleep and return 0. The knote 1875 * may be stale on return in this case and the caller must restart 1876 * whatever loop they are in. 1877 * 1878 * If we are about to sleep and klist is non-NULL, the list is unlocked 1879 * before sleep and remains unlocked on return. 1880 */ 1881 int 1882 knote_acquire(struct knote *kn, struct klist *klist, int ls) 1883 { 1884 struct kqueue *kq = kn->kn_kq; 1885 1886 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1887 KASSERT(kn->kn_filter != EVFILT_MARKER); 1888 1889 if (kn->kn_status & KN_PROCESSING) { 1890 kn->kn_status |= KN_WAITING; 1891 if (klist != NULL) { 1892 mtx_leave(&kq->kq_lock); 1893 klist_unlock(klist, ls); 1894 /* XXX Timeout resolves potential loss of wakeup. 
*/ 1895 tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1)); 1896 } else { 1897 msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts", 1898 SEC_TO_NSEC(1)); 1899 } 1900 /* knote may be stale now */ 1901 return (0); 1902 } 1903 kn->kn_status |= KN_PROCESSING; 1904 return (1); 1905 } 1906 1907 /* 1908 * Release an acquired knote, clearing KN_PROCESSING. 1909 */ 1910 void 1911 knote_release(struct knote *kn) 1912 { 1913 MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock); 1914 KASSERT(kn->kn_filter != EVFILT_MARKER); 1915 KASSERT(kn->kn_status & KN_PROCESSING); 1916 1917 if (kn->kn_status & KN_WAITING) { 1918 kn->kn_status &= ~KN_WAITING; 1919 wakeup(kn); 1920 } 1921 kn->kn_status &= ~KN_PROCESSING; 1922 /* kn should not be accessed anymore */ 1923 } 1924 1925 /* 1926 * activate one knote. 1927 */ 1928 void 1929 knote_activate(struct knote *kn) 1930 { 1931 MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock); 1932 1933 kn->kn_status |= KN_ACTIVE; 1934 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) 1935 knote_enqueue(kn); 1936 } 1937 1938 /* 1939 * walk down a list of knotes, activating them if their event has triggered. 1940 */ 1941 void 1942 knote(struct klist *list, long hint) 1943 { 1944 int ls; 1945 1946 ls = klist_lock(list); 1947 knote_locked(list, hint); 1948 klist_unlock(list, ls); 1949 } 1950 1951 void 1952 knote_locked(struct klist *list, long hint) 1953 { 1954 struct knote *kn, *kn0; 1955 struct kqueue *kq; 1956 1957 KLIST_ASSERT_LOCKED(list); 1958 1959 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) { 1960 if (filter_event(kn, hint)) { 1961 kq = kn->kn_kq; 1962 mtx_enter(&kq->kq_lock); 1963 knote_activate(kn); 1964 mtx_leave(&kq->kq_lock); 1965 } 1966 } 1967 } 1968 1969 /* 1970 * remove all knotes from a specified knlist 1971 */ 1972 void 1973 knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, int idx, 1974 int purge) 1975 { 1976 struct knote *kn; 1977 1978 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 1979 1980 /* Always fetch array pointer as another thread can resize kq_knlist. */ 1981 while ((kn = SLIST_FIRST(*plist + idx)) != NULL) { 1982 KASSERT(kn->kn_kq == kq); 1983 1984 if (!purge) { 1985 /* Skip pending badfd knotes. */ 1986 while (kn->kn_fop == &badfd_filtops) { 1987 kn = SLIST_NEXT(kn, kn_link); 1988 if (kn == NULL) 1989 return; 1990 KASSERT(kn->kn_kq == kq); 1991 } 1992 } 1993 1994 if (!knote_acquire(kn, NULL, 0)) { 1995 /* knote_acquire() has released kq_lock. */ 1996 mtx_enter(&kq->kq_lock); 1997 continue; 1998 } 1999 mtx_leave(&kq->kq_lock); 2000 filter_detach(kn); 2001 2002 /* 2003 * Notify poll(2) and select(2) when a monitored 2004 * file descriptor is closed. 2005 * 2006 * This reuses the original knote for delivering the 2007 * notification so as to avoid allocating memory. 
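* The knote's file reference is dropped and its filter is switched to badfd_filtops, which reports EBADF to the pending poll(2) or select(2) scan.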
2008 */ 2009 if (!purge && (kn->kn_flags & (__EV_POLL | __EV_SELECT)) && 2010 !(p->p_kq == kq && 2011 p->p_kq_serial > (unsigned long)kn->kn_udata) && 2012 kn->kn_fop != &badfd_filtops) { 2013 KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD); 2014 FRELE(kn->kn_fp, p); 2015 kn->kn_fp = NULL; 2016 2017 kn->kn_fop = &badfd_filtops; 2018 filter_event(kn, 0); 2019 mtx_enter(&kq->kq_lock); 2020 knote_activate(kn); 2021 knote_release(kn); 2022 continue; 2023 } 2024 2025 knote_drop(kn, p); 2026 mtx_enter(&kq->kq_lock); 2027 } 2028 } 2029 2030 /* 2031 * remove all knotes referencing a specified fd 2032 */ 2033 void 2034 knote_fdclose(struct proc *p, int fd) 2035 { 2036 struct filedesc *fdp = p->p_p->ps_fd; 2037 struct kqueue *kq; 2038 2039 /* 2040 * fdplock can be ignored if the file descriptor table is being freed 2041 * because no other thread can access the fdp. 2042 */ 2043 if (fdp->fd_refcnt != 0) 2044 fdpassertlocked(fdp); 2045 2046 LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) { 2047 mtx_enter(&kq->kq_lock); 2048 if (fd < kq->kq_knlistsize) 2049 knote_remove(p, kq, &kq->kq_knlist, fd, 0); 2050 mtx_leave(&kq->kq_lock); 2051 } 2052 } 2053 2054 /* 2055 * handle a process exiting, including the triggering of NOTE_EXIT notes 2056 * XXX this could be more efficient, doing a single pass down the klist 2057 */ 2058 void 2059 knote_processexit(struct process *pr) 2060 { 2061 /* this needs both the ps_mtx and exclusive kqueue_ps_list_lock. */ 2062 rw_enter_write(&kqueue_ps_list_lock); 2063 mtx_enter(&pr->ps_mtx); 2064 knote_locked(&pr->ps_klist, NOTE_EXIT); 2065 mtx_leave(&pr->ps_mtx); 2066 rw_exit_write(&kqueue_ps_list_lock); 2067 2068 /* remove other knotes hanging off the process */ 2069 klist_invalidate(&pr->ps_klist); 2070 } 2071 2072 void 2073 knote_processfork(struct process *pr, pid_t pid) 2074 { 2075 /* this needs both the ps_mtx and exclusive kqueue_ps_list_lock. */ 2076 rw_enter_write(&kqueue_ps_list_lock); 2077 mtx_enter(&pr->ps_mtx); 2078 knote_locked(&pr->ps_klist, NOTE_FORK | pid); 2079 mtx_leave(&pr->ps_mtx); 2080 rw_exit_write(&kqueue_ps_list_lock); 2081 } 2082 2083 void 2084 knote_attach(struct knote *kn) 2085 { 2086 struct kqueue *kq = kn->kn_kq; 2087 struct knlist *list; 2088 2089 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 2090 KASSERT(kn->kn_status & KN_PROCESSING); 2091 2092 if (kn->kn_fop->f_flags & FILTEROP_ISFD) { 2093 KASSERT(kq->kq_knlistsize > kn->kn_id); 2094 list = &kq->kq_knlist[kn->kn_id]; 2095 } else { 2096 KASSERT(kq->kq_knhashmask != 0); 2097 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2098 } 2099 SLIST_INSERT_HEAD(list, kn, kn_link); 2100 kq->kq_nknotes++; 2101 } 2102 2103 void 2104 knote_detach(struct knote *kn) 2105 { 2106 struct kqueue *kq = kn->kn_kq; 2107 struct knlist *list; 2108 2109 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 2110 KASSERT(kn->kn_status & KN_PROCESSING); 2111 2112 kq->kq_nknotes--; 2113 if (kn->kn_fop->f_flags & FILTEROP_ISFD) 2114 list = &kq->kq_knlist[kn->kn_id]; 2115 else 2116 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2117 SLIST_REMOVE(list, kn, knote, kn_link); 2118 } 2119 2120 /* 2121 * should be called at spl == 0, since we don't want to hold spl 2122 * while calling FRELE and pool_put. 
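* knote_drop() also wakes any thread sleeping in knote_acquire() on this knote before the knote is returned to the pool.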
2123 */ 2124 void 2125 knote_drop(struct knote *kn, struct proc *p) 2126 { 2127 struct kqueue *kq = kn->kn_kq; 2128 2129 KASSERT(kn->kn_filter != EVFILT_MARKER); 2130 2131 mtx_enter(&kq->kq_lock); 2132 knote_detach(kn); 2133 if (kn->kn_status & KN_QUEUED) 2134 knote_dequeue(kn); 2135 if (kn->kn_status & KN_WAITING) { 2136 kn->kn_status &= ~KN_WAITING; 2137 wakeup(kn); 2138 } 2139 mtx_leave(&kq->kq_lock); 2140 2141 if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL) 2142 FRELE(kn->kn_fp, p); 2143 pool_put(&knote_pool, kn); 2144 } 2145 2146 2147 void 2148 knote_enqueue(struct knote *kn) 2149 { 2150 struct kqueue *kq = kn->kn_kq; 2151 2152 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 2153 KASSERT(kn->kn_filter != EVFILT_MARKER); 2154 KASSERT((kn->kn_status & KN_QUEUED) == 0); 2155 2156 kqueue_check(kq); 2157 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 2158 kn->kn_status |= KN_QUEUED; 2159 kq->kq_count++; 2160 kqueue_check(kq); 2161 kqueue_wakeup(kq); 2162 } 2163 2164 void 2165 knote_dequeue(struct knote *kn) 2166 { 2167 struct kqueue *kq = kn->kn_kq; 2168 2169 MUTEX_ASSERT_LOCKED(&kq->kq_lock); 2170 KASSERT(kn->kn_filter != EVFILT_MARKER); 2171 KASSERT(kn->kn_status & KN_QUEUED); 2172 2173 kqueue_check(kq); 2174 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 2175 kn->kn_status &= ~KN_QUEUED; 2176 kq->kq_count--; 2177 kqueue_check(kq); 2178 } 2179 2180 /* 2181 * Assign parameters to the knote. 2182 * 2183 * The knote's object lock must be held. 2184 */ 2185 void 2186 knote_assign(const struct kevent *kev, struct knote *kn) 2187 { 2188 if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) 2189 KERNEL_ASSERT_LOCKED(); 2190 2191 kn->kn_sfflags = kev->fflags; 2192 kn->kn_sdata = kev->data; 2193 kn->kn_udata = kev->udata; 2194 } 2195 2196 /* 2197 * Submit the knote's event for delivery. 2198 * 2199 * The knote's object lock must be held. 2200 */ 2201 void 2202 knote_submit(struct knote *kn, struct kevent *kev) 2203 { 2204 if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) 2205 KERNEL_ASSERT_LOCKED(); 2206 2207 if (kev != NULL) { 2208 *kev = kn->kn_kevent; 2209 if (kn->kn_flags & EV_CLEAR) { 2210 kn->kn_fflags = 0; 2211 kn->kn_data = 0; 2212 } 2213 } 2214 } 2215 2216 void 2217 klist_init(struct klist *klist, const struct klistops *ops, void *arg) 2218 { 2219 SLIST_INIT(&klist->kl_list); 2220 klist->kl_ops = ops; 2221 klist->kl_arg = arg; 2222 } 2223 2224 void 2225 klist_free(struct klist *klist) 2226 { 2227 KASSERT(SLIST_EMPTY(&klist->kl_list)); 2228 } 2229 2230 void 2231 klist_insert(struct klist *klist, struct knote *kn) 2232 { 2233 int ls; 2234 2235 ls = klist_lock(klist); 2236 SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext); 2237 klist_unlock(klist, ls); 2238 } 2239 2240 void 2241 klist_insert_locked(struct klist *klist, struct knote *kn) 2242 { 2243 KLIST_ASSERT_LOCKED(klist); 2244 2245 SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext); 2246 } 2247 2248 void 2249 klist_remove(struct klist *klist, struct knote *kn) 2250 { 2251 int ls; 2252 2253 ls = klist_lock(klist); 2254 SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext); 2255 klist_unlock(klist, ls); 2256 } 2257 2258 void 2259 klist_remove_locked(struct klist *klist, struct knote *kn) 2260 { 2261 KLIST_ASSERT_LOCKED(klist); 2262 2263 SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext); 2264 } 2265 2266 /* 2267 * Detach all knotes from klist. The knotes are rewired to indicate EOF. 2268 * 2269 * The caller of this function must not hold any locks that can block 2270 * filterops callbacks that run with KN_PROCESSING. 
2271 * Otherwise this function might deadlock. 2272 */ 2273 void 2274 klist_invalidate(struct klist *list) 2275 { 2276 struct knote *kn; 2277 struct kqueue *kq; 2278 struct proc *p = curproc; 2279 int ls; 2280 2281 NET_ASSERT_UNLOCKED(); 2282 2283 ls = klist_lock(list); 2284 while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) { 2285 kq = kn->kn_kq; 2286 mtx_enter(&kq->kq_lock); 2287 if (!knote_acquire(kn, list, ls)) { 2288 /* knote_acquire() has released kq_lock 2289 * and klist lock. */ 2290 ls = klist_lock(list); 2291 continue; 2292 } 2293 mtx_leave(&kq->kq_lock); 2294 klist_unlock(list, ls); 2295 filter_detach(kn); 2296 if (kn->kn_fop->f_flags & FILTEROP_ISFD) { 2297 kn->kn_fop = &dead_filtops; 2298 filter_event(kn, 0); 2299 mtx_enter(&kq->kq_lock); 2300 knote_activate(kn); 2301 knote_release(kn); 2302 mtx_leave(&kq->kq_lock); 2303 } else { 2304 knote_drop(kn, p); 2305 } 2306 ls = klist_lock(list); 2307 } 2308 klist_unlock(list, ls); 2309 } 2310 2311 static int 2312 klist_lock(struct klist *list) 2313 { 2314 int ls = 0; 2315 2316 if (list->kl_ops != NULL) { 2317 ls = list->kl_ops->klo_lock(list->kl_arg); 2318 } else { 2319 KERNEL_LOCK(); 2320 ls = splhigh(); 2321 } 2322 return ls; 2323 } 2324 2325 static void 2326 klist_unlock(struct klist *list, int ls) 2327 { 2328 if (list->kl_ops != NULL) { 2329 list->kl_ops->klo_unlock(list->kl_arg, ls); 2330 } else { 2331 splx(ls); 2332 KERNEL_UNLOCK(); 2333 } 2334 } 2335 2336 static void 2337 klist_mutex_assertlk(void *arg) 2338 { 2339 struct mutex *mtx = arg; 2340 2341 (void)mtx; 2342 2343 MUTEX_ASSERT_LOCKED(mtx); 2344 } 2345 2346 static int 2347 klist_mutex_lock(void *arg) 2348 { 2349 struct mutex *mtx = arg; 2350 2351 mtx_enter(mtx); 2352 return 0; 2353 } 2354 2355 static void 2356 klist_mutex_unlock(void *arg, int s) 2357 { 2358 struct mutex *mtx = arg; 2359 2360 mtx_leave(mtx); 2361 } 2362 2363 static const struct klistops mutex_klistops = { 2364 .klo_assertlk = klist_mutex_assertlk, 2365 .klo_lock = klist_mutex_lock, 2366 .klo_unlock = klist_mutex_unlock, 2367 }; 2368 2369 void 2370 klist_init_mutex(struct klist *klist, struct mutex *mtx) 2371 { 2372 klist_init(klist, &mutex_klistops, mtx); 2373 } 2374 2375 static void 2376 klist_rwlock_assertlk(void *arg) 2377 { 2378 struct rwlock *rwl = arg; 2379 2380 (void)rwl; 2381 2382 rw_assert_wrlock(rwl); 2383 } 2384 2385 static int 2386 klist_rwlock_lock(void *arg) 2387 { 2388 struct rwlock *rwl = arg; 2389 2390 rw_enter_write(rwl); 2391 return 0; 2392 } 2393 2394 static void 2395 klist_rwlock_unlock(void *arg, int s) 2396 { 2397 struct rwlock *rwl = arg; 2398 2399 rw_exit_write(rwl); 2400 } 2401 2402 static const struct klistops rwlock_klistops = { 2403 .klo_assertlk = klist_rwlock_assertlk, 2404 .klo_lock = klist_rwlock_lock, 2405 .klo_unlock = klist_rwlock_unlock, 2406 }; 2407 2408 void 2409 klist_init_rwlock(struct klist *klist, struct rwlock *rwl) 2410 { 2411 klist_init(klist, &rwlock_klistops, rwl); 2412 } 2413
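/*
 * Illustrative sketch, not part of this file: how userland typically drives
 * the machinery above through kqueue(2)/kevent(2). "fd" stands for whatever
 * descriptor is being watched; error handling and the timeout argument are
 * omitted for brevity.
 *
 *	struct kevent chg, ev;
 *	int kq = kqueue();
 *	EV_SET(&chg, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	kevent(kq, &chg, 1, NULL, 0, NULL);	register the change
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	wait for one ready event
 */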