/*	$OpenBSD: kern_event.c,v 1.127 2020/02/25 13:21:17 mpi Exp $	*/

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/pledge.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/syscallargs.h>
#include <sys/timeout.h>
#include <sys/wait.h>

void	kqueue_init(void);
void	KQREF(struct kqueue *);
void	KQRELE(struct kqueue *);

int	kqueue_scan(struct kqueue *kq, int maxevents,
	    struct kevent *ulistp, struct timespec *timeout,
	    struct proc *p, int *retval);

int	kqueue_read(struct file *, struct uio *, int);
int	kqueue_write(struct file *, struct uio *, int);
int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
	    struct proc *p);
int	kqueue_poll(struct file *fp, int events, struct proc *p);
int	kqueue_kqfilter(struct file *fp, struct knote *kn);
int	kqueue_stat(struct file *fp, struct stat *st, struct proc *p);
int	kqueue_close(struct file *fp, struct proc *p);
void	kqueue_wakeup(struct kqueue *kq);

static void	kqueue_expand_hash(struct kqueue *kq);
static void	kqueue_expand_list(struct kqueue *kq, int fd);
static void	kqueue_task(void *);

const struct fileops kqueueops = {
	.fo_read	= kqueue_read,
	.fo_write	= kqueue_write,
	.fo_ioctl	= kqueue_ioctl,
	.fo_poll	= kqueue_poll,
	.fo_kqfilter	= kqueue_kqfilter,
	.fo_stat	= kqueue_stat,
	.fo_close	= kqueue_close
};

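/*
 * Internal knote handling routines and the filters implemented in this
 * file.
 */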
void	knote_attach(struct knote *kn);
void	knote_drop(struct knote *kn, struct proc *p);
void	knote_enqueue(struct knote *kn);
void	knote_dequeue(struct knote *kn);
int	knote_acquire(struct knote *kn);
void	knote_release(struct knote *kn);

void	filt_kqdetach(struct knote *kn);
int	filt_kqueue(struct knote *kn, long hint);
int	filt_procattach(struct knote *kn);
void	filt_procdetach(struct knote *kn);
int	filt_proc(struct knote *kn, long hint);
int	filt_fileattach(struct knote *kn);
void	filt_timerexpire(void *knx);
int	filt_timerattach(struct knote *kn);
void	filt_timerdetach(struct knote *kn);
int	filt_timer(struct knote *kn, long hint);
void	filt_seltruedetach(struct knote *kn);

const struct filterops kqread_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_kqdetach,
	.f_event	= filt_kqueue,
};

const struct filterops proc_filtops = {
	.f_flags	= 0,
	.f_attach	= filt_procattach,
	.f_detach	= filt_procdetach,
	.f_event	= filt_proc,
};

const struct filterops file_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= filt_fileattach,
	.f_detach	= NULL,
	.f_event	= NULL,
};

const struct filterops timer_filtops = {
	.f_flags	= 0,
	.f_attach	= filt_timerattach,
	.f_detach	= filt_timerdetach,
	.f_event	= filt_timer,
};

struct pool knote_pool;
struct pool kqueue_pool;
int kq_ntimeouts = 0;
int kq_timeoutmax = (4 * 1024);

#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

/*
 * Table for all system-defined filters.
 */
const struct filterops *const sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
	NULL, /*&aio_filtops,*/		/* EVFILT_AIO */
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
	&file_filtops,			/* EVFILT_DEVICE */
};

void
KQREF(struct kqueue *kq)
{
	atomic_inc_int(&kq->kq_refs);
}

void
KQRELE(struct kqueue *kq)
{
	struct filedesc *fdp;

	if (atomic_dec_int_nv(&kq->kq_refs) > 0)
		return;

	fdp = kq->kq_fdp;
	if (rw_status(&fdp->fd_lock) == RW_WRITE) {
		LIST_REMOVE(kq, kq_next);
	} else {
		fdplock(fdp);
		LIST_REMOVE(kq, kq_next);
		fdpunlock(fdp);
	}

	free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * sizeof(struct klist));
	hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
	pool_put(&kqueue_pool, kq);
}

void
kqueue_init(void)
{
	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR,
	    PR_WAITOK, "kqueuepl", NULL);
	pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR,
	    PR_WAITOK, "knotepl", NULL);
}

int
filt_fileattach(struct knote *kn)
{
	struct file *fp = kn->kn_fp;

	return fp->f_ops->fo_kqfilter(fp, kn);
}

int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	kn->kn_fop = &kqread_filtops;
	SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
	return (0);
}

void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
}

int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

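/*
 * EVFILT_PROC: attach the knote to the process identified by kn_id.
 * Registration is refused with EPERM for pledged processes that lack
 * PLEDGE_PROC.
 */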
int
filt_procattach(struct knote *kn)
{
	struct process *pr;

	if ((curproc->p_p->ps_flags & PS_PLEDGE) &&
	    (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0)
		return pledge_fail(curproc, EPERM, PLEDGE_PROC);

	if (kn->kn_id > PID_MAX)
		return ESRCH;

	pr = prfind(kn->kn_id);
	if (pr == NULL)
		return (ESRCH);

	/* exiting processes can't be specified */
	if (pr->ps_flags & PS_EXITING)
		return (ESRCH);

	kn->kn_ptr.p_process = pr;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	/* XXX lock the proc here while adding to the list? */
	SLIST_INSERT_HEAD(&pr->ps_klist, kn, kn_selnext);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
void
filt_procdetach(struct knote *kn)
{
	struct process *pr = kn->kn_ptr.p_process;

	if (kn->kn_status & KN_DETACHED)
		return;

	/* XXX locking?  this might modify another process. */
	SLIST_REMOVE(&pr->ps_klist, kn, knote, kn_selnext);
}

int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished and remove it
	 * from the process's klist
	 */
	if (event == NOTE_EXIT) {
		struct process *pr = kn->kn_ptr.p_process;
		int s;

		s = splhigh();
		kn->kn_status |= KN_DETACHED;
		splx(s);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
		SLIST_REMOVE(&pr->ps_klist, kn, knote, kn_selnext);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		memset(&kev, 0, sizeof(kev));
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

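/*
 * Arm (or re-arm) the timeout backing an EVFILT_TIMER knote.  kn_sdata
 * holds the period in milliseconds; at least one tick is always
 * scheduled.
 */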
static void
filt_timer_timeout_add(struct knote *kn)
{
	struct timeval tv;
	struct timeout *to = kn->kn_hook;
	int tticks;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz(&tv);
	/* Remove extra tick from tvtohz() if timeout has fired before. */
	if (timeout_triggered(to))
		tticks--;
	timeout_add(to, (tticks > 0) ? tticks : 1);
}

void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;

	kn->kn_data++;
	knote_activate(kn);

	if ((kn->kn_flags & EV_ONESHOT) == 0)
		filt_timer_timeout_add(kn);
}


/*
 * data contains amount of time to sleep, in milliseconds
 */
int
filt_timerattach(struct knote *kn)
{
	struct timeout *to;

	if (kq_ntimeouts > kq_timeoutmax)
		return (ENOMEM);
	kq_ntimeouts++;

	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
	timeout_set(to, filt_timerexpire, kn);
	kn->kn_hook = to;
	filt_timer_timeout_add(kn);

	return (0);
}

void
filt_timerdetach(struct knote *kn)
{
	struct timeout *to;

	to = (struct timeout *)kn->kn_hook;
	timeout_del(to);
	free(to, M_KEVENT, sizeof(*to));
	kq_ntimeouts--;
}

int
filt_timer(struct knote *kn, long hint)
{
	return (kn->kn_data != 0);
}


/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides a full kqfilter entry for device switch tables, which
 * has the same effect as a filter using filt_seltrue() as its filter
 * method.
 */
void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_seltruedetach,
	.f_event	= filt_seltrue,
};

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}

static int
filt_dead(struct knote *kn, long hint)
{
	kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

static void
filt_deaddetach(struct knote *kn)
{
	/* Nothing to do */
}

static const struct filterops dead_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_deaddetach,
	.f_event	= filt_dead,
};

int
sys_kqueue(struct proc *p, void *v, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
	kq->kq_refs = 1;
	kq->kq_fdp = fdp;
	TAILQ_INIT(&kq->kq_head);
	task_set(&kq->kq_task, kqueue_task, kq);

	fdplock(fdp);
	error = falloc(p, &fp, &fd);
	if (error)
		goto out;
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	*retval = fd;
	LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next);
	kq = NULL;
	fdinsert(fdp, fd, 0, fp);
	FRELE(fp, p);
out:
	fdpunlock(fdp);
	if (kq != NULL)
		pool_put(&kqueue_pool, kq);
	return (error);
}

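/*
 * kevent(2): apply each entry of the changelist via kqueue_register(),
 * then scan for and copy out pending events, honouring the optional
 * timeout.
 */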
int
sys_kevent(struct proc *p, void *v, register_t *retval)
{
	struct filedesc* fdp = p->p_fd;
	struct sys_kevent_args /* {
		syscallarg(int)	fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(int)	nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(int)	nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	struct timespec *tsp = NULL;
	int i, n, nerrors, error;
	struct kevent kev[KQ_NEVENTS];

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		error = EBADF;
		goto done;
	}

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		tsp = &ts;
	}

	kq = fp->f_data;
	nerrors = 0;

	while (SCARG(uap, nchanges) > 0) {
		n = SCARG(uap, nchanges) > KQ_NEVENTS ?
		    KQ_NEVENTS : SCARG(uap, nchanges);
		error = copyin(SCARG(uap, changelist), kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, n);
#endif
		for (i = 0; i < n; i++) {
			kevp = &kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, p);
			if (error || (kevp->flags & EV_RECEIPT)) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					copyout(kevp, SCARG(uap, eventlist),
					    sizeof(*kevp));
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;
		SCARG(uap, changelist) += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	KQREF(kq);
	FRELE(fp, p);
	error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
	    tsp, p, &n);
	KQRELE(kq);
	*retval = n;
	return (error);

done:
	FRELE(fp, p);
	return (error);
}

#ifdef KQUEUE_DEBUG
void
kqueue_do_check(struct kqueue *kq, const char *func, int line)
{
	struct knote *kn;
	int count = 0, nmarker = 0;

	KERNEL_ASSERT_LOCKED();
	splassert(IPL_HIGH);

	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if (kn->kn_filter == EVFILT_MARKER) {
			if ((kn->kn_status & KN_QUEUED) != 0)
				panic("%s:%d: kq=%p kn=%p marker QUEUED",
				    func, line, kq, kn);
			nmarker++;
		} else {
			if ((kn->kn_status & KN_ACTIVE) == 0)
				panic("%s:%d: kq=%p kn=%p knote !ACTIVE",
				    func, line, kq, kn);
			if ((kn->kn_status & KN_QUEUED) == 0)
				panic("%s:%d: kq=%p kn=%p knote !QUEUED",
				    func, line, kq, kn);
			if (kn->kn_kq != kq)
				panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq",
				    func, line, kq, kn, kn->kn_kq);
			count++;
			if (count > kq->kq_count)
				goto bad;
		}
	}
	if (count != kq->kq_count) {
bad:
		panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d",
		    func, line, kq, kq->kq_count, count, nmarker);
	}
}
#define kqueue_check(kq)	kqueue_do_check((kq), __func__, __LINE__)
#else
#define kqueue_check(kq)	do {} while (0)
#endif

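/*
 * Register a single change on the kqueue: find the knote matching
 * (ident, filter), or allocate and attach a new one for EV_ADD, then
 * apply EV_DELETE/EV_DISABLE/EV_ENABLE and re-evaluate the filter.
 */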
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p)
{
	struct filedesc *fdp = kq->kq_fdp;
	const struct filterops *fops = NULL;
	struct file *fp = NULL;
	struct knote *kn = NULL, *newkn = NULL;
	struct klist *list = NULL;
	int s, error = 0;

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	}

	if (fops == NULL) {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		return (EINVAL);
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		if (kev->ident > INT_MAX)
			return (EBADF);
	}

	if (kev->flags & EV_ADD)
		newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO);

again:
	if (fops->f_flags & FILTEROP_ISFD) {
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL) {
			error = EBADF;
			goto done;
		}
		if (kev->flags & EV_ADD)
			kqueue_expand_list(kq, kev->ident);
		if (kev->ident < kq->kq_knlistsize)
			list = &kq->kq_knlist[kev->ident];
	} else {
		if (kev->flags & EV_ADD)
			kqueue_expand_hash(kq);
		if (kq->kq_knhashmask != 0) {
			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
		}
	}
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kev->filter == kn->kn_filter &&
			    kev->ident == kn->kn_id) {
				s = splhigh();
				if (!knote_acquire(kn)) {
					splx(s);
					if (fp != NULL) {
						FRELE(fp, p);
						fp = NULL;
					}
					goto again;
				}
				splx(s);
				break;
			}
		}
	}
	KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match.
	 * If adding a new knote, sleeping is not allowed until the knote
	 * has been inserted.
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			kn = newkn;
			newkn = NULL;
			kn->kn_status = KN_PROCESSING;
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			knote_attach(kn);
			if ((error = fops->f_attach(kn)) != 0) {
				knote_drop(kn, p);
				goto done;
			}

			/*
			 * If this is a file descriptor filter, check if
			 * fd was closed while the knote was being added.
			 * knote_fdclose() has missed kn if the function
			 * ran before kn appeared in kq_knlist.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    fd_checkclosed(fdp, kev->ident, kn->kn_fp)) {
				/*
				 * Drop the knote silently without error
				 * because another thread might already have
				 * seen it.  This corresponds to the insert
				 * happening in full before the close.
				 */
				kn->kn_fop->f_detach(kn);
				knote_drop(kn, p);
				goto done;
			}
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		s = splhigh();
		if (kn->kn_fop->f_event(kn, 0))
			knote_activate(kn);
		splx(s);

	} else if (kev->flags & EV_DELETE) {
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		s = splhigh();
		kn->kn_status |= KN_DISABLED;
		splx(s);
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		s = splhigh();
		kn->kn_status &= ~KN_DISABLED;
		if (kn->kn_fop->f_event(kn, 0))
			kn->kn_status |= KN_ACTIVE;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
		splx(s);
	}

	s = splhigh();
	knote_release(kn);
	splx(s);
done:
	if (fp != NULL)
		FRELE(fp, p);
	if (newkn != NULL)
		pool_put(&knote_pool, newkn);
	return (error);
}

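/*
 * Collect up to maxevents pending events and copy them out to ulistp.
 * Two marker knotes (mstart and mend) keep this thread's place in
 * kq_head across sleeps and copyouts, and let it step over markers
 * left by concurrent scans.
 */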
int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
    struct timespec *tsp, struct proc *p, int *retval)
{
	struct kevent *kevp;
	struct timespec elapsed, start, stop;
	struct knote mend, mstart, *kn;
	int s, count, timeout, nkev = 0, error = 0;
	struct kevent kev[KQ_NEVENTS];

	count = maxevents;
	if (count == 0)
		goto done;

	if (tsp != NULL && (tsp->tv_sec < 0 || !timespecisvalid(tsp))) {
		error = EINVAL;
		goto done;
	}

	memset(&mstart, 0, sizeof(mstart));
	memset(&mend, 0, sizeof(mend));

retry:
	if (kq->kq_state & KQ_DYING) {
		error = EBADF;
		goto done;
	}

	kevp = &kev[0];
	s = splhigh();
	if (kq->kq_count == 0) {
		if (tsp != NULL && !timespecisset(tsp)) {
			splx(s);
			error = 0;
			goto done;
		}
		kq->kq_state |= KQ_SLEEP;
		timeout = (tsp == NULL) ? 0 : tstohz(tsp);
		if (tsp != NULL)
			getnanouptime(&start);
		error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
		if (tsp != NULL) {
			getnanouptime(&stop);
			timespecsub(&stop, &start, &elapsed);
			timespecsub(tsp, &elapsed, tsp);
			if (tsp->tv_sec < 0)
				timespecclear(tsp);
		}
		splx(s);
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		goto done;
	}

	mstart.kn_filter = EVFILT_MARKER;
	mstart.kn_status = KN_PROCESSING;
	TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
	mend.kn_filter = EVFILT_MARKER;
	mend.kn_status = KN_PROCESSING;
	TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
	while (count) {
		kn = TAILQ_NEXT(&mstart, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			if (kn == &mend) {
				TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
				TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
				splx(s);
				if (count == maxevents)
					goto retry;
				goto done;
			}

			/* Move start marker past another thread's marker. */
			TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
			TAILQ_INSERT_AFTER(&kq->kq_head, kn, &mstart, kn_tqe);
			continue;
		}

		if (!knote_acquire(kn))
			continue;

		kqueue_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;
		kqueue_check(kq);

		if (kn->kn_status & KN_DISABLED) {
			knote_release(kn);
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
		    kn->kn_fop->f_event(kn, 0) == 0) {
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			knote_release(kn);
			kqueue_check(kq);
			continue;
		}
		*kevp = kn->kn_kevent;
		kevp++;
		nkev++;
		if (kn->kn_flags & EV_ONESHOT) {
			splx(s);
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p);
			s = splhigh();
		} else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
			if (kn->kn_flags & EV_CLEAR) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
			}
			if (kn->kn_flags & EV_DISPATCH)
				kn->kn_status |= KN_DISABLED;
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			knote_release(kn);
		} else {
			if ((kn->kn_status & KN_QUEUED) == 0) {
				kqueue_check(kq);
				kq->kq_count++;
				kn->kn_status |= KN_QUEUED;
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			}
			knote_release(kn);
		}
		kqueue_check(kq);
		count--;
		if (nkev == KQ_NEVENTS) {
			splx(s);
#ifdef KTRACE
			if (KTRPOINT(p, KTR_STRUCT))
				ktrevent(p, kev, nkev);
#endif
			error = copyout(kev, ulistp,
			    sizeof(struct kevent) * nkev);
			ulistp += nkev;
			nkev = 0;
			kevp = &kev[0];
			s = splhigh();
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
	TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
	splx(s);
done:
	if (nkev != 0) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, nkev);
#endif
		error = copyout(kev, ulistp,
		    sizeof(struct kevent) * nkev);
	}
	*retval = maxevents - count;
	return (error);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
int
kqueue_read(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

int
kqueue_write(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	return (ENOTTY);
}

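/*
 * poll(2)/select(2) on the kqueue descriptor itself: report it readable
 * when at least one event is queued, otherwise record the selector for
 * a later wakeup.
 */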
int
kqueue_poll(struct file *fp, int events, struct proc *p)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	int revents = 0;
	int s = splhigh();

	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(p, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	splx(s);
	return (revents);
}

int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue *kq = fp->f_data;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue *kq = fp->f_data;
	int i;

	KERNEL_LOCK();

	for (i = 0; i < kq->kq_knlistsize; i++)
		knote_remove(p, &kq->kq_knlist[i]);
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i < kq->kq_knhashmask + 1; i++)
			knote_remove(p, &kq->kq_knhash[i]);
	}
	fp->f_data = NULL;

	kq->kq_state |= KQ_DYING;
	kqueue_wakeup(kq);

	KASSERT(SLIST_EMPTY(&kq->kq_sel.si_note));
	task_del(systq, &kq->kq_task);

	KQRELE(kq);

	KERNEL_UNLOCK();

	return (0);
}

static void
kqueue_task(void *arg)
{
	struct kqueue *kq = arg;

	KNOTE(&kq->kq_sel.si_note, 0);
	KQRELE(kq);
}

void
kqueue_wakeup(struct kqueue *kq)
{

	if (kq->kq_state & KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if (kq->kq_state & KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeup(&kq->kq_sel);
	} else if (!SLIST_EMPTY(&kq->kq_sel.si_note)) {
		/* Defer activation to avoid recursion. */
		KQREF(kq);
		if (!task_add(systq, &kq->kq_task))
			KQRELE(kq);
	}
}

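/*
 * The knote tables are grown lazily: kqueue_expand_hash() sets up the
 * hash used for non-fd identifiers and kqueue_expand_list() grows the
 * fd-indexed array.  Both may sleep in the allocator, so each rechecks
 * whether another thread already did the work before installing its
 * allocation, and frees it otherwise.
 */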
static void
kqueue_expand_hash(struct kqueue *kq)
{
	struct klist *hash;
	u_long hashmask;

	if (kq->kq_knhashmask == 0) {
		hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
		if (kq->kq_knhashmask == 0) {
			kq->kq_knhash = hash;
			kq->kq_knhashmask = hashmask;
		} else {
			/* Another thread has allocated the hash. */
			hashfree(hash, KN_HASHSIZE, M_KEVENT);
		}
	}
}

static void
kqueue_expand_list(struct kqueue *kq, int fd)
{
	struct klist *list;
	int size;

	if (kq->kq_knlistsize <= fd) {
		size = kq->kq_knlistsize;
		while (size <= fd)
			size += KQEXTENT;
		list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
		if (kq->kq_knlistsize <= fd) {
			memcpy(list, kq->kq_knlist,
			    kq->kq_knlistsize * sizeof(*list));
			memset(&list[kq->kq_knlistsize], 0,
			    (size - kq->kq_knlistsize) * sizeof(*list));
			free(kq->kq_knlist, M_KEVENT,
			    kq->kq_knlistsize * sizeof(*list));
			kq->kq_knlist = list;
			kq->kq_knlistsize = size;
		} else {
			/* Another thread has expanded the list. */
			free(list, M_KEVENT, size * sizeof(*list));
		}
	}
}

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 */
int
knote_acquire(struct knote *kn)
{
	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);

	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_WAITING;
		tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
		/* knote may be stale now */
		return (0);
	}
	kn->kn_status |= KN_PROCESSING;
	return (1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING.
 */
void
knote_release(struct knote *kn)
{
	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_PROCESSING);

	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
}

/*
 * activate one knote.
 */
void
knote_activate(struct knote *kn)
{
	int s;

	s = splhigh();
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
		knote_enqueue(kn);
	splx(s);
}

/*
 * walk down a list of knotes, activating them if their event has triggered.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *kn0;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn0)
		if (kn->kn_fop->f_event(kn, hint))
			knote_activate(kn);
}

/*
 * remove all knotes from a specified klist
 */
void
knote_remove(struct proc *p, struct klist *list)
{
	struct knote *kn;
	int s;

	while ((kn = SLIST_FIRST(list)) != NULL) {
		s = splhigh();
		if (!knote_acquire(kn)) {
			splx(s);
			continue;
		}
		splx(s);
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p);
	}
}

/*
 * remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp = p->p_p->ps_fd;
	struct kqueue *kq;
	struct klist *list;

	/*
	 * fdplock can be ignored if the file descriptor table is being freed
	 * because no other thread can access the fdp.
	 */
	if (fdp->fd_refcnt != 0)
		fdpassertlocked(fdp);

	if (LIST_EMPTY(&fdp->fd_kqlist))
		return;

	KERNEL_LOCK();
	LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
		if (fd >= kq->kq_knlistsize)
			continue;

		list = &kq->kq_knlist[fd];
		knote_remove(p, list);
	}
	KERNEL_UNLOCK();
}

/*
 * handle a process exiting, including the triggering of NOTE_EXIT notes
 * XXX this could be more efficient, doing a single pass down the klist
 */
void
knote_processexit(struct proc *p)
{
	struct process *pr = p->p_p;

	KNOTE(&pr->ps_klist, NOTE_EXIT);

	/* remove other knotes hanging off the process */
	knote_remove(p, &pr->ps_klist);
}

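/*
 * Link a new knote into its kqueue, either on the per-fd list or in the
 * hash bucket for its identifier.  The caller (kqueue_register()) has
 * already expanded the table, which the KASSERTs below check.
 */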
void
knote_attach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct klist *list;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KASSERT(kq->kq_knlistsize > kn->kn_id);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		KASSERT(kq->kq_knhashmask != 0);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
}

/*
 * should be called at spl == 0, since we don't want to hold spl
 * while calling FRELE and pool_put.
 */
void
knote_drop(struct knote *kn, struct proc *p)
{
	struct kqueue *kq = kn->kn_kq;
	struct klist *list;
	int s;

	KASSERT(kn->kn_filter != EVFILT_MARKER);

	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);
	s = splhigh();
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	splx(s);
	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		FRELE(kn->kn_fp, p);
	pool_put(&knote_pool, kn);
}


void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	kqueue_check(kq);
	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_check(kq);
	kqueue_wakeup(kq);
}

void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_QUEUED);

	kqueue_check(kq);
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	kqueue_check(kq);
}

void
klist_invalidate(struct klist *list)
{
	struct knote *kn;
	int s;

	/*
	 * NET_LOCK() must not be held because it can block another thread
	 * in f_event with a knote acquired.
	 */
	NET_ASSERT_UNLOCKED();

	s = splhigh();
	while ((kn = SLIST_FIRST(list)) != NULL) {
		if (!knote_acquire(kn))
			continue;
		splx(s);
		kn->kn_fop->f_detach(kn);
		kn->kn_fop = &dead_filtops;
		knote_activate(kn);
		s = splhigh();
		knote_release(kn);
	}
	splx(s);
}