/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/fcntl.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysmsg.h>
#include <sys/thread.h>
#include <sys/uio.h>
#include <sys/signalvar.h>
#include <sys/filio.h>
#include <sys/ktr.h>
#include <sys/spinlock.h>

#include <sys/thread2.h>
#include <sys/file2.h>
#include <sys/mplock2.h>
#include <sys/spinlock2.h>

#define EVENT_REGISTER	1
#define EVENT_PROCESS	2

static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

struct kevent_copyin_args {
	const struct kevent_args *ka;
	struct kevent *eventlist;
	const struct kevent *changelist;
	int pchanges;
};

#define KNOTE_CACHE_MAX		64

struct knote_cache_list {
	struct klist knote_cache;
	int knote_cache_cnt;
} __cachealign;

static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
		struct knote *marker, int closedcounter, int flags);
static int kqueue_read(struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int kqueue_write(struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		struct ucred *cred, struct sysmsg *msg);
static int kqueue_kqfilter(struct file *fp, struct knote *kn);
static int kqueue_stat(struct file *fp, struct stat *st,
		struct ucred *cred);
static int kqueue_close(struct file *fp);
static void kqueue_wakeup(struct kqueue *kq);
static int filter_attach(struct knote *kn);
static int filter_event(struct knote *kn, long hint);

/*
 * MPSAFE
 */
static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_shutdown = nofo_shutdown,
	.fo_seek = badfo_seek
};

static void knote_attach(struct knote *kn);
static void knote_drop(struct knote *kn);
static void knote_detach_and_drop(struct knote *kn);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);

static void precise_sleep_intr(systimer_t info, int in_ipi,
		struct intrframe *frame);
static int precise_sleep(void *ident, int flags, const char *wmesg,
		int us);

static void filt_kqdetach(struct knote *kn);
static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static int filt_timerattach(struct knote *kn);
static void filt_timerdetach(struct knote *kn);
static int filt_timer(struct knote *kn, long hint);
static int filt_userattach(struct knote *kn);
static void filt_userdetach(struct knote *kn);
static int filt_user(struct knote *kn, long hint);
static void filt_usertouch(struct knote *kn, struct kevent *kev,
		u_long type);
static int filt_fsattach(struct knote *kn);
static void filt_fsdetach(struct knote *kn);
static int filt_fs(struct knote *kn, long hint);

static struct filterops file_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue };
static struct filterops proc_filtops =
	{ FILTEROP_MPSAFE, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
	{ FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer };
static struct filterops user_filtops =
	{ FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user };
static struct filterops fs_filtops =
	{ FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs };

static int kq_ncallouts = 0;
static int kq_calloutmax = 65536;
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
	&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
static int kq_checkloop = 1000000;
SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
	&kq_checkloop, 0, "Maximum number of loops for kqueue scan");
static int kq_sleep_threshold = 20000;
SYSCTL_INT(_kern, OID_AUTO, kq_sleep_threshold, CTLFLAG_RW,
	&kq_sleep_threshold, 0, "Minimum sleep duration without busy-looping");

#define KNOTE_ACTIVATE(kn) do {						\
	kn->kn_status |= KN_ACTIVE;					\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue(kn);					\
} while(0)

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern struct filterops aio_filtops;
extern struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 */
static struct filterops *sysfilt_ops[] = {
	&file_filtops,		/* EVFILT_READ */
	&file_filtops,		/* EVFILT_WRITE */
	&aio_filtops,		/* EVFILT_AIO */
	&file_filtops,		/* EVFILT_VNODE */
	&proc_filtops,		/* EVFILT_PROC */
	&sig_filtops,		/* EVFILT_SIGNAL */
	&timer_filtops,		/* EVFILT_TIMER */
	&file_filtops,		/* EVFILT_EXCEPT */
	&user_filtops,		/* EVFILT_USER */
	&fs_filtops,		/* EVFILT_FS */
};

static struct knote_cache_list knote_cache_lists[MAXCPU];

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 *
 * Related kq token must be held.
 */
static __inline int
knote_acquire(struct knote *kn)
{
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_WAITING | KN_REPROCESS;
		tsleep(kn, 0, "kqepts", hz);
		/* knote may be stale now */
		return(0);
	}
	kn->kn_status |= KN_PROCESSING;
	return(1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING and handling any
 * KN_REPROCESS events.
 *
 * Caller must be holding the related kq token
 *
 * Non-zero is returned if the knote is destroyed or detached.
 */
static __inline int
knote_release(struct knote *kn)
{
	int ret;

	while (kn->kn_status & KN_REPROCESS) {
		kn->kn_status &= ~KN_REPROCESS;
		if (kn->kn_status & KN_WAITING) {
			kn->kn_status &= ~KN_WAITING;
			wakeup(kn);
		}
		if (kn->kn_status & KN_DELETING) {
			knote_detach_and_drop(kn);
			return(1);
			/* NOT REACHED */
		}
		if (filter_event(kn, 0))
			KNOTE_ACTIVATE(kn);
	}
	if (kn->kn_status & KN_DETACHED)
		ret = 1;
	else
		ret = 0;
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
	return ret;
}

static int
filt_fileattach(struct knote *kn)
{
	return (fo_kqfilter(kn->kn_fp, kn));
}

/*
 * MPSAFE
 */
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EOPNOTSUPP);

	kn->kn_fop = &kqread_filtops;
	knote_insert(&kq->kq_kqinfo.ki_note, kn);
	return (0);
}

static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	knote_remove(&kq->kq_kqinfo.ki_note, kn);
}

/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		p = zpfind(kn->kn_id);
		immediate = 1;
	}
	if (p == NULL) {
		return (ESRCH);
	}
	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
		if (p)
			PRELE(p);
		return (EACCES);
	}

	lwkt_gettoken(&p->p_token);
	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	knote_insert(&p->p_klist, kn);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn);
	lwkt_reltoken(&p->p_token);
	PRELE(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;
	p = kn->kn_ptr.p_proc;
	knote_remove(&p->p_klist, kn);
}

static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * Process is gone, so flag the event as finished.  Detach the
	 * knote from the process now because the process will be poof,
	 * gone later on.
	 */
	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_ptr.p_proc;
		if ((kn->kn_status & KN_DETACHED) == 0) {
			PHOLD(p);
			knote_remove(&p->p_klist, kn);
			kn->kn_status |= KN_DETACHED;
			kn->kn_data = p->p_xstat;
			kn->kn_ptr.p_proc = NULL;
			PRELE(p);
		}
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;
		int n;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		n = 1;
		error = kqueue_register(kn->kn_kq, &kev, &n, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

static void
filt_timerreset(struct knote *kn)
{
	struct callout *calloutp;
	struct timeval tv;
	int tticks;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz_high(&tv);
	calloutp = (struct callout *)kn->kn_hook;
	callout_reset(calloutp, tticks, filt_timerexpire, kn);
}
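
/*
 * Illustrative userland sketch (not compiled into the kernel): how the
 * millisecond period kept in kn_sdata and converted by filt_timerreset()
 * above is supplied through the standard <sys/event.h> API.  A minimal
 * sketch only; the identifier value 1 is an arbitrary example.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, ev;
 *
 *	// EVFILT_TIMER: data is the period in milliseconds.  EV_CLEAR
 *	// is set automatically by the filter (see filt_timerattach()).
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// Each harvested event reports in its data field the number of
 *	// expirations accumulated in kn_data since the previous harvest.
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 */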

/*
 * The callout interlocks with callout_stop() but can still
 * race a deletion so if KN_DELETING is set we just don't touch
 * the knote.
 */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct kqueue *kq = kn->kn_kq;

	lwkt_getpooltoken(kq);

	/*
	 * Open-code knote_acquire() here, since we cannot sleep in a
	 * callout; however, we do need to record this expiration.
	 */
	kn->kn_data++;
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_REPROCESS;
		if ((kn->kn_status & KN_DELETING) == 0 &&
		    (kn->kn_flags & EV_ONESHOT) == 0)
			filt_timerreset(kn);
		lwkt_relpooltoken(kq);
		return;
	}
	KASSERT((kn->kn_status & KN_DELETING) == 0,
	    ("acquire a deleting knote %#x", kn->kn_status));
	kn->kn_status |= KN_PROCESSING;

	KNOTE_ACTIVATE(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0)
		filt_timerreset(kn);

	knote_release(kn);

	lwkt_relpooltoken(kq);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;
	int prev_ncallouts;

	prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1);
	if (prev_ncallouts >= kq_calloutmax) {
		atomic_subtract_int(&kq_ncallouts, 1);
		kn->kn_hook = NULL;
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
	callout_init_mp(calloutp);
	kn->kn_hook = (caddr_t)calloutp;

	filt_timerreset(kn);
	return (0);
}

/*
 * This function is called with the knote flagged as locked, but it is
 * still possible to race a callout event due to the callback blocking.
 */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_terminate(calloutp);
	kn->kn_hook = NULL;
	kfree(calloutp, M_KQUEUE);
	atomic_subtract_int(&kq_ncallouts, 1);
}

static int
filt_timer(struct knote *kn, long hint)
{
	return (kn->kn_data != 0);
}

/*
 * EVFILT_USER
 */
static int
filt_userattach(struct knote *kn)
{
	u_int ffctrl;

	kn->kn_hook = NULL;
	if (kn->kn_sfflags & NOTE_TRIGGER)
		kn->kn_ptr.hookid = 1;
	else
		kn->kn_ptr.hookid = 0;

	ffctrl = kn->kn_sfflags & NOTE_FFCTRLMASK;
	kn->kn_sfflags &= NOTE_FFLAGSMASK;
	switch (ffctrl) {
	case NOTE_FFNOP:
		break;

	case NOTE_FFAND:
		kn->kn_fflags &= kn->kn_sfflags;
		break;

	case NOTE_FFOR:
		kn->kn_fflags |= kn->kn_sfflags;
		break;

	case NOTE_FFCOPY:
		kn->kn_fflags = kn->kn_sfflags;
		break;

	default:
		/* XXX Return error? */
		break;
	}
	/* We just happen to copy this value as well. Undocumented. */
	kn->kn_data = kn->kn_sdata;

	return 0;
}

static void
filt_userdetach(struct knote *kn)
{
	/* nothing to do */
}

static int
filt_user(struct knote *kn, long hint)
{
	return (kn->kn_ptr.hookid);
}

static void
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
{
	u_int ffctrl;

	switch (type) {
	case EVENT_REGISTER:
		if (kev->fflags & NOTE_TRIGGER)
			kn->kn_ptr.hookid = 1;

		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;
		switch (ffctrl) {
		case NOTE_FFNOP:
			break;

		case NOTE_FFAND:
			kn->kn_fflags &= kev->fflags;
			break;

		case NOTE_FFOR:
			kn->kn_fflags |= kev->fflags;
			break;

		case NOTE_FFCOPY:
			kn->kn_fflags = kev->fflags;
			break;

		default:
			/* XXX Return error? */
			break;
		}
		/* We just happen to copy this value as well. Undocumented. */
		kn->kn_data = kev->data;

		/*
		 * This is not the correct use of EV_CLEAR in an event
		 * modification; it should have been passed as a NOTE instead.
		 * But we need to maintain compatibility with Apple & FreeBSD.
		 *
		 * Note however that EV_CLEAR can still be used when doing
		 * the initial registration of the event and works as expected
		 * (clears the event on reception).
		 */
		if (kev->flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			/*
			 * Clearing kn->kn_data is fine, since it gets set
			 * every time anyway. We just shouldn't clear
			 * kn->kn_fflags here, since that would limit the
			 * possible uses of this API.  NOTE_FFAND or
			 * NOTE_FFCOPY should be used for explicitly clearing
			 * kn->kn_fflags.
			 */
			kn->kn_data = 0;
		}
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_fflags;
		kev->data = kn->kn_data;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			/* kn_data, kn_fflags handled by parent */
		}
		break;

	default:
		panic("filt_usertouch() - invalid type (%ld)", type);
		break;
	}
}

/*
 * EVFILT_FS
 */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{
	kn->kn_flags |= EV_CLEAR;
	knote_insert(&fs_klist, kn);

	return (0);
}

static void
filt_fsdetach(struct knote *kn)
{
	knote_remove(&fs_klist, kn);
}

static int
filt_fs(struct knote *kn, long hint)
{
	kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}

/*
 * Initialize a kqueue.
 *
 * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
 */
void
kqueue_init(struct kqueue *kq, struct filedesc *fdp)
{
	bzero(kq, sizeof(*kq));
	TAILQ_INIT(&kq->kq_knpend);
	TAILQ_INIT(&kq->kq_knlist);
	kq->kq_fdp = fdp;
	SLIST_INIT(&kq->kq_kqinfo.ki_note);
}

/*
 * Terminate a kqueue.  Freeing the actual kq itself is left up to the
 * caller (it might be embedded in a lwp so we don't do it here).
 *
 * The kq's knlist must be completely eradicated so block on any
 * processing races.
 */
void
kqueue_terminate(struct kqueue *kq)
{
	struct knote *kn;

	lwkt_getpooltoken(kq);
	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
		if (knote_acquire(kn))
			knote_detach_and_drop(kn);
	}
	lwkt_relpooltoken(kq);

	if (kq->kq_knhash) {
		hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask);
		kq->kq_knhash = NULL;
		kq->kq_knhashmask = 0;
	}
}

/*
 * MPSAFE
 */
int
sys_kqueue(struct sysmsg *sysmsg, const struct kqueue_args *uap)
{
	struct thread *td = curthread;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	error = falloc(td->td_lwp, &fp, &fd);
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;

	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
	kqueue_init(kq, td->td_proc->p_fd);
	fp->f_data = kq;

	fsetfd(kq->kq_fdp, fp, fd);
	sysmsg->sysmsg_result = fd;
	fdrop(fp);
	return (0);
}
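
/*
 * Illustrative userland sketch (not compiled into the kernel): basic use
 * of the descriptor created by sys_kqueue() together with EVFILT_USER as
 * implemented by filt_userattach()/filt_usertouch() above.  A minimal
 * sketch only; the ident 1 is an arbitrary example.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, ev;
 *
 *	// Register a user event.  EV_CLEAR makes it auto-reset once it
 *	// has been harvested (see the EVENT_PROCESS case above).
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// Later, possibly from another thread: fire the event.
 *	// NOTE_TRIGGER sets kn_ptr.hookid, which filt_user() reports.
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// A waiter blocked in kevent() now receives the event.
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 */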

/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct kevent_copyin_args *kap;
	int error;

	kap = (struct kevent_copyin_args *)arg;

	error = copyout(kevp, kap->eventlist, count * sizeof(*kevp));
	if (error == 0) {
		kap->eventlist += count;
		*res += count;
	} else {
		*res = -1;
	}

	return (error);
}

/*
 * Copy at most 'max' items from the list pointed to by kap->changelist,
 * return number of items in 'events'.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
{
	struct kevent_copyin_args *kap;
	int error, count;

	kap = (struct kevent_copyin_args *)arg;

	count = min(kap->ka->nchanges - kap->pchanges, max);
	error = copyin(kap->changelist, kevp, count * sizeof *kevp);
	if (error == 0) {
		kap->changelist += count;
		kap->pchanges += count;
		*events = count;
	}

	return (error);
}

/*
 * MPSAFE
 */
int
kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
	    struct timespec *tsp_in, int flags)
{
	struct kevent *kevp;
	struct timespec *tsp, ats;
	int i, n, total, error, nerrors = 0;
	int gobbled;
	int lres;
	int limit = kq_checkloop;
	int closedcounter;
	struct kevent kev[KQ_NEVENTS];
	struct knote marker;
	struct lwkt_token *tok;

	if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec)
		atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);

	tsp = tsp_in;
	*res = 0;

	closedcounter = kq->kq_fdp->fd_closedcounter;

	for (;;) {
		n = 0;
		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
		if (error)
			return error;
		if (n == 0)
			break;
		for (i = 0; i < n; ++i)
			kev[i].flags &= ~EV_SYSFLAGS;
		for (i = 0; i < n; ++i) {
			gobbled = n - i;

			error = kqueue_register(kq, &kev[i], &gobbled, flags);
			i += gobbled - 1;
			kevp = &kev[i];

			/*
			 * If a registration returns an error we
			 * immediately post the error.  The kevent()
			 * call itself will fail with the error if
			 * no space is available for posting.
			 *
			 * Such errors normally bypass the timeout/blocking
			 * code.  However, if the copyoutfn function refuses
			 * to post the error (see sys_poll()), then we
			 * ignore it too.
			 */
			if (error || (kevp->flags & EV_RECEIPT)) {
				kevp->flags = EV_ERROR;
				kevp->data = error;
				lres = *res;
				kevent_copyoutfn(uap, kevp, 1, res);
				if (*res < 0) {
					return error;
				} else if (lres != *res) {
					nevents--;
					nerrors++;
				}
			}
		}
	}
	if (nerrors)
		return 0;

	/*
	 * Acquire/wait for events - setup timeout
	 *
	 * If no timeout specified clean up the run path by clearing the
	 * PRECISE flag.
	 */
	if (tsp != NULL) {
		if (tsp->tv_sec || tsp->tv_nsec) {
			getnanouptime(&ats);
			timespecadd(tsp, &ats, tsp);	/* tsp = target time */
		}
	} else {
		flags &= ~KEVENT_TIMEOUT_PRECISE;
	}

	/*
	 * Loop as required.
	 *
	 * Collect as many events as we can.  Sleeping on successive
	 * loops is disabled if copyoutfn has incremented (*res).
	 *
	 * The loop stops if an error occurs, all events have been
	 * scanned (the marker has been reached), or fewer than the
	 * maximum number of events is found.
	 *
	 * The copyoutfn function does not have to increment (*res) in
	 * order for the loop to continue.
	 *
	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
	 */
	total = 0;
	error = 0;
	marker.kn_filter = EVFILT_MARKER;
	marker.kn_status = KN_PROCESSING;

	tok = lwkt_token_pool_lookup(kq);
	flags = (flags & ~KEVENT_SCAN_MASK) | KEVENT_SCAN_INSERT_MARKER;

	while ((n = nevents - total) > 0) {
		if (n > KQ_NEVENTS)
			n = KQ_NEVENTS;

		/*
		 * Process all received events
		 * Account for all non-spurious events in our total
		 */
		i = kqueue_scan(kq, kev, n, &marker, closedcounter, flags);
		flags = (flags & ~KEVENT_SCAN_MASK) | KEVENT_SCAN_KEEP_MARKER;
		if (i) {
			lres = *res;
			error = kevent_copyoutfn(uap, kev, i, res);
			total += *res - lres;
			if (error)
				break;
		}
		if (limit && --limit == 0)
			panic("kqueue: checkloop failed i=%d", i);

		/*
		 * Normally when fewer events are returned than requested
		 * we can stop.  However, if only spurious events were
		 * collected the copyout will not bump (*res) and we have
		 * to continue.
		 */
		if (i < n && *res)
			break;

		/*
		 * If no events were recorded (no events happened or the events
		 * that did happen were all spurious), block until an event
		 * occurs or the timeout occurs and reload the marker.
		 *
		 * If we saturated n (i == n) loop up without sleeping to
		 * continue processing the list.
		 */
		if (i != n && kq->kq_count == 0 && *res == 0) {
			int timeout;
			int ustimeout;

			if (tsp == NULL) {
				timeout = 0;
				ustimeout = 0;
			} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				struct timespec atx = *tsp;

				getnanouptime(&ats);
				timespecsub(&atx, &ats, &atx);
				if (atx.tv_sec < 0 ||
				    (atx.tv_sec == 0 && atx.tv_nsec <= 0)) {
					error = EWOULDBLOCK;
					break;
				}
				if (flags & KEVENT_TIMEOUT_PRECISE) {
					if (atx.tv_sec == 0 &&
					    atx.tv_nsec < kq_sleep_threshold) {
						ustimeout = kq_sleep_threshold /
							    1000;
					} else if (atx.tv_sec < 60) {
						ustimeout =
							atx.tv_sec * 1000000 +
							atx.tv_nsec / 1000;
					} else {
						ustimeout = 60 * 1000000;
					}
					if (ustimeout == 0)
						ustimeout = 1;
					timeout = 0;
				} else if (atx.tv_sec > 60 * 60) {
					timeout = 60 * 60 * hz;
					ustimeout = 0;
				} else {
					timeout = tstohz_high(&atx);
					ustimeout = 0;
				}
			}

			lwkt_gettoken(tok);
			if (kq->kq_count == 0) {
				kq->kq_sleep_cnt++;
				if (__predict_false(kq->kq_sleep_cnt == 0)) {
					/*
					 * Guard against possible wrapping.  And
					 * set it to 2, so that kqueue_wakeup()
					 * can wake everyone up.
					 */
					kq->kq_sleep_cnt = 2;
				}
				if (flags & KEVENT_TIMEOUT_PRECISE) {
					error = precise_sleep(kq, PCATCH,
							"kqread", ustimeout);
				} else {
					error = tsleep(kq, PCATCH,
							"kqread", timeout);
				}

				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
				if (error) {
					lwkt_reltoken(tok);
					break;
				}
				flags = (flags & ~KEVENT_SCAN_MASK) |
					KEVENT_SCAN_RELOAD_MARKER;
			}
			lwkt_reltoken(tok);
		}

		/*
		 * Deal with an edge case where spurious events can cause
		 * a loop to occur without moving the marker.  This can
		 * prevent kqueue_scan() from picking up new events which
		 * race us.  We must be sure to move the marker for this
		 * case.
		 *
		 * NOTE: We do not want to move the marker if events
		 *	 were scanned because normal kqueue operations
		 *	 may reactivate events.  Moving the marker in
		 *	 that case could result in duplicates for the
		 *	 same event.
		 */
		if (i == 0) {
			flags = (flags & ~KEVENT_SCAN_MASK) |
				KEVENT_SCAN_RELOAD_MARKER;
		}
	}

	/*
	 * Remove the marker
	 */
	if ((flags & KEVENT_SCAN_INSERT_MARKER) == 0) {
		lwkt_gettoken(tok);
		TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
		lwkt_reltoken(tok);
	}

	/* Timeouts do not return EWOULDBLOCK. */
	if (error == EWOULDBLOCK)
		error = 0;
	return error;
}
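
/*
 * Illustrative userland sketch (not compiled into the kernel): how the
 * tsp handling in kern_kevent() above maps onto the timeout argument of
 * kevent(2).  A minimal sketch only, assuming kq was obtained from a
 * prior kqueue() call.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *
 *	struct kevent ev;
 *	struct timespec zero_ts = { 0, 0 };
 *	struct timespec two_sec = { 2, 0 };
 *
 *	// NULL timeout (tsp == NULL above): block until an event arrives.
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *
 *	// Zero timeout: the tv_sec == 0 && tv_nsec == 0 branch above
 *	// produces EWOULDBLOCK internally, which is mapped back to a
 *	// normal return of zero events, i.e. a non-blocking poll.
 *	kevent(kq, NULL, 0, &ev, 1, &zero_ts);
 *
 *	// Non-zero timeout: converted to an absolute uptime deadline and
 *	// serviced by tsleep() or, for short precise waits, precise_sleep().
 *	kevent(kq, NULL, 0, &ev, 1, &two_sec);
 */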

/*
 * MPALMOSTSAFE
 */
int
sys_kevent(struct sysmsg *sysmsg, const struct kevent_args *uap)
{
	struct thread *td = curthread;
	struct timespec ts, *tsp;
	struct kqueue *kq;
	struct file *fp = NULL;
	struct kevent_copyin_args *kap, ka;
	int error;

	if (uap->timeout) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else {
		tsp = NULL;
	}
	fp = holdfp(td, uap->fd, -1);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_KQUEUE) {
		fdrop(fp);
		return (EBADF);
	}

	kq = (struct kqueue *)fp->f_data;

	kap = &ka;
	kap->ka = uap;
	kap->pchanges = 0;
	kap->eventlist = uap->eventlist;
	kap->changelist = uap->changelist;

	error = kern_kevent(kq, uap->nevents, &sysmsg->sysmsg_result, kap,
			    kevent_copyin, kevent_copyout, tsp, 0);

	dropfp(td, uap->fd, fp);

	return (error);
}

/*
 * Efficiently load multiple file pointers.  This significantly reduces
 * threaded overhead.  When doing simple polling we can depend on the
 * per-thread (fd,fp) cache.  With more descriptors, we batch.
 */
static
void
floadkevfps(thread_t td, struct filedesc *fdp, struct kevent *kev,
	    struct file **fp, int climit)
{
	struct filterops *fops;
	int tdcache;

	if (climit <= 2 && td->td_proc && td->td_proc->p_fd == fdp) {
		tdcache = 1;
	} else {
		tdcache = 0;
		spin_lock_shared(&fdp->fd_spin);
	}

	while (climit) {
		*fp = NULL;
		if (kev->filter < 0 &&
		    kev->filter + EVFILT_SYSCOUNT >= 0) {
			fops = sysfilt_ops[~kev->filter];
			if (fops->f_flags & FILTEROP_ISFD) {
				if (tdcache) {
					*fp = holdfp(td, kev->ident, -1);
				} else {
					*fp = holdfp_fdp_locked(fdp,
							kev->ident, -1);
				}
			}
		}
		--climit;
		++fp;
		++kev;
	}
	if (tdcache == 0)
		spin_unlock_shared(&fdp->fd_spin);
}

/*
 * Register up to *countp kev's.  Always registers at least 1.
 *
 * The number registered is returned in *countp.
 *
 * If an error occurs or a kev is flagged EV_RECEIPT, it is
 * processed and included in *countp, and processing then
 * stops.
 *
 * If flags contains KEVENT_UNIQUE_NOTES, kev->data contains an identifier
 * to further distinguish knotes which might otherwise have the same kq,
 * ident, and filter (used by *poll() because multiple pfds are allowed to
 * reference the same descriptor and implied kq filter).  kev->data is
 * implied to be zero for event processing when this flag is set.
 */
int
kqueue_register(struct kqueue *kq, struct kevent *kev, int *countp, int flags)
{
	struct filedesc *fdp = kq->kq_fdp;
	struct klist *list = NULL;
	struct filterops *fops;
	struct file *fp[KQ_NEVENTS];
	struct knote *kn = NULL;
	struct thread *td;
	int error;
	int count;
	int climit;
	int closedcounter;
	int uniqifier = 0;
	struct knote_cache_list *cache_list;

	td = curthread;
	climit = *countp;
	if (climit > KQ_NEVENTS)
		climit = KQ_NEVENTS;
	closedcounter = fdp->fd_closedcounter;
	floadkevfps(td, fdp, kev, fp, climit);

	lwkt_getpooltoken(kq);
	count = 0;
	error = 0;

	/*
	 * To avoid races, only one thread can register events on this
	 * kqueue at a time.
	 */
	while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
		kq->kq_state |= KQ_REGWAIT;
		tsleep(&kq->kq_regtd, 0, "kqreg", 0);
	}
	if (__predict_false(kq->kq_regtd != NULL)) {
		/* Recursive calling of kqueue_register() */
		td = NULL;
	} else {
		/* Owner of the kq_regtd, i.e. td != NULL */
		kq->kq_regtd = td;
	}

loop:
	/*
	 * knote uniqifiers are used by *poll() because there may be
	 * multiple pfd[] entries for the same descriptor and filter.
	 * The unique id is stored in kev->data and kev->data for the
	 * kevent is implied to be zero.
	 */
	if (flags & KEVENT_UNIQUE_NOTES) {
		uniqifier = kev->data;
		kev->data = 0;
	}

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0) {
			error = EINVAL;
			++count;
			goto done;
		}
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	} else {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		error = EINVAL;
		++count;
		goto done;
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		if (fp[count] == NULL) {
			error = EBADF;
			++count;
			goto done;
		}
	}

	cache_list = &knote_cache_lists[mycpuid];
	if (SLIST_EMPTY(&cache_list->knote_cache)) {
		struct knote *new_kn;

		new_kn = knote_alloc();
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
	}

	if (fp[count] != NULL) {
		list = &fp[count]->f_klist;
	} else if (kq->kq_knhashmask) {
		list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
	}
	if (list != NULL) {
		lwkt_getpooltoken(list);
again:
		SLIST_FOREACH(kn, list, kn_link) {
			if (kn->kn_kq == kq &&
			    kn->kn_filter == kev->filter &&
			    kn->kn_id == kev->ident &&
			    kn->kn_uniqifier == uniqifier) {
				if (knote_acquire(kn) == 0)
					goto again;
				break;
			}
		}
		lwkt_relpooltoken(list);
	}

	/*
	 * NOTE: At this point if kn is non-NULL we will have acquired
	 *	 it and set KN_PROCESSING.
	 */
	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		++count;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			crit_enter();
			kn = SLIST_FIRST(&cache_list->knote_cache);
			if (kn == NULL) {
				crit_exit();
				kn = knote_alloc();
			} else {
				SLIST_REMOVE_HEAD(&cache_list->knote_cache,
						  kn_link);
				cache_list->knote_cache_cnt--;
				crit_exit();
			}
			kn->kn_fp = fp[count];
			kn->kn_kq = kq;
			kn->kn_fop = fops;
			kn->kn_uniqifier = uniqifier;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp[count] = NULL;	/* safety */

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			/*
			 * KN_PROCESSING prevents the knote from getting
			 * ripped out from under us while we are trying
			 * to attach it, in case the attach blocks.
			 */
			kn->kn_status = KN_PROCESSING;
			knote_attach(kn);
			if ((error = filter_attach(kn)) != 0) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
				knote_drop(kn);
				++count;
				goto done;
			}

			/*
			 * Interlock against close races which either tried
			 * to remove our knote while we were blocked or missed
			 * it entirely prior to our attachment.  We do not
			 * want to end up with a knote on a closed descriptor.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    checkfdclosed(curthread, fdp, kev->ident, kn->kn_fp,
					  closedcounter)) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			}
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			KKASSERT(kn->kn_status & KN_PROCESSING);
			if (fops == &user_filtops) {
				filt_usertouch(kn, kev, EVENT_REGISTER);
			} else {
				kn->kn_sfflags = kev->fflags;
				kn->kn_sdata = kev->data;
				kn->kn_kevent.udata = kev->udata;
			}
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Delete the existing knote
		 */
		knote_detach_and_drop(kn);
		error = 0;
		++count;
		goto done;
	} else {
		/*
		 * Modify an existing event.
		 *
		 * The user may change some filter values after the
		 * initial EV_ADD, but doing so will not reset any
		 * filters which have already been triggered.
		 */
		KKASSERT(kn->kn_status & KN_PROCESSING);
		if (fops == &user_filtops) {
			filt_usertouch(kn, kev, EVENT_REGISTER);
		} else {
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	}

	/*
	 * Disablement does not deactivate a knote here.
	 */
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	/*
	 * Re-enablement may have to immediately enqueue an active knote.
	 */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0)) {
			knote_enqueue(kn);
		}
	}

	/*
	 * Handle any required reprocessing
	 */
	knote_release(kn);
	/* kn may be invalid now */

	/*
	 * Loop control.  We stop on errors (above), and also stop after
	 * processing EV_RECEIPT, so the caller can process it.
	 */
	++count;
	if (kev->flags & EV_RECEIPT) {
		error = 0;
		goto done;
	}
	++kev;
	if (count < climit) {
		if (fp[count-1])		/* drop unprocessed fp */
			fdrop(fp[count-1]);
		goto loop;
	}

	/*
	 * Cleanup
	 */
done:
	if (td != NULL) {		/* Owner of the kq_regtd */
		kq->kq_regtd = NULL;
		if (__predict_false(kq->kq_state & KQ_REGWAIT)) {
			kq->kq_state &= ~KQ_REGWAIT;
			wakeup(&kq->kq_regtd);
		}
	}
	lwkt_relpooltoken(kq);

	/*
	 * Drop unprocessed file pointers
	 */
	*countp = count;
	if (count && fp[count-1])
		fdrop(fp[count-1]);
	while (count < climit) {
		if (fp[count])
			fdrop(fp[count]);
		++count;
	}
	return (error);
}

/*
 * Scan the kqueue, return the number of active events placed in kevp up
 * to count.
 *
 * Continuous mode events may get recycled, do not continue scanning past
 * marker unless no events have been collected.
 */
static int
kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
	    struct knote *marker, int closedcounter, int flags)
{
	struct knote *kn, local_marker;
	thread_t td = curthread;
	int total;

	total = 0;
	local_marker.kn_filter = EVFILT_MARKER;
	local_marker.kn_status = KN_PROCESSING;

	lwkt_getpooltoken(kq);

	/*
	 * Adjust marker, insert initial marker, or leave the marker alone.
	 *
	 * Also setup our local_marker.
	 */
	switch(flags & KEVENT_SCAN_MASK) {
	case KEVENT_SCAN_RELOAD_MARKER:
		TAILQ_REMOVE(&kq->kq_knpend, marker, kn_tqe);
		/* fall through */
	case KEVENT_SCAN_INSERT_MARKER:
		TAILQ_INSERT_TAIL(&kq->kq_knpend, marker, kn_tqe);
		break;
	}
	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);

	/*
	 * Collect events.
	 */
	while (count) {
		kn = TAILQ_NEXT(&local_marker, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			/* Marker reached, we are done */
			if (kn == marker)
				break;

			/* Move local marker past some other thread's marker */
			kn = TAILQ_NEXT(kn, kn_tqe);
			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
			continue;
		}

		/*
		 * We can't skip a knote undergoing processing, otherwise
		 * we risk not returning it when the user process expects
		 * it should be returned.  Sleep and retry.
		 */
		if (knote_acquire(kn) == 0)
			continue;

		/*
		 * Remove the event for processing.
		 *
		 * WARNING!  We must leave KN_QUEUED set to prevent the
		 *	     event from being KNOTE_ACTIVATE()d while
		 *	     the queue state is in limbo, in case we
		 *	     block.
		 */
		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
		kq->kq_count--;

		/*
		 * Kernel select() and poll() functions cache previous
		 * operations on the assumption that future operations
		 * will use similar descriptor sets.  This removes any
		 * stale entries in a way that does not require a descriptor
		 * lookup and is thus not affected by close() races.
		 *
		 * Do not report to *_copyout()
		 */
		if (flags & KEVENT_AUTO_STALE) {
			if ((uint64_t)kn->kn_kevent.udata <
			    curthread->td_lwp->lwp_kqueue_serial) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS |
						 KN_DISABLED;
			}
		}

		/*
		 * If a descriptor is close()d out from under a poll/select,
		 * we want to report the event but delete the note because
		 * the note can wind up being 'stuck' on kq_knpend.
		 */
		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
		    checkfdclosed(td, kq->kq_fdp, kn->kn_kevent.ident,
				  kn->kn_fp, closedcounter)) {
			kn->kn_status |= KN_DELETING | KN_REPROCESS;
		}

		if (kn->kn_status & KN_DISABLED) {
			/*
			 * If disabled we ensure the event is not queued
			 * but leave its active bit set.  On re-enablement
			 * the event may be immediately triggered.
			 */
			kn->kn_status &= ~KN_QUEUED;
		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
			   (kn->kn_status & KN_DELETING) == 0 &&
			   filter_event(kn, 0) == 0) {
			/*
			 * If not running in one-shot mode and the event
			 * is no longer present we ensure it is removed
			 * from the queue and ignore it.
			 */
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
		} else {
			/*
			 * Post the event
			 */
			if (kn->kn_fop == &user_filtops)
				filt_usertouch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			++kevp;
			++total;
			--count;

			if (kn->kn_flags & EV_ONESHOT) {
				kn->kn_status &= ~KN_QUEUED;
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			} else {
				if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
					if (kn->kn_flags & EV_CLEAR) {
						kn->kn_data = 0;
						kn->kn_fflags = 0;
					}
					if (kn->kn_flags & EV_DISPATCH) {
						kn->kn_status |= KN_DISABLED;
					}
					kn->kn_status &= ~(KN_QUEUED |
							   KN_ACTIVE);
				} else {
					TAILQ_INSERT_TAIL(&kq->kq_knpend,
							  kn,
							  kn_tqe);
					kq->kq_count++;
				}
			}
		}

		/*
		 * Handle any post-processing states
		 */
		knote_release(kn);
	}
	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);

	lwkt_relpooltoken(kq);
	return (total);
}
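
/*
 * Illustrative userland sketch (not compiled into the kernel): the
 * EV_CLEAR/EV_DISPATCH post-processing at the bottom of kqueue_scan()
 * as seen from userland.  A minimal sketch only; kq and fd are assumed
 * to be an existing kqueue and a readable descriptor.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	struct kevent kev, ev;
 *
 *	// EV_DISPATCH: the knote is disabled (KN_DISABLED) as soon as the
 *	// event has been returned, so only one harvester sees it at a time.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *
 *	// The event must then be explicitly re-armed with EV_ENABLE.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */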

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 *
 * MPSAFE
 */
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPSAFE
 */
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPALMOSTSAFE
 */
static int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
	     struct ucred *cred, struct sysmsg *msg)
{
	struct kqueue *kq;
	int error;

	kq = (struct kqueue *)fp->f_data;
	lwkt_getpooltoken(kq);
	switch(com) {
	case FIOASYNC:
		if (*(int *)data)
			kq->kq_state |= KQ_ASYNC;
		else
			kq->kq_state &= ~KQ_ASYNC;
		error = 0;
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &kq->kq_sigio);
		break;
	default:
		error = ENOTTY;
		break;
	}
	lwkt_relpooltoken(kq);
	return (error);
}

/*
 * MPSAFE
 */
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	bzero((void *)st, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * MPSAFE
 */
static int
kqueue_close(struct file *fp)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	kqueue_terminate(kq);

	fp->f_data = NULL;
	funsetown(&kq->kq_sigio);

	kfree(kq, M_KQUEUE);
	return (0);
}

static void
kqueue_wakeup(struct kqueue *kq)
{
	if (kq->kq_sleep_cnt) {
		u_int sleep_cnt = kq->kq_sleep_cnt;

		kq->kq_sleep_cnt = 0;
		if (sleep_cnt == 1)
			wakeup_one(kq);
		else
			wakeup(kq);
	}
	KNOTE(&kq->kq_kqinfo.ki_note, 0);
}

/*
 * Calls filterops f_attach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static int
filter_attach(struct knote *kn)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_attach(kn);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_attach(kn);
		rel_mplock();
	}
	return (ret);
}

/*
 * Detach the knote and drop it, destroying the knote.
 *
 * Calls filterops f_detach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static void
knote_detach_and_drop(struct knote *kn)
{
	kn->kn_status |= KN_DELETING | KN_REPROCESS;
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		get_mplock();
		kn->kn_fop->f_detach(kn);
		rel_mplock();
	}
	knote_drop(kn);
}

/*
 * Calls filterops f_event function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * If the knote is in the middle of being created or deleted we cannot
 * safely call the filter op.
 *
 * Caller must be holding the related kq token
 */
static int
filter_event(struct knote *kn, long hint)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_event(kn, hint);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_event(kn, hint);
		rel_mplock();
	}
	return (ret);
}

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * If we encounter any knotes which are undergoing processing we just mark
 * them for reprocessing and do not try to [re]activate the knote.  However,
 * if a hint is being passed we have to wait and that makes things a bit
 * sticky.
 */
void
knote(struct klist *list, long hint)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(list);
restart:
	SLIST_FOREACH(kn, list, kn_next) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);

		/* temporary verification hack */
		SLIST_FOREACH(kntmp, list, kn_next) {
			if (kn == kntmp)
				break;
		}
		if (kn != kntmp || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			goto restart;
		}

		if (kn->kn_status & KN_PROCESSING) {
			/*
			 * Someone else is processing the knote, ask the
			 * other thread to reprocess it and don't mess
			 * with it otherwise.
			 */
			if (hint == 0) {
				kn->kn_status |= KN_REPROCESS;
				lwkt_relpooltoken(kq);
				continue;
			}

			/*
			 * If the hint is non-zero we have to wait or risk
			 * losing the state the caller is trying to update.
			 *
			 * XXX This is a real problem, certain process
			 *     and signal filters will bump kn_data for
			 *     already-processed notes more than once if
			 *     we restart the list scan.  FIXME.
			 */
			kn->kn_status |= KN_WAITING | KN_REPROCESS;
			tsleep(kn, 0, "knotec", hz);
			lwkt_relpooltoken(kq);
			goto restart;
		}

		/*
		 * Become the reprocessing master ourselves.
		 *
		 * If hint is non-zero running the event is mandatory
		 * when not deleting so do it whether reprocessing is
		 * set or not.
		 */
		kn->kn_status |= KN_PROCESSING;
		if ((kn->kn_status & KN_DELETING) == 0) {
			if (filter_event(kn, hint))
				KNOTE_ACTIVATE(kn);
		}
		if (knote_release(kn)) {
			lwkt_relpooltoken(kq);
			goto restart;
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(list);
}

/*
 * Insert knote at head of klist.
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_insert(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_INSERT_HEAD(klist, kn, kn_next);
	lwkt_relpooltoken(klist);
}
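
/*
 * Illustrative in-kernel sketch (not compiled): how an event source ties
 * into knote_insert()/knote_remove() and KNOTE().  The names mydev_softc
 * and mydev_read_filtops are hypothetical; the pattern mirrors
 * kqueue_kqfilter()/filt_kqdetach()/filt_kqueue() earlier in this file.
 *
 *	struct mydev_softc {
 *		struct kqinfo	sc_kqinfo;	// holds the klist (ki_note)
 *	};
 *
 *	static int
 *	mydev_kqfilter(struct file *fp, struct knote *kn)
 *	{
 *		struct mydev_softc *sc = fp->f_data;
 *
 *		kn->kn_fop = &mydev_read_filtops;
 *		kn->kn_hook = (caddr_t)sc;
 *		knote_insert(&sc->sc_kqinfo.ki_note, kn);
 *		return (0);
 *	}
 *
 *	static void
 *	mydev_filt_detach(struct knote *kn)
 *	{
 *		struct mydev_softc *sc = (struct mydev_softc *)kn->kn_hook;
 *
 *		knote_remove(&sc->sc_kqinfo.ki_note, kn);
 *	}
 *
 *	// When data becomes available (e.g. from the interrupt path),
 *	// activate any registered knotes:
 *	//	KNOTE(&sc->sc_kqinfo.ki_note, 0);
 */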

/*
 * Remove knote from a klist
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_remove(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_REMOVE(klist, kn, knote, kn_next);
	lwkt_relpooltoken(klist);
}

void
knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
		    struct filterops *ops, void *hook)
{
	struct kqueue *kq;
	struct knote *kn;

	lwkt_getpooltoken(&src->ki_note);
	lwkt_getpooltoken(&dst->ki_note);
	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);
		if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			continue;
		}
		if (knote_acquire(kn)) {
			knote_remove(&src->ki_note, kn);
			kn->kn_fop = ops;
			kn->kn_hook = hook;
			knote_insert(&dst->ki_note, kn);
			knote_release(kn);
			/* kn may be invalid now */
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(&dst->ki_note);
	lwkt_relpooltoken(&src->ki_note);
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(&fp->f_klist);
restart:
	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
			kq = kn->kn_kq;
			lwkt_getpooltoken(kq);

			/* temporary verification hack */
			SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
				if (kn == kntmp)
					break;
			}
			if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
			    kn->kn_id != fd || kn->kn_kq != kq) {
				lwkt_relpooltoken(kq);
				goto restart;
			}
			if (knote_acquire(kn))
				knote_detach_and_drop(kn);
			lwkt_relpooltoken(kq);
			goto restart;
		}
	}
	lwkt_relpooltoken(&fp->f_klist);
}

/*
 * Low level attach function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_attach(struct knote *kn)
{
	struct klist *list;
	struct kqueue *kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KKASSERT(kn->kn_fp);
		list = &kn->kn_fp->f_klist;
	} else {
		if (kq->kq_knhashmask == 0)
			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
						 &kq->kq_knhashmask);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	lwkt_getpooltoken(list);
	SLIST_INSERT_HEAD(list, kn, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
}

/*
 * Low level drop function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_drop(struct knote *kn)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kn->kn_fp->f_klist;
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	lwkt_getpooltoken(list);
	SLIST_REMOVE(list, kn, knote, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		fdrop(kn->kn_fp);
		kn->kn_fp = NULL;
	}
	knote_free(kn);
}

/*
 * Low level enqueue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	++kq->kq_count;

	/*
	 * Send SIGIO on request (typically set up as a mailbox signal)
	 */
	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
		pgsigio(kq->kq_sigio, SIGIO, 0);

	kqueue_wakeup(kq);
}

/*
 * Low level dequeue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static struct knote *
knote_alloc(void)
{
	return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
}

static void
knote_free(struct knote *kn)
{
	struct knote_cache_list *cache_list;

	cache_list = &knote_cache_lists[mycpuid];
	if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) {
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
		return;
	}
	kfree(kn, M_KQUEUE);
}

struct sleepinfo {
	void *ident;
	int timedout;
};

static void
precise_sleep_intr(systimer_t info, int in_ipi, struct intrframe *frame)
{
	struct sleepinfo *si;

	si = info->data;
	si->timedout = 1;
	wakeup(si->ident);
}

static int
precise_sleep(void *ident, int flags, const char *wmesg, int us)
{
	struct systimer info;
	struct sleepinfo si = {
		.ident = ident,
		.timedout = 0,
	};
	int r;

	tsleep_interlock(ident, flags);
	systimer_init_oneshot(&info, precise_sleep_intr, &si, us);
	r = tsleep(ident, flags | PINTERLOCKED, wmesg, 0);
	systimer_del(&info);
	if (si.timedout)
		r = EWOULDBLOCK;

	return r;
}