/*	$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls of synchronous I/O multiplexing subsystem.
 *
 * Locking
 *
 *	Two locks are used: <object-lock> and selcluster_t::sc_lock.
 *
 *	The <object-lock> might be a device driver or another subsystem, e.g.
 *	socket or pipe.  This lock is not exported, and thus invisible to
 *	this subsystem.  Mainly, synchronisation between selrecord() and
 *	selnotify() routines depends on this lock, as described in the
 *	comments below.
 *
 * Lock order
 *
 *	<object-lock> ->
 *		selcluster_t::sc_lock
 */
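/*
 * Illustrative sketch (hypothetical, not taken from any in-tree driver)
 * of how a driver or subsystem is expected to use selrecord() and
 * selnotify() under its own <object-lock>.  The names foo_softc,
 * sc_lock, sc_rsel and sc_have_data below are hypothetical:
 *
 *	int
 *	foo_poll_locked(struct foo_softc *sc, int events, struct lwp *l)
 *	{
 *		int revents = 0;
 *
 *		KASSERT(mutex_owned(&sc->sc_lock));	// the <object-lock>
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_have_data)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(l, &sc->sc_rsel);
 *		}
 *		return revents;
 *	}
 *
 * and, when data later becomes available, with the same lock held:
 *
 *		mutex_enter(&sc->sc_lock);
 *		sc->sc_have_data = true;
 *		selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 *		mutex_exit(&sc->sc_lock);
 *
 * Holding the same lock across selrecord() in the poll path and
 * selnotify() in the wakeup path is the synchronisation that the
 * comment above relies on.
 */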
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $");

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* blocking and waiting for event */
#define	SEL_EVENT	3	/* interrupted, events set directly */
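/*
 * Rough summary of how l_selflag moves between these states during one
 * pass of sel_do_scan() (descriptive, derived from the code below, not
 * a normative specification):
 *
 *	SEL_RESET or SEL_SCANNING is set by the polling LWP before it
 *	calls fo_poll on the descriptors; SEL_RESET is used when no
 *	selrecord()/selclear() is wanted (zero timeout with
 *	direct_select disabled).
 *
 *	SEL_BLOCKING is set, under selcluster_t::sc_lock, just before
 *	the LWP enqueues itself on the cluster's sleep queue.
 *
 *	SEL_RESET is set again by selnotify() to request a re-scan, or
 *	by selscan()/pollscan() once events have been found without
 *	direct delivery.
 *
 *	SEL_EVENT is set by selnotify()/sel_setevents() when direct
 *	delivery (direct_select) is enabled and the events have been
 *	written straight into the LWP's result area; the scan loop then
 *	terminates.
 */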
/*
 * Per-cluster state for select()/poll().  For a system with fewer
 * than 64 CPUs, this gives us per-CPU clusters.
 */
#define	SELCLUSTERS	64
#define	SELCLUSTERMASK	(SELCLUSTERS - 1)

typedef struct selcluster {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	uint64_t	sc_mask;
	int		sc_ncoll;
} selcluster_t;

static inline int	selscan(char *, const int, const size_t, register_t *);
static inline int	pollscan(struct pollfd *, const int, register_t *);
static void		selclear(void);

static const int sel_flag[] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND
};

/*
 * LWPs are woken using the sleep queue only due to a collision, the case
 * with the maximum Suck Factor.  Save the cost of sorting for named waiters
 * by inserting in LIFO order.  In the future it would be preferable to not
 * enqueue LWPs at all, unless subject to a collision.
 */
syncobj_t select_sobj = {
	.sobj_name	= "select",
	.sobj_flag	= SOBJ_SLEEPQ_LIFO,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
static int		direct_select __read_mostly = 0;

/* Operations: either select() or poll(). */
const char selop_select[] = "select";
const char selop_poll[] = "poll";

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;

		if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
			return EINVAL;

		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
	lwp_t * const l = curlwp;
	selcluster_t *sc;
	kmutex_t *lock;
	struct timespec sleepts;
	int error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask))
		sigsuspendsetup(l, mask);

	/*
	 * We may context switch during or at any time after picking a CPU
	 * and cluster to associate with, but it doesn't matter.  In the
	 * unlikely event we migrate elsewhere all we risk is a little lock
	 * contention; correctness is not sacrificed.
	 */
	sc = curcpu()->ci_data.cpu_selcluster;
	lock = sc->sc_lock;
	l->l_selcluster = sc;

	if (opname == selop_select) {
		l->l_selbits = fds;
		l->l_selni = ni;
	} else {
		l->l_selbits = NULL;
	}

	for (;;) {
		int ncoll;

		SLIST_INIT(&l->l_selwait);
		l->l_selret = 0;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up to date value for new polling activity.
		 */
		if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
			/* Non-blocking: no need for selrecord()/selclear() */
			l->l_selflag = SEL_RESET;
		} else {
			l->l_selflag = SEL_SCANNING;
		}
		ncoll = sc->sc_ncoll;
		membar_release();

		if (opname == selop_select) {
			error = selscan((char *)fds, nf, ni, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nf, retval);
		}
		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		/*
		 * Acquire the lock and perform the (re)checks.  Note: if a
		 * collision has occurred, then our state does not matter,
		 * as we must perform a re-scan.  Therefore, check it first.
		 */
 state_check:
		mutex_spin_enter(lock);
		if (__predict_false(sc->sc_ncoll != ncoll)) {
			/* Collision: perform re-scan. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		if (__predict_true(l->l_selflag == SEL_EVENT)) {
			/* Events occurred, they are set directly. */
			mutex_spin_exit(lock);
			break;
		}
		if (__predict_true(l->l_selflag == SEL_RESET)) {
			/* Events occurred, but re-scan is requested. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		/* Nothing happened, therefore: sleep. */
		l->l_selflag = SEL_BLOCKING;
		KASSERT(l->l_blcnt == 0);
		(void)sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
		error = sleepq_block(timo, true, &select_sobj, 0);
		if (error != 0) {
			break;
		}
		/* Awoken: need to check the state. */
		goto state_check;
	}
	selclear();

	/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}

	if (__predict_false(mask))
		sigsuspendteardown(l);

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}
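/*
 * For reference, the layout of the bit vector shared between selcommon()
 * and selscan() (a summary of the code below): the buffer holds six
 * fd_set images of 'ni' bytes each, the first three copied in from
 * userland and the last three filled in with the results:
 *
 *	bits + ni*0	read set	(copied in from u_in)
 *	bits + ni*1	write set	(copied in from u_ou)
 *	bits + ni*2	except set	(copied in from u_ex)
 *	bits + ni*3	read results	(copied out to u_in)
 *	bits + ni*4	write results	(copied out to u_ou)
 *	bits + ni*5	except results	(copied out to u_ex)
 */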
int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	char *bits;
	int error, nf;
	size_t ni;

	if (nd < 0)
		return (EINVAL);
	nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
	if (nd > nf) {
		/* forgiving; slightly wrong */
		nd = nf;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto fail;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
 fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
	fd_mask *ibitp, *obitp;
	int msk, i, j, fd, n;
	file_t *fp;
	lwp_t *l;

	ibitp = (fd_mask *)(bits + ni * 0);
	obitp = (fd_mask *)(bits + ni * 3);
	n = 0;
	l = curlwp;

	memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			fd_mask ibits, obits;

			ibits = *ibitp;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1U << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				/*
				 * Set up an argument to selrecord(), which is
				 * a file descriptor number.
				 */
				l->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					if (!direct_select) {
						/*
						 * Have events: do nothing in
						 * selrecord().
						 */
						l->l_selflag = SEL_RESET;
					}
					obits |= (1U << j);
					n++;
				}
				fd_putfile(fd);
			}
			if (obits != 0) {
				if (direct_select) {
					kmutex_t *lock;
					lock = l->l_selcluster->sc_lock;
					mutex_spin_enter(lock);
					*obitp |= obits;
					mutex_spin_exit(lock);
				} else {
					*obitp |= obits;
				}
			}
			ibitp++;
			obitp++;
		}
	}
	*retval = n;
	return (0);
}
/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timespec ats, *ts = NULL;

	/* Convert the timeout from milliseconds; INFTIM means wait forever. */
	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}

/*
 * Poll system call with a timespec timeout and signal mask (pollts).
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd smallfds[32];
	struct pollfd *fds;
	int error;
	size_t ni;

	if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
		/*
		 * Prevent userland from causing over-allocation.
		 * Raising the default limit too high can still cause
		 * a lot of memory to be allocated, but then the file
		 * descriptor array will also be large.
		 *
		 * To reduce the memory requirements here, we could
		 * process the 'fds' array in chunks, but that
		 * is a lot of code that isn't normally useful.
		 * (Or just move the copyin/out into pollscan().)
		 *
		 * Historically the code silently truncated 'fds' to
		 * dt_nfiles entries - but that does cause issues.
		 *
		 * Using the max limit equivalent to sysctl
		 * kern.maxfiles is the moral equivalent of OPEN_MAX
		 * as specified by POSIX.
		 *
		 * We add a slop of 1000 in case the resource limit was
		 * changed after opening descriptors or the same descriptor
		 * was specified more than once.
		 */
		return EINVAL;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds))
		fds = kmem_alloc(ni, KM_SLEEP);
	else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto fail;

	error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
 fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
	file_t *fp;
	int i, n = 0, revents;

	for (i = 0; i < nfd; i++, fds++) {
		fds->revents = 0;
		if (fds->fd < 0) {
			revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			revents = POLLNVAL;
		} else {
			/*
			 * Perform poll: registers select request or returns
			 * the events which are set.  Set up an argument for
			 * selrecord(), which is a pointer to struct pollfd.
			 */
			curlwp->l_selrec = (uintptr_t)fds;
			revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			fd_putfile(fds->fd);
		}
		if (revents) {
			if (!direct_select) {
				/* Have events: do nothing in selrecord(). */
				curlwp->l_selflag = SEL_RESET;
			}
			fds->revents = revents;
			n++;
		}
	}
	*retval = n;
	return (0);
}

/*
 * seltrue: a poll routine for drivers whose devices are always ready for
 * normal I/O; it never blocks and never calls selrecord().
 */
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
 * Record a select request.  Concurrency issues:
 *
 *	The caller holds the same lock across calls to selrecord() and
 *	selnotify(), so we don't need to consider a concurrent wakeup
 *	while in this routine.
 *
 *	The only activity we need to guard against is selclear(), called by
 *	another thread that is exiting sel_do_scan().
 *	`sel_lwp' can only become non-NULL while the caller's lock is held,
 *	so it cannot become non-NULL due to a change made by another thread
 *	while we are in this routine.  It can only become _NULL_ due to a
 *	call to selclear().
 *
 *	If it is non-NULL and != selector there is the potential for
 *	selclear() to be called by another thread.  If either of those
 *	conditions is true, we're not interested in touching the `named
 *	waiter' part of the selinfo record because we need to record a
 *	collision.  Hence there is no need for additional locking in this
 *	routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	other = sip->sel_lwp;

	if (selector->l_selflag == SEL_RESET) {
		/* 0. We're not going to block - will poll again if needed. */
	} else if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 */
		membar_acquire();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/* 3. Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}

/*
 * Record a knote.
 *
 * The caller holds the same lock as for selrecord().
 */
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
	klist_insert(&sip->sel_klist, kn);
}

/*
 * Remove a knote.
 *
 * The caller holds the same lock as for selrecord().
 *
 * Returns true if the last knote was removed and the list
 * is now empty.
 */
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
	return klist_remove(&sip->sel_klist, kn);
}
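/*
 * Note on the fd_mask arithmetic used by sel_setevents() below: a
 * descriptor maps to word fd >> __NFDSHIFT and bit fd & __NFDMASK
 * within that word.  For example, with the usual 32-bit fd_mask
 * (NFDBITS == 32), fd 70 lands in word 2, bit 6.  The three fd_set
 * images (read/write/except) are 'ni' bytes apart, as laid out in
 * selcommon().
 */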
/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for an LWP sleeping in selcommon() or pollcommon().
 */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;
	int ret = 0;

	/*
	 * If we require re-scan or it was required by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return true;
	}
	/*
	 * Direct set.  Note: select state of LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
	if (l->l_selbits != NULL) {
		const size_t ni = l->l_selni;
		fd_mask *fds = (fd_mask *)l->l_selbits;
		fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
		const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
		const int idx = fd >> __NFDSHIFT;
		int n;

		for (n = 0; n < 3; n++) {
			if ((fds[idx] & fbit) != 0 &&
			    (ofds[idx] & fbit) == 0 &&
			    (sel_flag[n] & events)) {
				ofds[idx] |= fbit;
				ret++;
			}
			fds = (fd_mask *)((char *)fds + ni);
			ofds = (fd_mask *)((char *)ofds + ni);
		}
	} else {
		struct pollfd *pfd = (void *)sip->sel_fdinfo;
		int revents = events & (pfd->events | POLLERR | POLLHUP);

		if (revents) {
			if (pfd->revents == 0)
				ret = 1;
			pfd->revents |= revents;
		}
	}
	/* Check whether there are any events to return. */
	if (!ret) {
		return false;
	}
	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret += ret;
	return true;
}

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 *	As per selrecord(), the caller's object lock is held.  If there
 *	is a named waiter, we must acquire the associated selcluster's lock
 *	in order to synchronize with selclear() and pollers going to sleep
 *	in sel_do_scan().
 *
 *	sip->sel_cluster cannot change at this point, as it is only changed
 *	in selrecord(), and concurrent calls to selrecord() are locked
 *	out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcluster_t *sc;
	uint64_t mask;
	int index, oflag;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cluster;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			/*
			 * Either set the events for the LWP directly, or
			 * request a full re-scan.
			 */
			l = sip->sel_lwp;
			oflag = l->l_selflag;

			if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
				/* No events to return. */
				mutex_spin_exit(lock);
				return;
			}

			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_remove(l->l_sleepq, l, true);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs64(mask) - 1;
			mask ^= __BIT(index);
			sc = selcluster[index];
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}
/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 *	The object owner's (e.g. device driver) lock is not held here.  Calls
 *	can be made to selrecord() and we do not synchronize against those
 *	directly using locks.  However, we use `sel_lwp' to lock out changes.
 *	Before clearing it we must use memory barriers to ensure that we can
 *	safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	/*
	 * If the request was non-blocking, or we found events on the first
	 * descriptor, there will be no need to clear anything - avoid
	 * taking the lock.
	 */
	if (SLIST_EMPTY(&l->l_selwait)) {
		return;
	}

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		/* Release the record for another named waiter to use. */
		atomic_store_release(&sip->sel_lwp, NULL);
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcluster_t *sc;
	u_int index;

	/* If there is already a cluster in place for this bit, re-use it. */
	index = cpu_index(ci) & SELCLUSTERMASK;
	sc = selcluster[index];
	if (sc == NULL) {
		sc = kmem_alloc(roundup2(sizeof(selcluster_t),
		    coherency_unit) + coherency_unit, KM_SLEEP);
		sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
		sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		sleepq_init(&sc->sc_sleepq);
		sc->sc_ncoll = 0;
		sc->sc_mask = __BIT(index);
		selcluster[index] = sc;
	}
	ci->ci_data.cpu_selcluster = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
	klist_init(&sip->sel_klist);
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * will not call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcluster_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	klist_fini(&sip->sel_klist);

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcluster pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cluster != NULL);
	sc = sip->sel_cluster;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
/*
 * System control nodes.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "direct_select",
	    SYSCTL_DESCR("Enable/disable direct select (for testing)"),
	    NULL, 0, &direct_select, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}
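/*
 * For reference: the node created above appears as kern.direct_select
 * and can be toggled at run time from userland, e.g.:
 *
 *	sysctl -w kern.direct_select=1
 *
 * It defaults to 0, in which case events are delivered by forcing a
 * re-scan rather than being written directly into the sleeping LWP's
 * result area.
 */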