/*	$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls of synchronous I/O multiplexing subsystem.
 *
 * Locking
 *
 *	Two locks are used: <object-lock> and selcluster_t::sc_lock.
 *
 *	The <object-lock> might be a device driver or another subsystem, e.g.
 *	socket or pipe.  This lock is not exported, and thus invisible to
 *	this subsystem.  Mainly, synchronisation between selrecord() and
 *	selnotify() routines depends on this lock, as described in the
 *	comments.
 *
 * Lock order
 *
 *	<object-lock> ->
 *		selcluster_t::sc_lock
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $");

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* blocking and waiting for event */
#define	SEL_EVENT	3	/* interrupted, events set directly */

/*
 * Per-cluster state for select()/poll().  For a system with 64 or fewer
 * CPUs, this gives us per-CPU clusters.
 */
#define	SELCLUSTERS	64
#define	SELCLUSTERMASK	(SELCLUSTERS - 1)

typedef struct selcluster {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	uint64_t	sc_mask;
	int		sc_ncoll;
} selcluster_t;

static inline int	selscan(char *, const int, const size_t, register_t *);
static inline int	pollscan(struct pollfd *, const int, register_t *);
static void		selclear(void);

static const int sel_flag[] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND
};

/*
 * LWPs are woken using the sleep queue only due to a collision, the case
 * with the maximum Suck Factor.  Save the cost of sorting for named waiters
 * by inserting in LIFO order.  In the future it would be preferable to not
 * enqueue LWPs at all, unless subject to a collision.
 */
syncobj_t select_sobj = {
	.sobj_name	= "select",
	.sobj_flag	= SOBJ_SLEEPQ_LIFO,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
static int		direct_select __read_mostly = 0;
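
/*
 * Illustrative sketch (not compiled): how an object owner such as a
 * device driver is expected to use this subsystem.  Only selinit(),
 * selrecord(), selnotify() and seldestroy() are the real interfaces
 * defined below; the softc layout, the names and the readiness flag are
 * hypothetical.  The driver's own lock stands in for the <object-lock>
 * described above and must be held around both selrecord() and
 * selnotify().
 *
 *	struct example_softc {
 *		kmutex_t	sc_lock;	// the <object-lock>
 *		struct selinfo	sc_rsel;	// read-side selinfo
 *		bool		sc_canread;	// hypothetical "data ready"
 *	};
 *
 *	static int
 *	example_poll(struct example_softc *sc, int events)
 *	{
 *		int revents = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		if ((events & (POLLIN | POLLRDNORM)) != 0) {
 *			if (sc->sc_canread) {
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			} else {
 *				// Nothing ready: register the calling LWP
 *				// so a later selnotify() can wake it.
 *				selrecord(curlwp, &sc->sc_rsel);
 *			}
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return revents;
 *	}
 *
 *	static void
 *	example_data_arrived(struct example_softc *sc)
 *	{
 *		mutex_enter(&sc->sc_lock);
 *		sc->sc_canread = true;
 *		// Wake named and colliding waiters, and deliver knotes.
 *		selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 *		mutex_exit(&sc->sc_lock);
 *	}
 *
 * selinit(&sc->sc_rsel) would be done at attach time and seldestroy()
 * at detach time, once no further selrecord()/selnotify() calls can be
 * made against the record.
 */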

/* Operations: either select() or poll(). */
const char selop_select[] = "select";
const char selop_poll[] = "poll";

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(sigset_t *) mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;

		if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
			return EINVAL;

		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
	lwp_t * const l = curlwp;
	selcluster_t *sc;
	kmutex_t *lock;
	struct timespec sleepts;
	int error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask))
		sigsuspendsetup(l, mask);

	/*
	 * We may context switch during or at any time after picking a CPU
	 * and cluster to associate with, but it doesn't matter.  In the
	 * unlikely event we migrate elsewhere all we risk is a little lock
	 * contention; correctness is not sacrificed.
	 */
	sc = curcpu()->ci_data.cpu_selcluster;
	lock = sc->sc_lock;
	l->l_selcluster = sc;

	if (opname == selop_select) {
		l->l_selbits = fds;
		l->l_selni = ni;
	} else {
		l->l_selbits = NULL;
	}

	for (;;) {
		int ncoll;

		SLIST_INIT(&l->l_selwait);
		l->l_selret = 0;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up to date value for new polling activity.
		 */
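		/*
		 * l_selflag protocol for this iteration: SEL_RESET or
		 * SEL_SCANNING is chosen just below before scanning; once
		 * we commit to sleeping it becomes SEL_BLOCKING, and a
		 * concurrent selnotify() may change it to SEL_RESET (a
		 * re-scan is needed) or, with direct select enabled, to
		 * SEL_EVENT (the results are recorded directly in
		 * l_selret).  A zero timeout with direct select disabled
		 * can never block, so it skips selrecord()/selclear()
		 * altogether.
		 */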
		if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
			/* Non-blocking: no need for selrecord()/selclear() */
			l->l_selflag = SEL_RESET;
		} else {
			l->l_selflag = SEL_SCANNING;
		}
		ncoll = sc->sc_ncoll;
		membar_release();

		if (opname == selop_select) {
			error = selscan((char *)fds, nf, ni, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nf, retval);
		}
		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		/*
		 * Acquire the lock and perform the (re)checks.  Note: if a
		 * collision has occurred, then our state does not matter,
		 * as we must perform a re-scan.  Therefore, check it first.
		 */
 state_check:
		mutex_spin_enter(lock);
		if (__predict_false(sc->sc_ncoll != ncoll)) {
			/* Collision: perform re-scan. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		if (__predict_true(l->l_selflag == SEL_EVENT)) {
			/* Events occurred, they are set directly. */
			mutex_spin_exit(lock);
			break;
		}
		if (__predict_true(l->l_selflag == SEL_RESET)) {
			/* Events occurred, but re-scan is requested. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		/* Nothing happened, therefore sleep. */
		l->l_selflag = SEL_BLOCKING;
		KASSERT(l->l_blcnt == 0);
		(void)sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
		error = sleepq_block(timo, true, &select_sobj, 0);
		if (error != 0) {
			break;
		}
		/* Awoken: need to check the state. */
		goto state_check;
	}
	selclear();

	/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}

	if (__predict_false(mask))
		sigsuspendteardown(l);

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}

/* Designed to be compatible with FD_SET(), FD_ISSET(), ... */
static int
anyset(void *p, size_t nbits)
{
	size_t nwords;
	__fd_mask mask;
	__fd_mask *f = (__fd_mask *)p;

	nwords = nbits / __NFDBITS;

	while (nwords-- > 0)
		if (*f++ != 0)
			return 1;

	nbits &= __NFDMASK;
	if (nbits != 0) {
		mask = (1U << nbits) - 1;
		if ((*f & mask) != 0)
			return 1;
	}
	return 0;
}

int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	char *bits;
	int error, nf, fb, db;
	size_t ni;

	if (nd < 0)
		return EINVAL;

	nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;

	/*
	 * Don't allow absurdly large numbers of fds to be selected.
	 * (used to silently truncate, naughty naughty, no more ...)
	 *
	 * The additional FD_SETSIZE allows for cases where the limit
	 * is not a round binary number, but the fd_set wants to
	 * include all the possible fds, as fd_sets are always
	 * multiples of 32 bits (__NFDBITS extra would be enough).
	 *
	 * The first test handles the case where the resource limit has
	 * been set lower after some fds were opened; we always allow
	 * selecting up to the highest currently open fd.
	 */
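	/*
	 * Worked example of the sizing checks below (illustrative numbers
	 * only): with nf = 100 descriptor table slots and a hard
	 * RLIMIT_NOFILE of 1024, any nd up to 1024 + FD_SETSIZE is
	 * accepted.  Each set is then copied using fb = howmany(100, 32)
	 * = 4 fd_masks; a caller passing nd = 200 supplies db = 7
	 * fd_masks per set, so only the 3 excess fd_masks of each set are
	 * checked for stray set bits before db is clamped back to fb.
	 */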
	if (nd > nf + FD_SETSIZE &&
	    nd > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + FD_SETSIZE)
		return EINVAL;

	fb = howmany(nf, __NFDBITS);	/* how many fd_masks */
	db = howmany(nd, __NFDBITS);

	if (db > fb) {
		size_t off;

		/*
		 * The application wants to supply more fd_masks than can
		 * possibly represent valid file descriptors.
		 *
		 * Check the excess fd_masks: if any bits are set in them,
		 * that must be an error (they cannot represent valid fds).
		 *
		 * Supplying lots of extra cleared fd_masks is dumb,
		 * but harmless, so allow that.
		 */
		ni = (db - fb) * sizeof(fd_mask);	/* excess bytes */
		bits = smallbits;

		/* skip over the valid fd_masks, those will be checked below */
		off = howmany(nf, __NFDBITS) * sizeof(__fd_mask);

		nd -= fb * NFDBITS;	/* the number of excess fds */

#define	checkbits(name, o, sz, fds) \
	do { \
		if (u_ ## name != NULL) { \
			error = copyin((char *)u_ ## name + o, \
			    bits, sz); \
			if (error) \
				goto fail; \
			if (anyset(bits, (fds) ? \
			    (size_t)(fds) : CHAR_BIT * (sz))) { \
				error = EBADF; \
				goto fail; \
			} \
		} \
	} while (0)

		while (ni > sizeof(smallbits)) {
			checkbits(in, off, sizeof(smallbits), 0);
			checkbits(ou, off, sizeof(smallbits), 0);
			checkbits(ex, off, sizeof(smallbits), 0);

			off += sizeof(smallbits);
			ni -= sizeof(smallbits);
			nd -= sizeof(smallbits) * CHAR_BIT;
		}
		checkbits(in, off, ni, nd);
		checkbits(ou, off, ni, nd);
		checkbits(ex, off, ni, nd);
#undef checkbits

		db = fb;	/* now just check the plausible fds */
		nd = db * __NFDBITS;
	}

	ni = db * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

#define	getbits(name, x) \
	do { \
		if (u_ ## name) { \
			error = copyin(u_ ## name, bits + ni * x, ni); \
			if (error) \
				goto fail; \
		} else \
			memset(bits + ni * x, 0, ni); \
	} while (0)

	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits

	error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);

#define	copyback(name, x) \
	do { \
		if (error == 0 && u_ ## name != NULL) \
			error = copyout(bits + ni * x, \
			    u_ ## name, ni); \
	} while (0)

	copyback(in, 3);
	copyback(ou, 4);
	copyback(ex, 5);
#undef copyback

 fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
	fd_mask *ibitp, *obitp;
	int msk, i, j, fd, n;
	file_t *fp;
	lwp_t *l;

	ibitp = (fd_mask *)(bits + ni * 0);
	obitp = (fd_mask *)(bits + ni * 3);
	n = 0;
	l = curlwp;

	memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			fd_mask ibits, obits;

			ibits = *ibitp;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1U << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				/*
				 * Set up an argument to selrecord(), which
				 * is a file descriptor number.
				 */
				l->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					if (!direct_select) {
						/*
						 * Have events: do nothing in
						 * selrecord().
						 */
						l->l_selflag = SEL_RESET;
					}
					obits |= (1U << j);
					n++;
				}
				fd_putfile(fd);
			}
			if (obits != 0) {
				if (direct_select) {
					kmutex_t *lock;
					lock = l->l_selcluster->sc_lock;
					mutex_spin_enter(lock);
					*obitp |= obits;
					mutex_spin_exit(lock);
				} else {
					*obitp |= obits;
				}
			}
			ibitp++;
			obitp++;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */
	struct timespec ats, *ts = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd smallfds[32];
	struct pollfd *fds;
	int error;
	size_t ni;

	if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
		/*
		 * Prevent userland from causing over-allocation.
		 * Raising the default limit too high can still cause
		 * a lot of memory to be allocated, but this also means
		 * that the file descriptor array will be large.
		 *
		 * To reduce the memory requirements here, we could
		 * process the 'fds' array in chunks, but that
		 * is a lot of code that isn't normally useful.
		 * (Or just move the copyin/out into pollscan().)
		 *
		 * Historically the code silently truncated 'fds' to
		 * dt_nfiles entries - but that does cause issues.
		 *
		 * Using the max limit equivalent to sysctl
		 * kern.maxfiles is the moral equivalent of OPEN_MAX
		 * as specified by POSIX.
		 *
		 * We add a slop of 1000 in case the resource limit was
		 * changed after opening descriptors or the same descriptor
		 * was specified more than once.
		 */
		return EINVAL;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds))
		fds = kmem_alloc(ni, KM_SLEEP);
	else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto fail;

	error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
 fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
	file_t *fp;
	int i, n = 0, revents;

	for (i = 0; i < nfd; i++, fds++) {
		fds->revents = 0;
		if (fds->fd < 0) {
			revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			revents = POLLNVAL;
		} else {
			/*
			 * Perform poll: registers a select request or
			 * returns the events which are set.  Set up an
			 * argument for selrecord(), which is a pointer to
			 * the struct pollfd.
			 */
			curlwp->l_selrec = (uintptr_t)fds;
			revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			fd_putfile(fds->fd);
		}
		if (revents) {
			if (!direct_select) {
				/* Have events: do nothing in selrecord(). */
				curlwp->l_selflag = SEL_RESET;
			}
			fds->revents = revents;
			n++;
		}
	}
	*retval = n;
	return (0);
}

int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions is true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	other = sip->sel_lwp;

	if (selector->l_selflag == SEL_RESET) {
		/* 0. We're not going to block - will poll again if needed. */
	} else if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 */
		membar_acquire();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/* 3. Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}

/*
 * Record a knote.
 *
 * The caller holds the same lock as for selrecord().
 */
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
	klist_insert(&sip->sel_klist, kn);
}

/*
 * Remove a knote.
 *
 * The caller holds the same lock as for selrecord().
 *
 * Returns true if the last knote was removed and the list
 * is now empty.
 */
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
	return klist_remove(&sip->sel_klist, kn);
}

/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for the LWP sleeping in selcommon() or pollcommon().
 */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;
	int ret = 0;

	/*
	 * If we require a re-scan, or one was required by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return true;
	}
	/*
	 * Direct set.  Note: the select state of the LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
	if (l->l_selbits != NULL) {
		const size_t ni = l->l_selni;
		fd_mask *fds = (fd_mask *)l->l_selbits;
		fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
		const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
		const int idx = fd >> __NFDSHIFT;
		int n;

		for (n = 0; n < 3; n++) {
			if ((fds[idx] & fbit) != 0 &&
			    (ofds[idx] & fbit) == 0 &&
			    (sel_flag[n] & events)) {
				ofds[idx] |= fbit;
				ret++;
			}
			fds = (fd_mask *)((char *)fds + ni);
			ofds = (fd_mask *)((char *)ofds + ni);
		}
	} else {
		struct pollfd *pfd = (void *)sip->sel_fdinfo;
		int revents = events & (pfd->events | POLLERR | POLLHUP);

		if (revents) {
			if (pfd->revents == 0)
				ret = 1;
			pfd->revents |= revents;
		}
	}
	/* Check whether there are any events to return. */
	if (!ret) {
		return false;
	}
	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret += ret;
	return true;
}

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcluster's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in sel_do_scan().
 *
 * sip->sel_cluster cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcluster_t *sc;
	uint64_t mask;
	int index, oflag;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cluster;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			/*
			 * Set the events for our LWP and indicate that.
			 * Otherwise, request a full re-scan.
			 */
			l = sip->sel_lwp;
			oflag = l->l_selflag;

			if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
				/* No events to return. */
				mutex_spin_exit(lock);
				return;
			}

			/*
			 * If the thread is sleeping, wake it up.  If it's
			 * not yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_remove(l->l_sleepq, l, true);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs64(mask) - 1;
			mask ^= __BIT(index);
			sc = selcluster[index];
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	/*
	 * If the request was non-blocking, or we found events on the first
	 * descriptor, there will be no need to clear anything - avoid
	 * taking the lock.
	 */
	if (SLIST_EMPTY(&l->l_selwait)) {
		return;
	}

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read the link to the next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		/* Release the record for another named waiter to use. */
		atomic_store_release(&sip->sel_lwp, NULL);
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcluster_t *sc;
	u_int index;

	/* If there is already a cluster in place for this bit, re-use it. */
	index = cpu_index(ci) & SELCLUSTERMASK;
	sc = selcluster[index];
	if (sc == NULL) {
		sc = kmem_alloc(roundup2(sizeof(selcluster_t),
		    coherency_unit) + coherency_unit, KM_SLEEP);
		sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
		sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		sleepq_init(&sc->sc_sleepq);
		sc->sc_ncoll = 0;
		sc->sc_mask = __BIT(index);
		selcluster[index] = sc;
	}
	ci->ci_data.cpu_selcluster = sc;
}
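
/*
 * Note that on a machine with more than SELCLUSTERS CPUs the clusters
 * are shared: e.g. with 96 CPUs, CPU 70 hashes to index 6 and shares
 * its cluster (lock, sleep queue and collision bit) with CPU 6.
 */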

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
	klist_init(&sip->sel_klist);
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * will not call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcluster_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	klist_fini(&sip->sel_klist);

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcluster pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cluster != NULL);
	sc = sip->sel_cluster;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}

/*
 * System control nodes.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "direct_select",
	    SYSCTL_DESCR("Enable/disable direct select (for testing)"),
	    NULL, 0, &direct_select, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}