/* $NetBSD: sys_select.c,v 1.15 2009/05/24 21:41:26 ad Exp $ */

/*-
 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.15 2009/05/24 21:41:26 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/sleepq.h>

/* Flags for lwp::l_selflag. */
#define SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define SEL_SCANNING	1	/* polling descriptors */
#define SEL_BLOCKING	2	/* about to block on the per-CPU sleep queue */

/* Per-CPU state for select()/poll(). */
#if MAXCPUS > 32
#error adjust this code
#endif
typedef struct selcpu {
        kmutex_t	*sc_lock;
        sleepq_t	sc_sleepq;
        int		sc_ncoll;
        uint32_t	sc_mask;
} selcpu_t;
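
/*
 * Outline of operation:
 *
 * An LWP entering select() or poll() attaches itself to the selcpu of
 * the CPU it is running on and tracks its progress in l_selflag: it is
 * SEL_SCANNING while the descriptors are being polled with fo_poll and
 * SEL_BLOCKING once it has committed to sleeping on sc_sleepq.
 * selnotify() resets the flag to SEL_RESET; an LWP still scanning
 * notices the change and re-polls, while one already blocking is woken
 * from the sleep queue.
 *
 * sc_mask holds a single bit identifying the CPU, which is why MAXCPUS
 * is limited to 32 above.  When more than one LWP waits on the same
 * selinfo, selrecord() ORs the additional waiters' CPU bits into
 * sel_collision, and selnotify() then wakes the sleep queue of every
 * CPU named in that mask.
 */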

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

static syncobj_t select_sobj = {
        SOBJ_SLEEPQ_FIFO,
        sleepq_unsleep,
        sleepq_changepri,
        sleepq_lendpri,
        syncobj_noowner,
};

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(int)				nd;
                syscallarg(fd_set *)			in;
                syscallarg(fd_set *)			ou;
                syscallarg(fd_set *)			ex;
                syscallarg(const struct timespec *)	ts;
                syscallarg(sigset_t *)			mask;
        } */
        struct timespec ats, *ts = NULL;
        sigset_t amask, *mask = NULL;
        int error;

        if (SCARG(uap, ts)) {
                error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
                if (error)
                        return error;
                ts = &ats;
        }
        if (SCARG(uap, mask) != NULL) {
                error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
                if (error)
                        return error;
                mask = &amask;
        }

        return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
            SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
        if (itimespecfix(ts))
                return -1;
        getnanouptime(sleepts);
        return 0;
}
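
/*
 * inittimeleft() records the current uptime in `sleepts' as a
 * reference point.  gettimeleft() is called before each re-sleep: on
 * entry `ts' holds the time that remained as of the uptime stored in
 * `sleepts', so adding the old stamp and subtracting the current
 * uptime computes ts -= (now - sleepts); `sleepts' is then advanced to
 * `now' for the next round.  The remainder is returned in ticks, ready
 * to be passed to sleepq_block().
 */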

int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
        /*
         * We have to recalculate the timeout on every retry.
         */
        struct timespec sleptts;
        /*
         * reduce ts by elapsed time
         * based on monotonic time scale
         */
        getnanouptime(&sleptts);
        timespecadd(ts, sleepts, ts);
        timespecsub(ts, &sleptts, ts);
        *sleepts = sleptts;
        return tstohz(ts);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(int)			nd;
                syscallarg(fd_set *)		in;
                syscallarg(fd_set *)		ou;
                syscallarg(fd_set *)		ex;
                syscallarg(struct timeval *)	tv;
        } */
        struct timeval atv;
        struct timespec ats, *ts = NULL;
        int error;

        if (SCARG(uap, tv)) {
                error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
                if (error)
                        return error;
                TIMEVAL_TO_TIMESPEC(&atv, &ats);
                ts = &ats;
        }

        return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
            SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
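
/*
 * Common code for select() and pselect().  The three input sets and
 * the three result sets share a single buffer of six regions of `ni'
 * bytes each: regions 0-2 hold the in/ou/ex sets copied in from
 * userspace, and selscan() writes its results into regions 3-5, which
 * are copied back out on success.
 */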

int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    fd_set *u_ou, fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
        char smallbits[howmany(FD_SETSIZE, NFDBITS) *
            sizeof(fd_mask) * 6];
        proc_t * const p = l->l_proc;
        char *bits;
        int ncoll, error, timo, nf;
        size_t ni;
        sigset_t oldmask;
        struct timespec sleepts;
        selcpu_t *sc;
        kmutex_t *lock;

        error = 0;
        if (nd < 0)
                return (EINVAL);
        nf = p->p_fd->fd_dt->dt_nfiles;
        if (nd > nf) {
                /* forgiving; slightly wrong */
                nd = nf;
        }
        ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
        if (ni * 6 > sizeof(smallbits)) {
                bits = kmem_alloc(ni * 6, KM_SLEEP);
                if (bits == NULL)
                        return ENOMEM;
        } else
                bits = smallbits;

#define getbits(name, x) \
        if (u_ ## name) { \
                error = copyin(u_ ## name, bits + ni * x, ni); \
                if (error) \
                        goto done; \
        } else \
                memset(bits + ni * x, 0, ni);
        getbits(in, 0);
        getbits(ou, 1);
        getbits(ex, 2);
#undef getbits

        timo = 0;
        if (ts && inittimeleft(ts, &sleepts) == -1) {
                error = EINVAL;
                goto done;
        }

        if (mask) {
                sigminusset(&sigcantmask, mask);
                mutex_enter(p->p_lock);
                oldmask = l->l_sigmask;
                l->l_sigmask = *mask;
                mutex_exit(p->p_lock);
        } else
                oldmask = l->l_sigmask;	/* XXXgcc */

        sc = curcpu()->ci_data.cpu_selcpu;
        lock = sc->sc_lock;
        l->l_selcpu = sc;
        SLIST_INIT(&l->l_selwait);
        for (;;) {
                /*
                 * No need to lock.  If this is overwritten by another
                 * value while scanning, we will retry below.  We only
                 * need to see exact state from the descriptors that
                 * we are about to poll, and lock activity resulting
                 * from fo_poll is enough to provide an up to date value
                 * for new polling activity.
                 */
                l->l_selflag = SEL_SCANNING;
                ncoll = sc->sc_ncoll;

                error = selscan(l, (fd_mask *)(bits + ni * 0),
                    (fd_mask *)(bits + ni * 3), nd, retval);

                if (error || *retval)
                        break;
                if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
                        break;
                mutex_spin_enter(lock);
                if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
                        mutex_spin_exit(lock);
                        continue;
                }
                l->l_selflag = SEL_BLOCKING;
                l->l_kpriority = true;
                sleepq_enter(&sc->sc_sleepq, l, lock);
                sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
                error = sleepq_block(timo, true);
                if (error != 0)
                        break;
        }
        selclear();

        if (mask) {
                mutex_enter(p->p_lock);
                l->l_sigmask = oldmask;
                mutex_exit(p->p_lock);
        }

 done:
        /* select is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0 && u_in != NULL)
                error = copyout(bits + ni * 3, u_in, ni);
        if (error == 0 && u_ou != NULL)
                error = copyout(bits + ni * 4, u_ou, ni);
        if (error == 0 && u_ex != NULL)
                error = copyout(bits + ni * 5, u_ex, ni);
        if (bits != smallbits)
                kmem_free(bits, ni * 6);
        return (error);
}
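
/*
 * Scan the three descriptor sets for select().  Each set is checked
 * with fo_poll using the poll events that correspond to it: the read
 * set maps to POLLRDNORM (plus POLLHUP and POLLERR), the write set to
 * POLLWRNORM (plus POLLHUP and POLLERR), and the exceptional set to
 * POLLRDBAND.
 */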

int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    register_t *retval)
{
        static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
            POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND };
        int msk, i, j, fd, n;
        fd_mask ibits, obits;
        file_t *fp;

        n = 0;
        for (msk = 0; msk < 3; msk++) {
                for (i = 0; i < nfd; i += NFDBITS) {
                        ibits = *ibitp++;
                        obits = 0;
                        while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
                                ibits &= ~(1 << j);
                                if ((fp = fd_getfile(fd)) == NULL)
                                        return (EBADF);
                                if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
                                        obits |= (1 << j);
                                        n++;
                                }
                                fd_putfile(fd);
                        }
                        *obitp++ = obits;
                }
        }
        *retval = n;
        return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
        /* {
                syscallarg(struct pollfd *)	fds;
                syscallarg(u_int)		nfds;
                syscallarg(int)			timeout;
        } */
        struct timespec ats, *ts = NULL;

        if (SCARG(uap, timeout) != INFTIM) {
                ats.tv_sec = SCARG(uap, timeout) / 1000;
                ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
                ts = &ats;
        }

        return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
            ts, NULL);
}

/*
 * pollts(2): poll, but with the timeout supplied as a timespec and an
 * optional signal mask installed for the duration of the call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(struct pollfd *)		fds;
                syscallarg(u_int)			nfds;
                syscallarg(const struct timespec *)	ts;
                syscallarg(const sigset_t *)		mask;
        } */
        struct timespec ats, *ts = NULL;
        sigset_t amask, *mask = NULL;
        int error;

        if (SCARG(uap, ts)) {
                error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
                if (error)
                        return error;
                ts = &ats;
        }
        if (SCARG(uap, mask)) {
                error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
                if (error)
                        return error;
                mask = &amask;
        }

        return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
            ts, mask);
}

int
pollcommon(lwp_t *l, register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
        struct pollfd smallfds[32];
        struct pollfd *fds;
        proc_t * const p = l->l_proc;
        sigset_t oldmask;
        int ncoll, error, timo;
        size_t ni, nf;
        struct timespec sleepts;
        selcpu_t *sc;
        kmutex_t *lock;

        nf = p->p_fd->fd_dt->dt_nfiles;
        if (nfds > nf) {
                /* forgiving; slightly wrong */
                nfds = nf;
        }
        ni = nfds * sizeof(struct pollfd);
        if (ni > sizeof(smallfds)) {
                fds = kmem_alloc(ni, KM_SLEEP);
                if (fds == NULL)
                        return ENOMEM;
        } else
                fds = smallfds;

        error = copyin(u_fds, fds, ni);
        if (error)
                goto done;

        timo = 0;
        if (ts && inittimeleft(ts, &sleepts) == -1) {
                error = EINVAL;
                goto done;
        }

        if (mask) {
                sigminusset(&sigcantmask, mask);
                mutex_enter(p->p_lock);
                oldmask = l->l_sigmask;
                l->l_sigmask = *mask;
                mutex_exit(p->p_lock);
        } else
                oldmask = l->l_sigmask;	/* XXXgcc */

        sc = curcpu()->ci_data.cpu_selcpu;
        lock = sc->sc_lock;
        l->l_selcpu = sc;
        SLIST_INIT(&l->l_selwait);
        for (;;) {
                /*
                 * No need to lock.  If this is overwritten by another
                 * value while scanning, we will retry below.  We only
                 * need to see exact state from the descriptors that
                 * we are about to poll, and lock activity resulting
                 * from fo_poll is enough to provide an up to date value
                 * for new polling activity.
                 */
                ncoll = sc->sc_ncoll;
                l->l_selflag = SEL_SCANNING;

                error = pollscan(l, fds, nfds, retval);

                if (error || *retval)
                        break;
                if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
                        break;
                mutex_spin_enter(lock);
                if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
                        mutex_spin_exit(lock);
                        continue;
                }
                l->l_selflag = SEL_BLOCKING;
                l->l_kpriority = true;
                sleepq_enter(&sc->sc_sleepq, l, lock);
                sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
                error = sleepq_block(timo, true);
                if (error != 0)
                        break;
        }
        selclear();

        if (mask) {
                mutex_enter(p->p_lock);
                l->l_sigmask = oldmask;
                mutex_exit(p->p_lock);
        }
 done:
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0)
                error = copyout(fds, u_fds, ni);
        if (fds != smallfds)
                kmem_free(fds, ni);
        return (error);
}
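
/*
 * Check each pollfd entry.  A negative fd reports no events, an fd
 * that does not name an open file reports POLLNVAL, and anything else
 * is handed to its fo_poll routine with POLLERR and POLLHUP always
 * added to the interest set, since poll(2) reports those conditions
 * regardless of what the caller asked for.
 */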

int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
        int i, n;
        file_t *fp;

        n = 0;
        for (i = 0; i < nfd; i++, fds++) {
                if (fds->fd < 0) {
                        fds->revents = 0;
                } else if ((fp = fd_getfile(fds->fd)) == NULL) {
                        fds->revents = POLLNVAL;
                        n++;
                } else {
                        fds->revents = (*fp->f_ops->fo_poll)(fp,
                            fds->events | POLLERR | POLLHUP);
                        if (fds->revents != 0)
                                n++;
                        fd_putfile(fds->fd);
                }
        }
        *retval = n;
        return (0);
}
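
/*
 * Generic "always ready" poll routine: report the descriptor as
 * readable and writable for whichever of those events the caller asked
 * about.  Intended for drivers whose devices never block for I/O.
 */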

/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

        return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting selcommon() or pollcommon().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
        selcpu_t *sc;
        lwp_t *other;

        KASSERT(selector == curlwp);

        sc = selector->l_selcpu;
        other = sip->sel_lwp;

        if (other == selector) {
                /* `selector' has already claimed it. */
                KASSERT(sip->sel_cpu == sc);
        } else if (other == NULL) {
                /*
                 * First named waiter, although there may be unnamed
                 * waiters (collisions).  Issue a memory barrier to
                 * ensure that we access sel_lwp (above) before other
                 * fields - this guards against a call to selclear().
                 */
                membar_enter();
                sip->sel_lwp = selector;
                SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
                /* Replace selinfo's lock with our chosen CPU's lock. */
                sip->sel_cpu = sc;
        } else {
                /* Multiple waiters: record a collision. */
                sip->sel_collision |= sc->sc_mask;
                KASSERT(sip->sel_cpu != NULL);
        }
}

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcpu's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in selcommon() and/or pollcommon().
 *
 * sip->sel_cpu cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
        selcpu_t *sc;
        uint32_t mask;
        int index, oflag, swapin;
        lwp_t *l;
        kmutex_t *lock;

        KNOTE(&sip->sel_klist, knhint);

        if (sip->sel_lwp != NULL) {
                /* One named LWP is waiting. */
                swapin = 0;
                sc = sip->sel_cpu;
                lock = sc->sc_lock;
                mutex_spin_enter(lock);
                /* Still there? */
                if (sip->sel_lwp != NULL) {
                        l = sip->sel_lwp;
                        /*
                         * If thread is sleeping, wake it up.  If it's not
                         * yet asleep, it will notice the change in state
                         * and will re-poll the descriptors.
                         */
                        oflag = l->l_selflag;
                        l->l_selflag = SEL_RESET;
                        if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
                                KASSERT(l->l_wchan == sc);
                                swapin = sleepq_unsleep(l, false);
                        }
                }
                mutex_spin_exit(lock);
                if (swapin)
                        uvm_kick_scheduler();
        }

        if ((mask = sip->sel_collision) != 0) {
                /*
                 * There was a collision (multiple waiters): we must
                 * inform all potentially interested waiters.
                 */
                sip->sel_collision = 0;
                do {
                        index = ffs(mask) - 1;
                        mask &= ~(1 << index);
                        sc = cpu_lookup(index)->ci_data.cpu_selcpu;
                        lock = sc->sc_lock;
                        mutex_spin_enter(lock);
                        sc->sc_ncoll++;
                        sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
                } while (__predict_false(mask != 0));
        }
}

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
        struct selinfo *sip, *next;
        selcpu_t *sc;
        lwp_t *l;
        kmutex_t *lock;

        l = curlwp;
        sc = l->l_selcpu;
        lock = sc->sc_lock;

        mutex_spin_enter(lock);
        for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
                KASSERT(sip->sel_lwp == l);
                KASSERT(sip->sel_cpu == l->l_selcpu);
                /*
                 * Read link to next selinfo record, if any.
                 * It's no longer safe to touch `sip' after clearing
                 * `sel_lwp', so ensure that the read of `sel_chain'
                 * completes before the clearing of sel_lwp becomes
                 * globally visible.
                 */
                next = SLIST_NEXT(sip, sel_chain);
                membar_exit();
                /* Release the record for another named waiter to use. */
                sip->sel_lwp = NULL;
        }
        mutex_spin_exit(lock);
}
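
/*
 * The per-CPU state allocated below is padded and aligned to
 * coherency_unit so that each selcpu_t occupies its own cache line(s)
 * and is not falsely shared between CPUs.
 */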

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
        selcpu_t *sc;

        sc = kmem_alloc(roundup2(sizeof(selcpu_t), coherency_unit) +
            coherency_unit, KM_SLEEP);
        sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
        sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
        sleepq_init(&sc->sc_sleepq);
        sc->sc_ncoll = 0;
        sc->sc_mask = (1 << cpu_index(ci));
        ci->ci_data.cpu_selcpu = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

        memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to
 * selclear() by a thread exiting selcommon() and/or pollcommon().
 * The caller has prevented further references being made to the
 * selinfo record via selrecord(), and it won't call selwakeup() again.
 */
void
seldestroy(struct selinfo *sip)
{
        selcpu_t *sc;
        kmutex_t *lock;
        lwp_t *l;

        if (sip->sel_lwp == NULL)
                return;

        /*
         * Lock out selclear().  The selcpu pointer can't change while
         * we are here since it is only ever changed in selrecord(),
         * and that will not be entered again for this record because
         * it is dying.
         */
        KASSERT(sip->sel_cpu != NULL);
        sc = sip->sel_cpu;
        lock = sc->sc_lock;
        mutex_spin_enter(lock);
        if ((l = sip->sel_lwp) != NULL) {
                /*
                 * This should rarely happen, so although SLIST_REMOVE()
                 * is slow, using it here is not a problem.
                 */
                KASSERT(l->l_selcpu == sc);
                SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
                sip->sel_lwp = NULL;
        }
        mutex_spin_exit(lock);
}

int
pollsock(struct socket *so, const struct timespec *tsp, int events)
{
        int ncoll, error, timo;
        struct timespec sleepts, ts;
        selcpu_t *sc;
        lwp_t *l;
        kmutex_t *lock;

        timo = 0;
        if (tsp != NULL) {
                ts = *tsp;
                if (inittimeleft(&ts, &sleepts) == -1)
                        return EINVAL;
        }

        l = curlwp;
        sc = l->l_cpu->ci_data.cpu_selcpu;
        lock = sc->sc_lock;
        l->l_selcpu = sc;
        SLIST_INIT(&l->l_selwait);
        error = 0;
        for (;;) {
                /*
                 * No need to lock.  If this is overwritten by another
                 * value while scanning, we will retry below.  We only
                 * need to see exact state from the descriptors that
                 * we are about to poll, and lock activity resulting
                 * from fo_poll is enough to provide an up to date value
                 * for new polling activity.
                 */
                ncoll = sc->sc_ncoll;
                l->l_selflag = SEL_SCANNING;
                if (sopoll(so, events) != 0)
                        break;
                if (tsp && (timo = gettimeleft(&ts, &sleepts)) <= 0)
                        break;
                mutex_spin_enter(lock);
                if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
                        mutex_spin_exit(lock);
                        continue;
                }
                l->l_selflag = SEL_BLOCKING;
                sleepq_enter(&sc->sc_sleepq, l, lock);
                sleepq_enqueue(&sc->sc_sleepq, sc, "pollsock", &select_sobj);
                error = sleepq_block(timo, true);
                if (error != 0)
                        break;
        }
        selclear();
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        return (error);
}