/*	$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls of the synchronous I/O multiplexing subsystem.
 *
 * Locking
 *
 * Two locks are used: <object-lock> and selcluster_t::sc_lock.
 *
 * The <object-lock> might belong to a device driver or to another
 * subsystem, e.g. socket or pipe.  This lock is not exported, and thus
 * invisible to this subsystem.  Mainly, synchronisation between the
 * selrecord() and selnotify() routines depends on this lock, as described
 * in the comments below.
 *
 * Lock order
 *
 *	<object-lock> ->
 *		selcluster_t::sc_lock
 */
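
#if 0
/*
 * Illustrative sketch (not compiled): the driver-side pattern that the
 * locking rules above assume.  All names here (exdev_softc, exdev_poll,
 * exdev_intr) are hypothetical.  The driver's own lock, sc_objlock, is
 * the <object-lock>: it covers both the selrecord() call made from the
 * driver's poll routine and the selnotify() call made when data arrives,
 * which is what lets those two routines synchronise.  sc_rsel would be
 * set up with selinit() at attach time and torn down with seldestroy()
 * at detach time.
 */
struct exdev_softc {
	kmutex_t	sc_objlock;	/* the <object-lock> */
	struct selinfo	sc_rsel;	/* LWPs waiting in select/poll */
	bool		sc_ready;	/* data available to read */
};

static int
exdev_poll(struct exdev_softc *sc, int events, struct lwp *l)
{
	int revents = 0;

	mutex_enter(&sc->sc_objlock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_ready)
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_objlock);
	return revents;
}

static void
exdev_intr(struct exdev_softc *sc)
{

	mutex_enter(&sc->sc_objlock);
	sc->sc_ready = true;
	/* Hint 0 here; the knote hint depends on the driver's klist usage. */
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_objlock);
}
#endif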

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $");

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* blocking and waiting for event */
#define	SEL_EVENT	3	/* interrupted, events set directly */

/*
 * Per-cluster state for select()/poll().  For a system with fewer
 * than 64 CPUs, this gives us per-CPU clusters.
 */
#define	SELCLUSTERS	64
#define	SELCLUSTERMASK	(SELCLUSTERS - 1)

typedef struct selcluster {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	uint64_t	sc_mask;
	int		sc_ncoll;
} selcluster_t;

static inline int	selscan(char *, const int, const size_t, register_t *);
static inline int	pollscan(struct pollfd *, const int, register_t *);
static void		selclear(void);

static const int sel_flag[] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND
};

/*
 * LWPs are woken using the sleep queue only due to a collision, the case
 * with the maximum Suck Factor.  Save the cost of sorting for named waiters
 * by inserting in LIFO order.  In the future it would be preferable to not
 * enqueue LWPs at all, unless subject to a collision.
 */
syncobj_t select_sobj = {
	.sobj_name	= "select",
	.sobj_flag	= SOBJ_SLEEPQ_LIFO,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
static int		direct_select __read_mostly = 0;

/* Operations: either select() or poll(). */
const char	selop_select[] = "select";
const char	selop_poll[] = "poll";
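
/*
 * Overview of the l_selflag state machine driven by sel_do_scan(),
 * selrecord() and selnotify() (a summary of the rules spelled out at
 * each site below):
 *
 *	SEL_RESET    -> the scan will be retried, or no blocking was
 *	               needed in the first place (non-blocking call);
 *	SEL_SCANNING -> set before each scan pass; selnotify() knocks
 *	               the LWP back to SEL_RESET to force a re-scan, or
 *	               straight to SEL_EVENT if the events can be
 *	               delivered directly;
 *	SEL_BLOCKING -> the scan found nothing and the LWP is (about to
 *	               go) asleep on its cluster's sleep queue;
 *	SEL_EVENT    -> events were written directly by sel_setevents(),
 *	               so no re-scan is needed.
 */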

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;

		if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
			return EINVAL;

		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
	lwp_t		* const l = curlwp;
	selcluster_t	*sc;
	kmutex_t	*lock;
	struct timespec	sleepts;
	int		error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask))
		sigsuspendsetup(l, mask);
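
	/*
	 * The loop below works roughly as follows: snapshot sc_ncoll,
	 * mark ourselves SEL_SCANNING and scan the descriptors; if
	 * nothing is ready, re-check the snapshot and our own state
	 * under sc_lock and either retry the scan (collision or
	 * SEL_RESET), return directly delivered events (SEL_EVENT),
	 * or sleep (SEL_BLOCKING) until selnotify() wakes us.
	 */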

	/*
	 * We may context switch during or at any time after picking a CPU
	 * and cluster to associate with, but it doesn't matter.  In the
	 * unlikely event we migrate elsewhere all we risk is a little lock
	 * contention; correctness is not sacrificed.
	 */
	sc = curcpu()->ci_data.cpu_selcluster;
	lock = sc->sc_lock;
	l->l_selcluster = sc;

	if (opname == selop_select) {
		l->l_selbits = fds;
		l->l_selni = ni;
	} else {
		l->l_selbits = NULL;
	}

	for (;;) {
		int ncoll;

		SLIST_INIT(&l->l_selwait);
		l->l_selret = 0;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up to date value for new polling activity.
		 */
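		/*
		 * Note the bitwise-OR test below: tv_sec, tv_nsec and
		 * direct_select are all non-negative here, so their OR
		 * is zero if and only if all three are zero, i.e. a
		 * zero timeout (pure poll) with direct select disabled.
		 */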
		if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
			/* Non-blocking: no need for selrecord()/selclear() */
			l->l_selflag = SEL_RESET;
		} else {
			l->l_selflag = SEL_SCANNING;
		}
		ncoll = sc->sc_ncoll;
		membar_release();

		if (opname == selop_select) {
			error = selscan((char *)fds, nf, ni, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nf, retval);
		}
		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		/*
		 * Acquire the lock and perform the (re)checks.  Note: if a
		 * collision has occurred, our state does not matter, as we
		 * must perform a re-scan.  Therefore, check it first.
		 */
state_check:
		mutex_spin_enter(lock);
		if (__predict_false(sc->sc_ncoll != ncoll)) {
			/* Collision: perform re-scan. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		if (__predict_true(l->l_selflag == SEL_EVENT)) {
			/* Events occurred; they are set directly. */
			mutex_spin_exit(lock);
			break;
		}
		if (__predict_true(l->l_selflag == SEL_RESET)) {
			/* Events occurred, but a re-scan is requested. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		/* Nothing happened, therefore sleep. */
		l->l_selflag = SEL_BLOCKING;
		KASSERT(l->l_blcnt == 0);
		(void)sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
		error = sleepq_block(timo, true, &select_sobj, 0);
		if (error != 0) {
			break;
		}
		/* Awoken: need to check the state. */
		goto state_check;
	}
	selclear();

	/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}

	if (__predict_false(mask))
		sigsuspendteardown(l);

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}

/* designed to be compatible with FD_SET() FD_ISSET() ... */
static int
anyset(void *p, size_t nbits)
{
	size_t nwords;
	__fd_mask mask;
	__fd_mask *f = (__fd_mask *)p;

	nwords = nbits / __NFDBITS;

	while (nwords-- > 0)
		if (*f++ != 0)
			return 1;

	nbits &= __NFDMASK;
	if (nbits != 0) {
		mask = (1U << nbits) - 1;
		if ((*f & mask) != 0)
			return 1;
	}
	return 0;
}

int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	char		*bits;
	int		error, nf, fb, db;
	size_t		ni;

	if (nd < 0)
		return EINVAL;

	nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;

	/*
	 * Don't allow absurdly large numbers of fds to be selected.
	 * (used to silently truncate, naughty naughty, no more ...)
	 *
	 * The additional FD_SETSIZE allows for cases where the limit
	 * is not a round binary number, but the fd_set wants to
	 * include all the possible fds, as fd_sets are always
	 * multiples of 32 bits (__NFDBITS extra would be enough).
	 *
	 * The first test handles the case where the resource limit has
	 * been set lower after some fds were opened; we always allow
	 * selecting up to the highest currently open fd.
	 */
	if (nd > nf + FD_SETSIZE &&
	    nd > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + FD_SETSIZE)
		return EINVAL;

	fb = howmany(nf, __NFDBITS);	/* how many fd_masks */
	db = howmany(nd, __NFDBITS);

	if (db > fb) {
		size_t off;

		/*
		 * The application wants to supply more fd masks than can
		 * possibly represent valid file descriptors.
		 *
		 * Check the excess fd_masks: if any bits are set in them,
		 * that must be an error (they cannot represent a valid fd).
		 *
		 * Supplying lots of extra cleared fd_masks is dumb,
		 * but harmless, so allow that.
		 */
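		/*
		 * Worked example (illustrative, assuming 32-bit fd_mask):
		 * with nf = 50 open files and nd = 200, fb = 2 and db = 7,
		 * so the last 5 fd_masks (fds 64..199) cannot name an open
		 * file and any bit set there yields EBADF.  The in-between
		 * fds 50..63 live in the "plausible" masks and, if set,
		 * are caught later when fd_getfile() fails.
		 */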
		ni = (db - fb) * sizeof(fd_mask);	/* excess bytes */
		bits = smallbits;

		/* skip over the valid fd_masks, those will be checked below */
		off = howmany(nf, __NFDBITS) * sizeof(__fd_mask);

		nd -= fb * NFDBITS;		/* the number of excess fds */

#define	checkbits(name, o, sz, fds)					\
	do {								\
		if (u_ ## name != NULL) {				\
			error = copyin((char *)u_ ## name + o,		\
			    bits, sz);					\
			if (error)					\
				goto fail;				\
			if (anyset(bits, (fds) ?			\
			    (size_t)(fds) : CHAR_BIT * (sz))) {		\
				error = EBADF;				\
				goto fail;				\
			}						\
		}							\
	} while (0)

		while (ni > sizeof(smallbits)) {
			checkbits(in, off, sizeof(smallbits), 0);
			checkbits(ou, off, sizeof(smallbits), 0);
			checkbits(ex, off, sizeof(smallbits), 0);

			off += sizeof(smallbits);
			ni -= sizeof(smallbits);
			nd -= sizeof(smallbits) * CHAR_BIT;
		}
		checkbits(in, off, ni, nd);
		checkbits(ou, off, ni, nd);
		checkbits(ex, off, ni, nd);
#undef checkbits

		db = fb;	/* now just check the plausible fds */
		nd = db * __NFDBITS;
	}

	ni = db * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

#define	getbits(name, x)						\
	do {								\
		if (u_ ## name) {					\
			error = copyin(u_ ## name, bits + ni * x, ni);	\
			if (error)					\
				goto fail;				\
		} else							\
			memset(bits + ni * x, 0, ni);			\
	} while (0)

	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);

#define	copyback(name, x)						\
	do {								\
		if (error == 0 && u_ ## name != NULL)			\
			error = copyout(bits + ni * x,			\
			    u_ ## name, ni);				\
	} while (0)

	copyback(in, 3);
	copyback(ou, 4);
	copyback(ex, 5);
#undef copyback

fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
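
#if 0
/*
 * Illustrative sketch (not compiled): the ffs()-driven bit iteration
 * used by selscan() below.  Rather than probing all NFDBITS positions,
 * ffs() jumps straight to each set bit, and clearing the bit just
 * visited makes the loop run exactly once per set bit.  The names
 * visit_set_bits() and process_fd() are hypothetical.
 */
static void process_fd(int);

static void
visit_set_bits(fd_mask word, int base)
{
	int j;

	while ((j = ffs(word)) != 0) {
		j--;			/* ffs() is 1-based */
		word &= ~(1U << j);	/* clear the bit just visited */
		process_fd(base + j);
	}
}
#endif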

static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
	fd_mask *ibitp, *obitp;
	int msk, i, j, fd, n;
	file_t *fp;
	lwp_t *l;

	ibitp = (fd_mask *)(bits + ni * 0);
	obitp = (fd_mask *)(bits + ni * 3);
	n = 0;
	l = curlwp;

	memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			fd_mask ibits, obits;

			ibits = *ibitp;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1U << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				/*
				 * Setup an argument to selrecord(), which is
				 * a file descriptor number.
				 */
				l->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					if (!direct_select) {
						/*
						 * Have events: do nothing in
						 * selrecord().
						 */
						l->l_selflag = SEL_RESET;
					}
					obits |= (1U << j);
					n++;
				}
				fd_putfile(fd);
			}
			if (obits != 0) {
				if (direct_select) {
					kmutex_t *lock;

					lock = l->l_selcluster->sc_lock;
					mutex_spin_enter(lock);
					*obitp |= obits;
					mutex_spin_exit(lock);
				} else {
					*obitp |= obits;
				}
			}
			ibitp++;
			obitp++;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timespec	ats, *ts = NULL;
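
	/*
	 * INFTIM leaves ts == NULL, which makes sel_do_scan() block
	 * indefinitely.  Otherwise the millisecond timeout is converted,
	 * e.g. 1500ms becomes { .tv_sec = 1, .tv_nsec = 500000000 }.
	 */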
	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd	smallfds[32];
	struct pollfd	*fds;
	int		error;
	size_t		ni;

	if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
		/*
		 * Prevent userland from causing over-allocation.
		 * Raising the default limit too high can still cause
		 * a lot of memory to be allocated, but this also means
		 * that the file descriptor array will also be large.
		 *
		 * To reduce the memory requirements here, we could
		 * process the 'fds' array in chunks, but that
		 * is a lot of code that isn't normally useful.
		 * (Or just move the copyin/out into pollscan().)
		 *
		 * Historically the code silently truncated 'fds' to
		 * dt_nfiles entries - but that does cause issues.
		 *
		 * Using the max limit equivalent to sysctl
		 * kern.maxfiles is the moral equivalent of OPEN_MAX
		 * as specified by POSIX.
		 *
		 * We add a slop of 1000 in case the resource limit was
		 * changed after opening descriptors or the same descriptor
		 * was specified more than once.
		 */
		return EINVAL;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds))
		fds = kmem_alloc(ni, KM_SLEEP);
	else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto fail;

	error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
	file_t *fp;
	int i, n = 0, revents;

	for (i = 0; i < nfd; i++, fds++) {
		fds->revents = 0;
		if (fds->fd < 0) {
			revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			revents = POLLNVAL;
		} else {
			/*
			 * Perform poll: registers select request or returns
			 * the events which are set.  Setup an argument for
			 * selrecord(), which is a pointer to struct pollfd.
			 */
			curlwp->l_selrec = (uintptr_t)fds;
			revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			fd_putfile(fds->fd);
		}
		if (revents) {
			if (!direct_select) {
				/* Have events: do nothing in selrecord(). */
				curlwp->l_selflag = SEL_RESET;
			}
			fds->revents = revents;
			n++;
		}
	}
	*retval = n;
	return (0);
}

int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	other = sip->sel_lwp;

	if (selector->l_selflag == SEL_RESET) {
		/* 0. We're not going to block - will poll again if needed. */
	} else if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 */
		membar_acquire();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/* 3. Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}

/*
 * Record a knote.
 *
 * The caller holds the same lock as for selrecord().
 */
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
	klist_insert(&sip->sel_klist, kn);
}

/*
 * Remove a knote.
 *
 * The caller holds the same lock as for selrecord().
 *
 * Returns true if the last knote was removed and the list
 * is now empty.
 */
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
	return klist_remove(&sip->sel_klist, kn);
}

/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for an LWP sleeping in selcommon() or pollcommon().
 */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;
	int ret = 0;

	/*
	 * If we require a re-scan, or one was required by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return true;
	}
	/*
	 * Direct set.  Note: select state of LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
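	/*
	 * Bit arithmetic below, illustrated (assuming 32-bit fd_mask):
	 * for fd = 69, idx = 69 >> 5 = 2 selects the third fd_mask and
	 * fbit = 1 << (69 & 31) = 1 << 5 selects the bit within it.
	 */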
	if (l->l_selbits != NULL) {
		const size_t ni = l->l_selni;
		fd_mask *fds = (fd_mask *)l->l_selbits;
		fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
		const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
		const int idx = fd >> __NFDSHIFT;
		int n;

		for (n = 0; n < 3; n++) {
			if ((fds[idx] & fbit) != 0 &&
			    (ofds[idx] & fbit) == 0 &&
			    (sel_flag[n] & events)) {
				ofds[idx] |= fbit;
				ret++;
			}
			fds = (fd_mask *)((char *)fds + ni);
			ofds = (fd_mask *)((char *)ofds + ni);
		}
	} else {
		struct pollfd *pfd = (void *)sip->sel_fdinfo;
		int revents = events & (pfd->events | POLLERR | POLLHUP);

		if (revents) {
			if (pfd->revents == 0)
				ret = 1;
			pfd->revents |= revents;
		}
	}
	/* Check whether there are any events to return. */
	if (!ret) {
		return false;
	}
	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret += ret;
	return true;
}

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcluster's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in sel_do_scan().
 *
 * sip->sel_cluster cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcluster_t *sc;
	uint64_t mask;
	int index, oflag;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cluster;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			/*
			 * Set the events for our LWP and indicate that.
			 * Otherwise, request a full re-scan.
			 */
			l = sip->sel_lwp;
			oflag = l->l_selflag;

			if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
				/* No events to return. */
				mutex_spin_exit(lock);
				return;
			}

			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_remove(l->l_sleepq, l, true);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs64(mask) - 1;
			mask ^= __BIT(index);
			sc = selcluster[index];
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	/*
	 * If the request was non-blocking, or we found events on the first
	 * descriptor, there will be no need to clear anything - avoid
	 * taking the lock.
	 */
	if (SLIST_EMPTY(&l->l_selwait)) {
		return;
	}

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		/* Release the record for another named waiter to use. */
		atomic_store_release(&sip->sel_lwp, NULL);
	}
	mutex_spin_exit(lock);
}
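
/*
 * Note on the barriers above: the atomic_store_release() that clears
 * sel_lwp in selclear() pairs with the membar_acquire() in selrecord().
 * A selector that observes sel_lwp == NULL is thereby guaranteed not to
 * overwrite sel_chain before selclear()'s traversal has finished
 * reading it.
 */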

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcluster_t *sc;
	u_int index;

	/* If there is already a cluster in place for this bit, re-use it. */
	index = cpu_index(ci) & SELCLUSTERMASK;
	sc = selcluster[index];
	if (sc == NULL) {
		sc = kmem_alloc(roundup2(sizeof(selcluster_t),
		    coherency_unit) + coherency_unit, KM_SLEEP);
		sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
		sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		sleepq_init(&sc->sc_sleepq);
		sc->sc_ncoll = 0;
		sc->sc_mask = __BIT(index);
		selcluster[index] = sc;
	}
	ci->ci_data.cpu_selcluster = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
	klist_init(&sip->sel_klist);
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * will not call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcluster_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	klist_fini(&sip->sel_klist);

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcluster pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cluster != NULL);
	sc = sip->sel_cluster;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
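
/*
 * The writable node created below appears as kern.direct_select, so
 * direct delivery of events can be toggled at run time, e.g.
 * (illustrative):
 *
 *	sysctl -w kern.direct_select=1
 */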

/*
 * System control nodes.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "direct_select",
	    SYSCTL_DESCR("Enable/disable direct select (for testing)"),
	    NULL, 0, &direct_select, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}