xref: /netbsd-src/sys/kern/sys_select.c (revision ed992de57ddd4f2221bcec7e564f214926aedf84)
1*ed992de5Skhorben /*	$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $	*/
23acbed8eSad 
33acbed8eSad /*-
46ed72b5fSad  * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
56ed72b5fSad  *     The NetBSD Foundation, Inc.
63acbed8eSad  * All rights reserved.
73acbed8eSad  *
83acbed8eSad  * This code is derived from software contributed to The NetBSD Foundation
9ce9dfd6aSrmind  * by Andrew Doran and Mindaugas Rasiukevicius.
103acbed8eSad  *
113acbed8eSad  * Redistribution and use in source and binary forms, with or without
123acbed8eSad  * modification, are permitted provided that the following conditions
133acbed8eSad  * are met:
143acbed8eSad  * 1. Redistributions of source code must retain the above copyright
153acbed8eSad  *    notice, this list of conditions and the following disclaimer.
163acbed8eSad  * 2. Redistributions in binary form must reproduce the above copyright
173acbed8eSad  *    notice, this list of conditions and the following disclaimer in the
183acbed8eSad  *    documentation and/or other materials provided with the distribution.
193acbed8eSad  *
203acbed8eSad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
213acbed8eSad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
223acbed8eSad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
233acbed8eSad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
243acbed8eSad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
253acbed8eSad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
263acbed8eSad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
273acbed8eSad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
283acbed8eSad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
293acbed8eSad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
303acbed8eSad  * POSSIBILITY OF SUCH DAMAGE.
313acbed8eSad  */
323acbed8eSad 
333acbed8eSad /*
343acbed8eSad  * Copyright (c) 1982, 1986, 1989, 1993
353acbed8eSad  *	The Regents of the University of California.  All rights reserved.
363acbed8eSad  * (c) UNIX System Laboratories, Inc.
373acbed8eSad  * All or some portions of this file are derived from material licensed
383acbed8eSad  * to the University of California by American Telephone and Telegraph
393acbed8eSad  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
403acbed8eSad  * the permission of UNIX System Laboratories, Inc.
413acbed8eSad  *
423acbed8eSad  * Redistribution and use in source and binary forms, with or without
433acbed8eSad  * modification, are permitted provided that the following conditions
443acbed8eSad  * are met:
453acbed8eSad  * 1. Redistributions of source code must retain the above copyright
463acbed8eSad  *    notice, this list of conditions and the following disclaimer.
473acbed8eSad  * 2. Redistributions in binary form must reproduce the above copyright
483acbed8eSad  *    notice, this list of conditions and the following disclaimer in the
493acbed8eSad  *    documentation and/or other materials provided with the distribution.
503acbed8eSad  * 3. Neither the name of the University nor the names of its contributors
513acbed8eSad  *    may be used to endorse or promote products derived from this software
523acbed8eSad  *    without specific prior written permission.
533acbed8eSad  *
543acbed8eSad  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
553acbed8eSad  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
563acbed8eSad  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
573acbed8eSad  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
583acbed8eSad  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
593acbed8eSad  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
603acbed8eSad  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
613acbed8eSad  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
623acbed8eSad  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
633acbed8eSad  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
643acbed8eSad  * SUCH DAMAGE.
653acbed8eSad  *
663acbed8eSad  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
673acbed8eSad  */
683acbed8eSad 
693acbed8eSad /*
704fff1555Srmind  * System calls of synchronous I/O multiplexing subsystem.
714fff1555Srmind  *
724fff1555Srmind  * Locking
734fff1555Srmind  *
742e2855a6Sad  * Two locks are used: <object-lock> and selcluster_t::sc_lock.
754fff1555Srmind  *
764fff1555Srmind  * The <object-lock> might be a device driver or another subsystem, e.g.
774fff1555Srmind  * socket or pipe.  This lock is not exported, and thus invisible to this
784fff1555Srmind  * subsystem.  Mainly, synchronisation between selrecord() and selnotify()
794fff1555Srmind  * routines depends on this lock, as it will be described in the comments.
804fff1555Srmind  *
814fff1555Srmind  * Lock order
824fff1555Srmind  *
834fff1555Srmind  *	<object-lock> ->
842e2855a6Sad  *		selcluster_t::sc_lock
853acbed8eSad  */
863acbed8eSad 
873acbed8eSad #include <sys/cdefs.h>
88*ed992de5Skhorben __KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $");
893acbed8eSad 
903acbed8eSad #include <sys/param.h>
911da60e94Sriastradh 
921da60e94Sriastradh #include <sys/atomic.h>
931da60e94Sriastradh #include <sys/bitops.h>
941da60e94Sriastradh #include <sys/cpu.h>
953acbed8eSad #include <sys/file.h>
961da60e94Sriastradh #include <sys/filedesc.h>
973acbed8eSad #include <sys/kernel.h>
9811a35aedSrmind #include <sys/lwp.h>
993acbed8eSad #include <sys/mount.h>
1001da60e94Sriastradh #include <sys/poll.h>
1011da60e94Sriastradh #include <sys/proc.h>
1021da60e94Sriastradh #include <sys/signalvar.h>
1033acbed8eSad #include <sys/sleepq.h>
1041da60e94Sriastradh #include <sys/socketvar.h>
106fac91bbeSriastradh #include <sys/syncobj.h>
1071da60e94Sriastradh #include <sys/syscallargs.h>
1081da60e94Sriastradh #include <sys/sysctl.h>
1091da60e94Sriastradh #include <sys/systm.h>
1101da60e94Sriastradh #include <sys/uio.h>
1113acbed8eSad 
1123acbed8eSad /* Flags for lwp::l_selflag. */
1133acbed8eSad #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
1143acbed8eSad #define	SEL_SCANNING	1	/* polling descriptors */
115ce9dfd6aSrmind #define	SEL_BLOCKING	2	/* blocking and waiting for event */
116ce9dfd6aSrmind #define	SEL_EVENT	3	/* interrupted, events set directly */
117ce9dfd6aSrmind 
1182e2855a6Sad /*
1192e2855a6Sad  * Per-cluster state for select()/poll().  For a system with fewer
1203cec1e76Sad  * than 64 CPUs, this gives us per-CPU clusters.
1212e2855a6Sad  */
1223cec1e76Sad #define	SELCLUSTERS	64
1232e2855a6Sad #define	SELCLUSTERMASK	(SELCLUSTERS - 1)
1242e2855a6Sad 
1252e2855a6Sad typedef struct selcluster {
1267364cd36Sad 	kmutex_t	*sc_lock;	/* serializes cluster state; sleep queue interlock */
1273acbed8eSad 	sleepq_t	sc_sleepq;	/* LWPs blocked in select()/poll() on this cluster */
128b5034f03Sad 	uint64_t	sc_mask;	/* cluster identity bit -- consumer not in this view; TODO confirm */
1293acbed8eSad 	int		sc_ncoll;	/* collision counter; a change forces waiters to re-scan */
1302e2855a6Sad } selcluster_t;
1313acbed8eSad 
132ce9dfd6aSrmind static inline int	selscan(char *, const int, const size_t, register_t *);
133ce9dfd6aSrmind static inline int	pollscan(struct pollfd *, const int, register_t *);
1343acbed8eSad static void		selclear(void);
1353acbed8eSad 
/*
 * Poll events tested for each of select()'s three descriptor sets,
 * indexed like the in/ou/ex sets in selscan():
 * [0] readable, [1] writable, [2] exceptional condition.
 */
136ce9dfd6aSrmind static const int sel_flag[] = {
137ce9dfd6aSrmind 	POLLRDNORM | POLLHUP | POLLERR,
138ce9dfd6aSrmind 	POLLWRNORM | POLLHUP | POLLERR,
139ce9dfd6aSrmind 	POLLRDBAND
140ce9dfd6aSrmind };
141ce9dfd6aSrmind 
142ee5f078dSad /*
143ee5f078dSad  * LWPs are woken using the sleep queue only due to a collision, the case
144ee5f078dSad  * with the maximum Suck Factor.  Save the cost of sorting for named waiters
145ee5f078dSad  * by inserting in LIFO order.  In the future it would be preferable to not
146ee5f078dSad  * enqueue LWPs at all, unless subject to a collision.
147ee5f078dSad  */
148dadd0e50Sad syncobj_t select_sobj = {
149f4853583Sriastradh 	.sobj_name	= "select",
150ee5f078dSad 	.sobj_flag	= SOBJ_SLEEPQ_LIFO,	/* LIFO insertion per rationale above */
1516ed72b5fSad 	.sobj_boostpri  = PRI_KERNEL,
1528812081aSozaki-r 	.sobj_unsleep	= sleepq_unsleep,
1538812081aSozaki-r 	.sobj_changepri	= sleepq_changepri,
1548812081aSozaki-r 	.sobj_lendpri	= sleepq_lendpri,
1558812081aSozaki-r 	.sobj_owner	= syncobj_noowner,	/* no owning LWP to lend priority to */
1563acbed8eSad };
1573acbed8eSad 
158ce9dfd6aSrmind static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
159501f07ceSrmind static int		direct_select __read_mostly = 0;
1602e2855a6Sad 
161b5034f03Sad /* Operations: either select() or poll(). */
162b5034f03Sad const char		selop_select[] = "select";
163b5034f03Sad const char		selop_poll[] = "poll";
164b5034f03Sad 
1653acbed8eSad /*
1663acbed8eSad  * Select system call.
1673acbed8eSad  */
1683acbed8eSad int
169461a86f9Schristos sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
170461a86f9Schristos     register_t *retval)
1713acbed8eSad {
1723acbed8eSad 	/* {
1733acbed8eSad 		syscallarg(int)				nd;
1743acbed8eSad 		syscallarg(fd_set *)			in;
1753acbed8eSad 		syscallarg(fd_set *)			ou;
1763acbed8eSad 		syscallarg(fd_set *)			ex;
1773acbed8eSad 		syscallarg(const struct timespec *)	ts;
1783acbed8eSad 		syscallarg(sigset_t *)			mask;
1793acbed8eSad 	} */
1802b1b4bc6Schristos 	struct timespec	ats, *ts = NULL;
1813acbed8eSad 	sigset_t	amask, *mask = NULL;
1823acbed8eSad 	int		error;
1833acbed8eSad 
1843acbed8eSad 	if (SCARG(uap, ts)) {
1853acbed8eSad 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
1863acbed8eSad 		if (error)
1873acbed8eSad 			return error;
1882b1b4bc6Schristos 		ts = &ats;
1893acbed8eSad 	}
1903acbed8eSad 	if (SCARG(uap, mask) != NULL) {
1913acbed8eSad 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1923acbed8eSad 		if (error)
1933acbed8eSad 			return error;
1943acbed8eSad 		mask = &amask;
1953acbed8eSad 	}
1963acbed8eSad 
19712839500Srmind 	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
1982b1b4bc6Schristos 	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
1993acbed8eSad }
2003acbed8eSad 
2013acbed8eSad int
202461a86f9Schristos sys___select50(struct lwp *l, const struct sys___select50_args *uap,
203461a86f9Schristos     register_t *retval)
2043acbed8eSad {
2053acbed8eSad 	/* {
2063acbed8eSad 		syscallarg(int)			nd;
2073acbed8eSad 		syscallarg(fd_set *)		in;
2083acbed8eSad 		syscallarg(fd_set *)		ou;
2093acbed8eSad 		syscallarg(fd_set *)		ex;
2103acbed8eSad 		syscallarg(struct timeval *)	tv;
2113acbed8eSad 	} */
2122b1b4bc6Schristos 	struct timeval atv;
2132b1b4bc6Schristos 	struct timespec ats, *ts = NULL;
2143acbed8eSad 	int error;
2153acbed8eSad 
2163acbed8eSad 	if (SCARG(uap, tv)) {
2172b1b4bc6Schristos 		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
2183acbed8eSad 		if (error)
2193acbed8eSad 			return error;
2200af36754Skamil 
2210af36754Skamil 		if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
2220af36754Skamil 			return EINVAL;
2230af36754Skamil 
2242b1b4bc6Schristos 		TIMEVAL_TO_TIMESPEC(&atv, &ats);
2252b1b4bc6Schristos 		ts = &ats;
2263acbed8eSad 	}
2273acbed8eSad 
22812839500Srmind 	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
2292b1b4bc6Schristos 	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
2303acbed8eSad }
2313acbed8eSad 
/*
 * sel_do_scan: common code to perform the scan on descriptors, shared
 * by select() and poll().  "opname" (selop_select or selop_poll)
 * selects the caller's flavour and doubles as the wchan message while
 * sleeping.  Loop: scan the descriptors, then either return (events
 * found, error, or timeout expired), re-scan (collision or SEL_RESET),
 * or block on the per-CPU cluster's sleep queue until awoken.
 * ERESTART is mapped to EINTR and EWOULDBLOCK to success, since
 * select/poll are not restarted after signals.
 */
2351ceff942Srmind static int
236b5034f03Sad sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
237ce9dfd6aSrmind     struct timespec *ts, sigset_t *mask, register_t *retval)
2381ceff942Srmind {
2391ceff942Srmind 	lwp_t		* const l = curlwp;
2402e2855a6Sad 	selcluster_t	*sc;
2411ceff942Srmind 	kmutex_t	*lock;
2421ceff942Srmind 	struct timespec	sleepts;
2431ceff942Srmind 	int		error, timo;
2441ceff942Srmind 
2451ceff942Srmind 	timo = 0;
2461ceff942Srmind 	if (ts && inittimeleft(ts, &sleepts) == -1) {
2471ceff942Srmind 		return EINVAL;
2481ceff942Srmind 	}
2491ceff942Srmind 
	/* pselect()/pollts(): temporarily install the caller's signal mask. */
250fc6147c6Schristos 	if (__predict_false(mask))
251e2543d03Schristos 		sigsuspendsetup(l, mask);
2521ceff942Srmind 
253b5034f03Sad 	/*
254b5034f03Sad 	 * We may context switch during or at any time after picking a CPU
255b5034f03Sad 	 * and cluster to associate with, but it doesn't matter.  In the
256b5034f03Sad 	 * unlikely event we migrate elsewhere all we risk is a little lock
257b5034f03Sad 	 * contention; correctness is not sacrificed.
258b5034f03Sad 	 */
2592e2855a6Sad 	sc = curcpu()->ci_data.cpu_selcluster;
2601ceff942Srmind 	lock = sc->sc_lock;
2612e2855a6Sad 	l->l_selcluster = sc;
262b5034f03Sad 
	/*
	 * NOTE(review): l_selbits/l_selni appear to be consumed outside this
	 * file for direct (SEL_EVENT) delivery into select()'s bitmap;
	 * poll() passes its events via l_selrec instead -- confirm against
	 * selnotify().
	 */
263b5034f03Sad 	if (opname == selop_select) {
264ae3b98c1Srmind 		l->l_selbits = fds;
265ce9dfd6aSrmind 		l->l_selni = ni;
266ce9dfd6aSrmind 	} else {
267ce9dfd6aSrmind 		l->l_selbits = NULL;
268ce9dfd6aSrmind 	}
26974039084Shannken 
2701ceff942Srmind 	for (;;) {
2711ceff942Srmind 		int ncoll;
2721ceff942Srmind 
27374039084Shannken 		SLIST_INIT(&l->l_selwait);
27474039084Shannken 		l->l_selret = 0;
27574039084Shannken 
2761ceff942Srmind 		/*
2771ceff942Srmind 		 * No need to lock.  If this is overwritten by another value
2781ceff942Srmind 		 * while scanning, we will retry below.  We only need to see
2791ceff942Srmind 		 * exact state from the descriptors that we are about to poll,
2801ceff942Srmind 		 * and lock activity resulting from fo_poll is enough to
2811ceff942Srmind 		 * provide an up to date value for new polling activity.
2821ceff942Srmind 		 */
283b5034f03Sad 		if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
284b5034f03Sad 			/* Non-blocking: no need for selrecord()/selclear() */
285b5034f03Sad 			l->l_selflag = SEL_RESET;
286b5034f03Sad 		} else {
2871ceff942Srmind 			l->l_selflag = SEL_SCANNING;
288b5034f03Sad 		}
2891ceff942Srmind 		ncoll = sc->sc_ncoll;
		/*
		 * Order the stores above before the scan; presumably pairs
		 * with the membar_acquire() in selrecord() -- see the
		 * comments there.
		 */
290008402f2Sriastradh 		membar_release();
2911ceff942Srmind 
292b5034f03Sad 		if (opname == selop_select) {
293ce9dfd6aSrmind 			error = selscan((char *)fds, nf, ni, retval);
2941ceff942Srmind 		} else {
295ce9dfd6aSrmind 			error = pollscan((struct pollfd *)fds, nf, retval);
2961ceff942Srmind 		}
2971ceff942Srmind 		if (error || *retval)
2981ceff942Srmind 			break;
2991ceff942Srmind 		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
3001ceff942Srmind 			break;
301ce9dfd6aSrmind 		/*
302ce9dfd6aSrmind 		 * Acquire the lock and perform the (re)checks.  Note, if
303cbf5c65aSandvar 		 * collision has occurred, then our state does not matter,
304ce9dfd6aSrmind 		 * as we must perform re-scan.  Therefore, check it first.
305ce9dfd6aSrmind 		 */
306ce9dfd6aSrmind state_check:
3071ceff942Srmind 		mutex_spin_enter(lock);
308ce9dfd6aSrmind 		if (__predict_false(sc->sc_ncoll != ncoll)) {
309ce9dfd6aSrmind 			/* Collision: perform re-scan. */
3101ceff942Srmind 			mutex_spin_exit(lock);
31174039084Shannken 			selclear();
3121ceff942Srmind 			continue;
3131ceff942Srmind 		}
314ce9dfd6aSrmind 		if (__predict_true(l->l_selflag == SEL_EVENT)) {
315cbf5c65aSandvar 			/* Events occurred, they are set directly. */
316ce9dfd6aSrmind 			mutex_spin_exit(lock);
317ce9dfd6aSrmind 			break;
318ce9dfd6aSrmind 		}
319ce9dfd6aSrmind 		if (__predict_true(l->l_selflag == SEL_RESET)) {
320cbf5c65aSandvar 			/* Events occurred, but re-scan is requested. */
321ce9dfd6aSrmind 			mutex_spin_exit(lock);
32274039084Shannken 			selclear();
323ce9dfd6aSrmind 			continue;
324ce9dfd6aSrmind 		}
325ce9dfd6aSrmind 		/* Nothing happen, therefore - sleep. */
3261ceff942Srmind 		l->l_selflag = SEL_BLOCKING;
3270a6ca13bSad 		KASSERT(l->l_blcnt == 0);
3280a6ca13bSad 		(void)sleepq_enter(&sc->sc_sleepq, l, lock);
32946a9878aSad 		sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
3300a6ca13bSad 		error = sleepq_block(timo, true, &select_sobj, 0);
331ce9dfd6aSrmind 		if (error != 0) {
3321ceff942Srmind 			break;
3331ceff942Srmind 		}
334ce9dfd6aSrmind 		/* Awoken: need to check the state. */
335ce9dfd6aSrmind 		goto state_check;
336ce9dfd6aSrmind 	}
3371ceff942Srmind 	selclear();
3381ceff942Srmind 
33974039084Shannken 	/* Add direct events if any. */
34074039084Shannken 	if (l->l_selflag == SEL_EVENT) {
34174039084Shannken 		KASSERT(l->l_selret != 0);
34274039084Shannken 		*retval += l->l_selret;
34374039084Shannken 	}
34474039084Shannken 
3450b60c7beSchristos 	if (__predict_false(mask))
3460b60c7beSchristos 		sigsuspendteardown(l);
3470b60c7beSchristos 
348ef379fcbSdsl 	/* select and poll are not restarted after signals... */
349ef379fcbSdsl 	if (error == ERESTART)
350ef379fcbSdsl 		return EINTR;
351ef379fcbSdsl 	if (error == EWOULDBLOCK)
352ef379fcbSdsl 		return 0;
3531ceff942Srmind 	return error;
3541ceff942Srmind }
3551ceff942Srmind 
356501e579fSkre /* designed to be compatible with FD_SET() FD_ISSET() ... */
357501e579fSkre static int
358501e579fSkre anyset(void *p, size_t nbits)
359501e579fSkre {
360501e579fSkre 	size_t nwords;
361501e579fSkre 	__fd_mask mask;
362501e579fSkre 	__fd_mask *f = (__fd_mask *)p;
363501e579fSkre 
364501e579fSkre 	nwords = nbits / __NFDBITS;
365501e579fSkre 
366501e579fSkre 	while (nwords-- > 0)
367501e579fSkre 		if (*f++ != 0)
368501e579fSkre 			return 1;
369501e579fSkre 
370501e579fSkre 	nbits &= __NFDMASK;
371501e579fSkre 	if (nbits != 0) {
372501e579fSkre 		mask = (1U << nbits) - 1;
373501e579fSkre 		if ((*f & mask) != 0)
374501e579fSkre 			return 1;
375501e579fSkre 	}
376501e579fSkre 	return 0;
377501e579fSkre }
378501e579fSkre 
/*
 * selcommon: guts of select().  Validates the descriptor count,
 * copies the caller's in/ou/ex sets into a scratch buffer laid out
 * as three input sets followed by three output sets, runs
 * sel_do_scan(), and copies the output sets back to userspace.
 */
3793acbed8eSad int
38012839500Srmind selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
38112839500Srmind     fd_set *u_ex, struct timespec *ts, sigset_t *mask)
3823acbed8eSad {
	/* Scratch buffer: 3 input + 3 output fd_sets. */
3833acbed8eSad 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
3843acbed8eSad 			    sizeof(fd_mask) * 6];
3853acbed8eSad 	char 		*bits;
386501e579fSkre 	int		error, nf, fb, db;
3873acbed8eSad 	size_t		ni;
3883acbed8eSad 
3893acbed8eSad 	if (nd < 0)
390501e579fSkre 		return EINVAL;
391501e579fSkre 
	/* Current size of the process's descriptor table. */
3928e6cd4ceSriastradh 	nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
393501e579fSkre 
394501e579fSkre 	/*
395501e579fSkre 	 * Don't allow absurdly large numbers of fds to be selected.
396501e579fSkre 	 * (used to silently truncate, naughty naughty, no more ...)
397501e579fSkre 	 *
398*ed992de5Skhorben 	 * The additional FD_SETSIZE allows for cases where the limit
399501e579fSkre 	 * is not a round binary number, but the fd_set wants to
400501e579fSkre 	 * include all the possible fds, as fd_sets are always
401501e579fSkre 	 * multiples of 32 bits (__NFDBITS extra would be enough).
402501e579fSkre 	 *
403501e579fSkre 	 * The first test handles the case where the res limit has been
404501e579fSkre 	 * set lower after some fds were opened, we always allow selecting
405501e579fSkre 	 * up to the highest currently open fd.
406501e579fSkre 	 */
407501e579fSkre 	if (nd > nf + FD_SETSIZE &&
408501e579fSkre 	    nd > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + FD_SETSIZE)
409501e579fSkre 		return EINVAL;
410501e579fSkre 
411501e579fSkre 	fb = howmany(nf, __NFDBITS);		/* how many fd_masks */
412501e579fSkre 	db = howmany(nd, __NFDBITS);
413501e579fSkre 
414501e579fSkre 	if (db > fb) {
415501e579fSkre 		size_t off;
416501e579fSkre 
417501e579fSkre 		/*
418501e579fSkre 		 * the application wants to supply more fd masks than can
419501e579fSkre 		 * possibly represent valid file descriptors.
420501e579fSkre 		 *
421501e579fSkre 		 * Check the excess fd_masks, if any bits are set in them
422501e579fSkre 		 * that must be an error (cannot represent valid fd).
423501e579fSkre 		 *
424501e579fSkre 		 * Supplying lots of extra cleared fd_masks is dumb,
425501e579fSkre 		 * but harmless, so allow that.
426501e579fSkre 		 */
427501e579fSkre 		ni = (db - fb) * sizeof(fd_mask);	/* excess bytes */
428501e579fSkre 		bits = smallbits;
429501e579fSkre 
430501e579fSkre 		/* skip over the valid fd_masks, those will be checked below */
431501e579fSkre 		off = howmany(nf, __NFDBITS) * sizeof(__fd_mask);
432501e579fSkre 
433501e579fSkre 		nd -= fb * NFDBITS;	/* the number of excess fds */
434501e579fSkre 
	/*
	 * checkbits: copy "sz" bytes of the excess region of set "name"
	 * at offset "o" and fail with EBADF if any of its first "fds"
	 * bits (or all bits when fds == 0) are set.
	 */
435501e579fSkre #define checkbits(name, o, sz, fds)					\
436501e579fSkre 		do {							\
437501e579fSkre 		    if (u_ ## name != NULL) {				\
438501e579fSkre 			error = copyin((char *)u_ ## name + o,		\
439501e579fSkre 					bits, sz);			\
440501e579fSkre 			if (error)					\
441501e579fSkre 			    goto fail;					\
442501e579fSkre 			if (anyset(bits, (fds) ?			\
443501e579fSkre 				 (size_t)(fds) : CHAR_BIT * (sz))) {	\
444501e579fSkre 			    error = EBADF;				\
445501e579fSkre 			    goto fail;					\
446501e579fSkre 			}						\
447501e579fSkre 		    }							\
448501e579fSkre 		} while (0)
449501e579fSkre 
450501e579fSkre 		while (ni > sizeof(smallbits)) {
451501e579fSkre 			checkbits(in, off, sizeof(smallbits), 0);
452501e579fSkre 			checkbits(ou, off, sizeof(smallbits), 0);
453501e579fSkre 			checkbits(ex, off, sizeof(smallbits), 0);
454501e579fSkre 
455501e579fSkre 			off += sizeof(smallbits);
456501e579fSkre 			ni -= sizeof(smallbits);
457501e579fSkre 			nd -= sizeof(smallbits) * CHAR_BIT;
4583acbed8eSad 		}
459501e579fSkre 		checkbits(in, off, ni, nd);
460501e579fSkre 		checkbits(ou, off, ni, nd);
461501e579fSkre 		checkbits(ex, off, ni, nd);
462501e579fSkre #undef checkbits
463501e579fSkre 
464501e579fSkre 		db = fb;	/* now just check the plausible fds */
465501e579fSkre 		nd = db * __NFDBITS;
466501e579fSkre 	}
467501e579fSkre 
468501e579fSkre 	ni = db * sizeof(fd_mask);
469fd34ea77Schs 	if (ni * 6 > sizeof(smallbits))
4703acbed8eSad 		bits = kmem_alloc(ni * 6, KM_SLEEP);
471fd34ea77Schs 	else
4723acbed8eSad 		bits = smallbits;
4733acbed8eSad 
	/* Copy in the three input sets (slots 0-2); absent sets are zeroed. */
4743acbed8eSad #define	getbits(name, x)						\
475501e579fSkre 	do {								\
4763acbed8eSad 		if (u_ ## name) {					\
4773acbed8eSad 			error = copyin(u_ ## name, bits + ni * x, ni);	\
4783acbed8eSad 			if (error)					\
479ef379fcbSdsl 				goto fail;				\
4803acbed8eSad 		} else							\
481501e579fSkre 			memset(bits + ni * x, 0, ni);			\
482501e579fSkre 	} while (0)
483501e579fSkre 
4843acbed8eSad 	getbits(in, 0);
4853acbed8eSad 	getbits(ou, 1);
4863acbed8eSad 	getbits(ex, 2);
4873acbed8eSad #undef	getbits
4883acbed8eSad 
489b5034f03Sad 	error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
490501e579fSkre 
	/* On success, copy the three output sets (slots 3-5) back out. */
491501e579fSkre #define copyback(name, x)						\
492501e579fSkre 		do {							\
493501e579fSkre 			if (error == 0 && u_ ## name != NULL)		\
494501e579fSkre 				error = copyout(bits + ni * x,		\
495501e579fSkre 						u_ ## name, ni);	\
496501e579fSkre 		} while (0)
497501e579fSkre 
498501e579fSkre 	copyback(in, 3);
499501e579fSkre 	copyback(ou, 4);
500501e579fSkre 	copyback(ex, 5);
501501e579fSkre #undef copyback
502501e579fSkre 
503ef379fcbSdsl  fail:
5043acbed8eSad 	if (bits != smallbits)
5053acbed8eSad 		kmem_free(bits, ni * 6);
5063acbed8eSad 	return (error);
5073acbed8eSad }
5083acbed8eSad 
/*
 * selscan: poll each descriptor whose bit is set in the three input
 * sets (at offsets 0..2*ni of "bits"), recording ready descriptors in
 * the corresponding output sets (at offsets 3*ni..5*ni).  The number
 * of ready descriptors is returned via "retval".
 */
50912839500Srmind static inline int
510ce9dfd6aSrmind selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
5113acbed8eSad {
5121ceff942Srmind 	fd_mask *ibitp, *obitp;
513ce9dfd6aSrmind 	int msk, i, j, fd, n;
5143acbed8eSad 	file_t *fp;
515b5034f03Sad 	lwp_t *l;
5163acbed8eSad 
5171ceff942Srmind 	ibitp = (fd_mask *)(bits + ni * 0);
5181ceff942Srmind 	obitp = (fd_mask *)(bits + ni * 3);
5193acbed8eSad 	n = 0;
520b5034f03Sad 	l = curlwp;
5213acbed8eSad 
52274039084Shannken 	memset(obitp, 0, ni * 3);
5233acbed8eSad 	for (msk = 0; msk < 3; msk++) {
5243acbed8eSad 		for (i = 0; i < nfd; i += NFDBITS) {
525ce9dfd6aSrmind 			fd_mask ibits, obits;
526ce9dfd6aSrmind 
527d5d9d492Shannken 			ibits = *ibitp;
5283acbed8eSad 			obits = 0;
5293acbed8eSad 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
530638734caSmsaitoh 				ibits &= ~(1U << j);
5313acbed8eSad 				if ((fp = fd_getfile(fd)) == NULL)
5323acbed8eSad 					return (EBADF);
533ce9dfd6aSrmind 				/*
534ce9dfd6aSrmind 				 * Setup an argument to selrecord(), which is
535ce9dfd6aSrmind 				 * a file descriptor number.
536ce9dfd6aSrmind 				 */
537b5034f03Sad 				l->l_selrec = fd;
538ce9dfd6aSrmind 				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
539b5034f03Sad 					if (!direct_select) {
540b5034f03Sad 						/*
541b5034f03Sad 						 * Have events: do nothing in
542b5034f03Sad 						 * selrecord().
543b5034f03Sad 						 */
544b5034f03Sad 						l->l_selflag = SEL_RESET;
545b5034f03Sad 					}
546638734caSmsaitoh 					obits |= (1U << j);
5473acbed8eSad 					n++;
5483acbed8eSad 				}
5493acbed8eSad 				fd_putfile(fd);
5503acbed8eSad 			}
55174039084Shannken 			if (obits != 0) {
552501f07ceSrmind 				if (direct_select) {
553501f07ceSrmind 					kmutex_t *lock;
554b5034f03Sad 					lock = l->l_selcluster->sc_lock;
					/*
					 * direct_select: serialize updates
					 * to the output set with the
					 * cluster lock.
					 */
55574039084Shannken 					mutex_spin_enter(lock);
55674039084Shannken 					*obitp |= obits;
55774039084Shannken 					mutex_spin_exit(lock);
558501f07ceSrmind 				} else {
559d5d9d492Shannken 					*obitp |= obits;
560501f07ceSrmind 				}
56174039084Shannken 			}
562d5d9d492Shannken 			ibitp++;
56374039084Shannken 			obitp++;
5643acbed8eSad 		}
5653acbed8eSad 	}
5663acbed8eSad 	*retval = n;
5673acbed8eSad 	return (0);
5683acbed8eSad }
5693acbed8eSad 
5703acbed8eSad /*
5713acbed8eSad  * Poll system call.
5723acbed8eSad  */
5733acbed8eSad int
5743acbed8eSad sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
5753acbed8eSad {
5763acbed8eSad 	/* {
5773acbed8eSad 		syscallarg(struct pollfd *)	fds;
5783acbed8eSad 		syscallarg(u_int)		nfds;
5793acbed8eSad 		syscallarg(int)			timeout;
5803acbed8eSad 	} */
5812b1b4bc6Schristos 	struct timespec	ats, *ts = NULL;
5823acbed8eSad 
5833acbed8eSad 	if (SCARG(uap, timeout) != INFTIM) {
5842b1b4bc6Schristos 		ats.tv_sec = SCARG(uap, timeout) / 1000;
5852b1b4bc6Schristos 		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
5862b1b4bc6Schristos 		ts = &ats;
5873acbed8eSad 	}
5883acbed8eSad 
58912839500Srmind 	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
5903acbed8eSad }
5913acbed8eSad 
5923acbed8eSad /*
5933acbed8eSad  * Poll system call.
5943acbed8eSad  */
5953acbed8eSad int
596461a86f9Schristos sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
597461a86f9Schristos     register_t *retval)
5983acbed8eSad {
5993acbed8eSad 	/* {
6003acbed8eSad 		syscallarg(struct pollfd *)		fds;
6013acbed8eSad 		syscallarg(u_int)			nfds;
6023acbed8eSad 		syscallarg(const struct timespec *)	ts;
6033acbed8eSad 		syscallarg(const sigset_t *)		mask;
6043acbed8eSad 	} */
6052b1b4bc6Schristos 	struct timespec	ats, *ts = NULL;
6063acbed8eSad 	sigset_t	amask, *mask = NULL;
6073acbed8eSad 	int		error;
6083acbed8eSad 
6093acbed8eSad 	if (SCARG(uap, ts)) {
6103acbed8eSad 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
6113acbed8eSad 		if (error)
6123acbed8eSad 			return error;
6132b1b4bc6Schristos 		ts = &ats;
6143acbed8eSad 	}
6153acbed8eSad 	if (SCARG(uap, mask)) {
6163acbed8eSad 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
6173acbed8eSad 		if (error)
6183acbed8eSad 			return error;
6193acbed8eSad 		mask = &amask;
6203acbed8eSad 	}
6213acbed8eSad 
62212839500Srmind 	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
6233acbed8eSad }
6243acbed8eSad 
6253acbed8eSad int
62612839500Srmind pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
6272b1b4bc6Schristos     struct timespec *ts, sigset_t *mask)
6283acbed8eSad {
629bd2755f5Syamt 	struct pollfd	smallfds[32];
630bd2755f5Syamt 	struct pollfd	*fds;
6311ceff942Srmind 	int		error;
632ef379fcbSdsl 	size_t		ni;
6333acbed8eSad 
634a4ff8f65Schristos 	if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
635ef379fcbSdsl 		/*
636600ed393Schristos 		 * Prevent userland from causing over-allocation.
637600ed393Schristos 		 * Raising the default limit too high can still cause
638600ed393Schristos 		 * a lot of memory to be allocated, but this also means
639600ed393Schristos 		 * that the file descriptor array will also be large.
640600ed393Schristos 		 *
641600ed393Schristos 		 * To reduce the memory requirements here, we could
642600ed393Schristos 		 * process the 'fds' array in chunks, but that
643ef379fcbSdsl 		 * is a lot of code that isn't normally useful.
644ef379fcbSdsl 		 * (Or just move the copyin/out into pollscan().)
645600ed393Schristos 		 *
646ef379fcbSdsl 		 * Historically the code silently truncated 'fds' to
647ef379fcbSdsl 		 * dt_nfiles entries - but that does cause issues.
648ed042c77Schristos 		 *
649ed042c77Schristos 		 * Using the max limit equivalent to sysctl
650ed042c77Schristos 		 * kern.maxfiles is the moral equivalent of OPEN_MAX
651a4ff8f65Schristos 		 * as specified by POSIX.
652a4ff8f65Schristos 		 *
653a4ff8f65Schristos 		 * We add a slop of 1000 in case the resource limit was
654a4ff8f65Schristos 		 * changed after opening descriptors or the same descriptor
655a4ff8f65Schristos 		 * was specified more than once.
656ef379fcbSdsl 		 */
657ef379fcbSdsl 		return EINVAL;
6583acbed8eSad 	}
6593acbed8eSad 	ni = nfds * sizeof(struct pollfd);
660fd34ea77Schs 	if (ni > sizeof(smallfds))
661bd2755f5Syamt 		fds = kmem_alloc(ni, KM_SLEEP);
662fd34ea77Schs 	else
663bd2755f5Syamt 		fds = smallfds;
6643acbed8eSad 
665bd2755f5Syamt 	error = copyin(u_fds, fds, ni);
6663acbed8eSad 	if (error)
667ef379fcbSdsl 		goto fail;
6683acbed8eSad 
669b5034f03Sad 	error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
6703acbed8eSad 	if (error == 0)
671bd2755f5Syamt 		error = copyout(fds, u_fds, ni);
672ef379fcbSdsl  fail:
673bd2755f5Syamt 	if (fds != smallfds)
674bd2755f5Syamt 		kmem_free(fds, ni);
6753acbed8eSad 	return (error);
6763acbed8eSad }
6773acbed8eSad 
/*
 * pollscan: check each pollfd for pending events, storing the result
 * in its revents field.  A negative fd reports no events; an fd that
 * cannot be resolved reports POLLNVAL.  The number of descriptors
 * with events is returned via "retval".
 */
67812839500Srmind static inline int
679ce9dfd6aSrmind pollscan(struct pollfd *fds, const int nfd, register_t *retval)
6803acbed8eSad {
6813acbed8eSad 	file_t *fp;
68274039084Shannken 	int i, n = 0, revents;
6833acbed8eSad 
6843acbed8eSad 	for (i = 0; i < nfd; i++, fds++) {
6853acbed8eSad 		fds->revents = 0;
68674039084Shannken 		if (fds->fd < 0) {
68774039084Shannken 			revents = 0;
6883acbed8eSad 		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
68974039084Shannken 			revents = POLLNVAL;
6903acbed8eSad 		} else {
691ce9dfd6aSrmind 			/*
692ce9dfd6aSrmind 			 * Perform poll: registers select request or returns
693ce9dfd6aSrmind 			 * the events which are set.  Setup an argument for
694ce9dfd6aSrmind 			 * selrecord(), which is a pointer to struct pollfd.
695ce9dfd6aSrmind 			 */
696ce9dfd6aSrmind 			curlwp->l_selrec = (uintptr_t)fds;
			/* POLLERR/POLLHUP are always reported if pending. */
69774039084Shannken 			revents = (*fp->f_ops->fo_poll)(fp,
6983acbed8eSad 			    fds->events | POLLERR | POLLHUP);
6993acbed8eSad 			fd_putfile(fds->fd);
7003acbed8eSad 		}
70174039084Shannken 		if (revents) {
702b5034f03Sad 			if (!direct_select)  {
703b5034f03Sad 				/* Have events: do nothing in selrecord(). */
704b5034f03Sad 				curlwp->l_selflag = SEL_RESET;
705b5034f03Sad 			}
70674039084Shannken 			fds->revents = revents;
70774039084Shannken 			n++;
70874039084Shannken 		}
7093acbed8eSad 	}
7103acbed8eSad 	*retval = n;
7113acbed8eSad 	return (0);
7123acbed8eSad }
7133acbed8eSad 
7143acbed8eSad int
7153acbed8eSad seltrue(dev_t dev, int events, lwp_t *l)
7163acbed8eSad {
7173acbed8eSad 
7183acbed8eSad 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
7193acbed8eSad }
7203acbed8eSad 
7213acbed8eSad /*
7223acbed8eSad  * Record a select request.  Concurrency issues:
7233acbed8eSad  *
7243acbed8eSad  * The caller holds the same lock across calls to selrecord() and
72570f8f58cSyamt  * selnotify(), so we don't need to consider a concurrent wakeup
7263acbed8eSad  * while in this routine.
7273acbed8eSad  *
7283acbed8eSad  * The only activity we need to guard against is selclear(), called by
7291ceff942Srmind  * another thread that is exiting sel_do_scan().
7303acbed8eSad  * `sel_lwp' can only become non-NULL while the caller's lock is held,
7313acbed8eSad  * so it cannot become non-NULL due to a change made by another thread
7323acbed8eSad  * while we are in this routine.  It can only become _NULL_ due to a
7333acbed8eSad  * call to selclear().
7343acbed8eSad  *
7353acbed8eSad  * If it is non-NULL and != selector there is the potential for
7363acbed8eSad  * selclear() to be called by another thread.  If either of those
7373acbed8eSad  * conditions are true, we're not interested in touching the `named
7383acbed8eSad  * waiter' part of the selinfo record because we need to record a
7393acbed8eSad  * collision.  Hence there is no need for additional locking in this
7403acbed8eSad  * routine.
7413acbed8eSad  */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	/* We only ever record on behalf of the current LWP. */
	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	other = sip->sel_lwp;

	if (selector->l_selflag == SEL_RESET) {
		/* 0. We're not going to block - will poll again if needed. */
	} else if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 * (Pairs with the atomic_store_release() of sel_lwp in
		 * selclear().)
		 */
		membar_acquire();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/*
		 * 3. Multiple waiters: record a collision, which selnotify()
		 * resolves by waking every flagged cluster.
		 */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}
7793acbed8eSad 
7803acbed8eSad /*
781b56263bdSthorpej  * Record a knote.
782b56263bdSthorpej  *
783b56263bdSthorpej  * The caller holds the same lock as for selrecord().
784b56263bdSthorpej  */
785b56263bdSthorpej void
786b56263bdSthorpej selrecord_knote(struct selinfo *sip, struct knote *kn)
787b56263bdSthorpej {
788576702f1Sthorpej 	klist_insert(&sip->sel_klist, kn);
789b56263bdSthorpej }
790b56263bdSthorpej 
791b56263bdSthorpej /*
792b56263bdSthorpej  * Remove a knote.
793b56263bdSthorpej  *
794b56263bdSthorpej  * The caller holds the same lock as for selrecord().
795966aaf88Sthorpej  *
796966aaf88Sthorpej  * Returns true if the last knote was removed and the list
797966aaf88Sthorpej  * is now empty.
798b56263bdSthorpej  */
799966aaf88Sthorpej bool
800b56263bdSthorpej selremove_knote(struct selinfo *sip, struct knote *kn)
801b56263bdSthorpej {
802576702f1Sthorpej 	return klist_remove(&sip->sel_klist, kn);
803b56263bdSthorpej }
804b56263bdSthorpej 
805b56263bdSthorpej /*
806ce9dfd6aSrmind  * sel_setevents: a helper function for selnotify(), to set the events
807ce9dfd6aSrmind  * for LWP sleeping in selcommon() or pollcommon().
808ce9dfd6aSrmind  */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;
	int ret = 0;	/* number of newly-reported descriptors/sets */

	/*
	 * If we require re-scan or it was required by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return true;
	}
	/*
	 * Direct set.  Note: select state of LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
	if (l->l_selbits != NULL) {
		/*
		 * selcommon(): l_selbits holds six bitmaps of l_selni bytes
		 * each - three input sets, then three output sets starting
		 * at offset ni * 3.
		 */
		const size_t ni = l->l_selni;
		fd_mask *fds = (fd_mask *)l->l_selbits;
		fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
		/* Word index and bit for the fd saved by selrecord(). */
		const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
		const int idx = fd >> __NFDSHIFT;
		int n;

		for (n = 0; n < 3; n++) {
			/*
			 * Report the fd in set n only if it was requested
			 * there, is not already reported, and the notified
			 * events match this set (sel_flag[] presumably maps
			 * set index to poll event bits - defined elsewhere).
			 */
			if ((fds[idx] & fbit) != 0 &&
			    (ofds[idx] & fbit) == 0 &&
			    (sel_flag[n] & events)) {
				ofds[idx] |= fbit;
				ret++;
			}
			/* Step to the next input/output set. */
			fds = (fd_mask *)((char *)fds + ni);
			ofds = (fd_mask *)((char *)ofds + ni);
		}
	} else {
		/* pollcommon(): sel_fdinfo points at the struct pollfd. */
		struct pollfd *pfd = (void *)sip->sel_fdinfo;
		int revents = events & (pfd->events | POLLERR | POLLHUP);

		if (revents) {
			/* Count the descriptor once, even on repeat events. */
			if (pfd->revents == 0)
				ret = 1;
			pfd->revents |= revents;
		}
	}
	/* Check whether there are any events to return. */
	if (!ret) {
		return false;
	}
	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret += ret;
	return true;
}
864ce9dfd6aSrmind 
865ce9dfd6aSrmind /*
8663acbed8eSad  * Do a wakeup when a selectable event occurs.  Concurrency issues:
8673acbed8eSad  *
8683acbed8eSad  * As per selrecord(), the caller's object lock is held.  If there
8692e2855a6Sad  * is a named waiter, we must acquire the associated selcluster's lock
8703acbed8eSad  * in order to synchronize with selclear() and pollers going to sleep
8711ceff942Srmind  * in sel_do_scan().
8723acbed8eSad  *
 * sip->sel_cluster cannot change at this point, as it is only changed
8743acbed8eSad  * in selrecord(), and concurrent calls to selrecord() are locked
8753acbed8eSad  * out by the caller.
8763acbed8eSad  */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcluster_t *sc;
	uint64_t mask;
	int index, oflag;
	lwp_t *l;
	kmutex_t *lock;

	/* Deliver to any registered knotes first. */
	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cluster;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there?  (selclear() may have raced and cleared it.) */
		if (sip->sel_lwp != NULL) {
			/*
			 * Set the events for our LWP and indicate that.
			 * Otherwise, request for a full re-scan.
			 */
			l = sip->sel_lwp;
			oflag = l->l_selflag;

			if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
				/* No events to return. */
				mutex_spin_exit(lock);
				return;
			}

			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_remove(l->l_sleepq, l, true);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			/* Wake every cluster whose bit is set in the mask. */
			index = ffs64(mask) - 1;
			mask ^= __BIT(index);
			sc = selcluster[index];
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}
9403acbed8eSad 
9413acbed8eSad /*
9423acbed8eSad  * Remove an LWP from all objects that it is waiting for.  Concurrency
9433acbed8eSad  * issues:
9443acbed8eSad  *
9453acbed8eSad  * The object owner's (e.g. device driver) lock is not held here.  Calls
9463acbed8eSad  * can be made to selrecord() and we do not synchronize against those
9473acbed8eSad  * directly using locks.  However, we use `sel_lwp' to lock out changes.
9483acbed8eSad  * Before clearing it we must use memory barriers to ensure that we can
9493acbed8eSad  * safely traverse the list of selinfo records.
9503acbed8eSad  */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	/*
	 * If the request was non-blocking, or we found events on the first
	 * descriptor, there will be no need to clear anything - avoid
	 * taking the lock.
	 */
	if (SLIST_EMPTY(&l->l_selwait)) {
		return;
	}

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		/*
		 * Release the record for another named waiter to use.
		 * (Release pairs with the membar_acquire() in selrecord().)
		 */
		atomic_store_release(&sip->sel_lwp, NULL);
	}
	mutex_spin_exit(lock);
}
9903acbed8eSad 
9913acbed8eSad /*
9923acbed8eSad  * Initialize the select/poll system calls.  Called once for each
9933acbed8eSad  * CPU in the system, as they are attached.
9943acbed8eSad  */
9953acbed8eSad void
9963acbed8eSad selsysinit(struct cpu_info *ci)
9973acbed8eSad {
9982e2855a6Sad 	selcluster_t *sc;
9992e2855a6Sad 	u_int index;
10003acbed8eSad 
10012e2855a6Sad 	/* If already a cluster in place for this bit, re-use. */
10022e2855a6Sad 	index = cpu_index(ci) & SELCLUSTERMASK;
10032e2855a6Sad 	sc = selcluster[index];
10042e2855a6Sad 	if (sc == NULL) {
10052e2855a6Sad 		sc = kmem_alloc(roundup2(sizeof(selcluster_t),
10062e2855a6Sad 		    coherency_unit) + coherency_unit, KM_SLEEP);
1007feb4783fSad 		sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
10087364cd36Sad 		sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
100993e0e983Sad 		sleepq_init(&sc->sc_sleepq);
10103acbed8eSad 		sc->sc_ncoll = 0;
10116caaf101Smsaitoh 		sc->sc_mask = __BIT(index);
10122e2855a6Sad 		selcluster[index] = sc;
10132e2855a6Sad 	}
10142e2855a6Sad 	ci->ci_data.cpu_selcluster = sc;
10153acbed8eSad }
10163acbed8eSad 
10173acbed8eSad /*
10183acbed8eSad  * Initialize a selinfo record.
10193acbed8eSad  */
10203acbed8eSad void
10213acbed8eSad selinit(struct selinfo *sip)
10223acbed8eSad {
10233acbed8eSad 
10243acbed8eSad 	memset(sip, 0, sizeof(*sip));
1025576702f1Sthorpej 	klist_init(&sip->sel_klist);
10263acbed8eSad }
10273acbed8eSad 
10283acbed8eSad /*
10293acbed8eSad  * Destroy a selinfo record.  The owning object must not gain new
10303acbed8eSad  * references while this is in progress: all activity on the record
10313acbed8eSad  * must be stopped.
10323acbed8eSad  *
10333acbed8eSad  * Concurrency issues: we only need guard against a call to selclear()
10341ceff942Srmind  * by a thread exiting sel_do_scan().  The caller has prevented further
10351ceff942Srmind  * references being made to the selinfo record via selrecord(), and it
1036ce9dfd6aSrmind  * will not call selnotify() again.
10373acbed8eSad  */
void
seldestroy(struct selinfo *sip)
{
	selcluster_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	klist_fini(&sip->sel_klist);

	/* Fast path: no named waiter means nothing left to unhook. */
	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcluster pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cluster != NULL);
	sc = sip->sel_cluster;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	/* Re-check under the lock: selclear() may have cleared it. */
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
10713acbed8eSad 
1072501f07ceSrmind /*
1073501f07ceSrmind  * System control nodes.
1074501f07ceSrmind  */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	/*
	 * Create a dynamically-numbered read/write node under kern.
	 * "direct_select" toggles whether selnotify() writes events
	 * straight into the waiter's state or forces a full re-scan.
	 */
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "direct_select",
		SYSCTL_DESCR("Enable/disable direct select (for testing)"),
		NULL, 0, &direct_select, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
}
1085