xref: /openbsd-src/sys/kern/sys_generic.c (revision 24bb5fcea3ed904bc467217bdaadb5dfc618d5bf)
1 /*	$OpenBSD: sys_generic.c,v 1.135 2021/01/08 09:29:04 visa Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/time.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #ifdef KTRACE
59 #include <sys/ktrace.h>
60 #endif
61 #include <sys/sched.h>
62 #include <sys/pledge.h>
63 
64 #include <sys/mount.h>
65 #include <sys/syscallargs.h>
66 
67 #include <uvm/uvm_extern.h>
68 
/*
 * Prototypes for helpers defined later in this file: the select and poll
 * descriptor scanners, the poll revents copy-out routine, the common
 * back ends shared by select/pselect and poll/ppoll, and the wakeup
 * worker called from selwakeup().
 */
int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);
77 
78 int
79 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
80     unsigned int iovcnt, size_t *residp)
81 {
82 #ifdef KTRACE
83 	struct proc *p = curproc;
84 #endif
85 	struct iovec *iov;
86 	int error, i;
87 	size_t resid = 0;
88 
89 	if (iovcnt > UIO_SMALLIOV) {
90 		if (iovcnt > IOV_MAX)
91 			return (EINVAL);
92 		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
93 	} else if (iovcnt > 0) {
94 		iov = aiov;
95 	} else {
96 		return (EINVAL);
97 	}
98 	*iovp = iov;
99 
100 	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
101 		return (error);
102 
103 #ifdef KTRACE
104 	if (KTRPOINT(p, KTR_STRUCT))
105 		ktriovec(p, iov, iovcnt);
106 #endif
107 
108 	for (i = 0; i < iovcnt; i++) {
109 		resid += iov->iov_len;
110 		/*
111 		 * Writes return ssize_t because -1 is returned on error.
112 		 * Therefore we must restrict the length to SSIZE_MAX to
113 		 * avoid garbage return values.  Note that the addition is
114 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
115 		 */
116 		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
117 			return (EINVAL);
118 		iov++;
119 	}
120 
121 	if (residp != NULL)
122 		*residp = resid;
123 
124 	return (0);
125 }
126 
127 void
128 iovec_free(struct iovec *iov, unsigned int iovcnt)
129 {
130 	if (iovcnt > UIO_SMALLIOV)
131 		free(iov, M_IOV, iovcnt * sizeof(*iov));
132 }
133 
134 /*
135  * Read system call.
136  */
137 int
138 sys_read(struct proc *p, void *v, register_t *retval)
139 {
140 	struct sys_read_args /* {
141 		syscallarg(int) fd;
142 		syscallarg(void *) buf;
143 		syscallarg(size_t) nbyte;
144 	} */ *uap = v;
145 	struct iovec iov;
146 	struct uio auio;
147 
148 	iov.iov_base = SCARG(uap, buf);
149 	iov.iov_len = SCARG(uap, nbyte);
150 	if (iov.iov_len > SSIZE_MAX)
151 		return (EINVAL);
152 
153 	auio.uio_iov = &iov;
154 	auio.uio_iovcnt = 1;
155 	auio.uio_resid = iov.iov_len;
156 
157 	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
158 }
159 
160 /*
161  * Scatter read system call.
162  */
163 int
164 sys_readv(struct proc *p, void *v, register_t *retval)
165 {
166 	struct sys_readv_args /* {
167 		syscallarg(int) fd;
168 		syscallarg(const struct iovec *) iovp;
169 		syscallarg(int) iovcnt;
170 	} */ *uap = v;
171 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
172 	int error, iovcnt = SCARG(uap, iovcnt);
173 	struct uio auio;
174 	size_t resid;
175 
176 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
177 	if (error)
178 		goto done;
179 
180 	auio.uio_iov = iov;
181 	auio.uio_iovcnt = iovcnt;
182 	auio.uio_resid = resid;
183 
184 	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
185  done:
186 	iovec_free(iov, iovcnt);
187 	return (error);
188 }
189 
190 int
191 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
192     register_t *retval)
193 {
194 	struct filedesc *fdp = p->p_fd;
195 	struct file *fp;
196 	long cnt, error = 0;
197 	u_int iovlen;
198 #ifdef KTRACE
199 	struct iovec *ktriov = NULL;
200 #endif
201 
202 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
203 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
204 
205 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
206 		return (EBADF);
207 
208 	/* Checks for positioned read. */
209 	if (flags & FO_POSITION) {
210 		struct vnode *vp = fp->f_data;
211 
212 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
213 		    (vp->v_flag & VISTTY)) {
214 			error = ESPIPE;
215 			goto done;
216 		}
217 
218 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
219 			error = EINVAL;
220 			goto done;
221 		}
222 	}
223 
224 	uio->uio_rw = UIO_READ;
225 	uio->uio_segflg = UIO_USERSPACE;
226 	uio->uio_procp = p;
227 #ifdef KTRACE
228 	/*
229 	 * if tracing, save a copy of iovec
230 	 */
231 	if (KTRPOINT(p, KTR_GENIO)) {
232 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
233 		memcpy(ktriov, uio->uio_iov, iovlen);
234 	}
235 #endif
236 	cnt = uio->uio_resid;
237 	error = (*fp->f_ops->fo_read)(fp, uio, flags);
238 	if (error) {
239 		if (uio->uio_resid != cnt && (error == ERESTART ||
240 		    error == EINTR || error == EWOULDBLOCK))
241 			error = 0;
242 	}
243 	cnt -= uio->uio_resid;
244 
245 	mtx_enter(&fp->f_mtx);
246 	fp->f_rxfer++;
247 	fp->f_rbytes += cnt;
248 	mtx_leave(&fp->f_mtx);
249 #ifdef KTRACE
250 	if (ktriov != NULL) {
251 		if (error == 0)
252 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
253 		free(ktriov, M_TEMP, iovlen);
254 	}
255 #endif
256 	*retval = cnt;
257  done:
258 	FRELE(fp, p);
259 	return (error);
260 }
261 
262 /*
263  * Write system call
264  */
265 int
266 sys_write(struct proc *p, void *v, register_t *retval)
267 {
268 	struct sys_write_args /* {
269 		syscallarg(int) fd;
270 		syscallarg(const void *) buf;
271 		syscallarg(size_t) nbyte;
272 	} */ *uap = v;
273 	struct iovec iov;
274 	struct uio auio;
275 
276 	iov.iov_base = (void *)SCARG(uap, buf);
277 	iov.iov_len = SCARG(uap, nbyte);
278 	if (iov.iov_len > SSIZE_MAX)
279 		return (EINVAL);
280 
281 	auio.uio_iov = &iov;
282 	auio.uio_iovcnt = 1;
283 	auio.uio_resid = iov.iov_len;
284 
285 	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
286 }
287 
288 /*
289  * Gather write system call
290  */
291 int
292 sys_writev(struct proc *p, void *v, register_t *retval)
293 {
294 	struct sys_writev_args /* {
295 		syscallarg(int) fd;
296 		syscallarg(const struct iovec *) iovp;
297 		syscallarg(int) iovcnt;
298 	} */ *uap = v;
299 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
300 	int error, iovcnt = SCARG(uap, iovcnt);
301 	struct uio auio;
302 	size_t resid;
303 
304 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
305 	if (error)
306 		goto done;
307 
308 	auio.uio_iov = iov;
309 	auio.uio_iovcnt = iovcnt;
310 	auio.uio_resid = resid;
311 
312 	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
313  done:
314 	iovec_free(iov, iovcnt);
315  	return (error);
316 }
317 
318 int
319 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
320     register_t *retval)
321 {
322 	struct filedesc *fdp = p->p_fd;
323 	struct file *fp;
324 	long cnt, error = 0;
325 	u_int iovlen;
326 #ifdef KTRACE
327 	struct iovec *ktriov = NULL;
328 #endif
329 
330 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
331 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
332 
333 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
334 		return (EBADF);
335 
336 	/* Checks for positioned write. */
337 	if (flags & FO_POSITION) {
338 		struct vnode *vp = fp->f_data;
339 
340 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
341 		    (vp->v_flag & VISTTY)) {
342 			error = ESPIPE;
343 			goto done;
344 		}
345 
346 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
347 			error = EINVAL;
348 			goto done;
349 		}
350 	}
351 
352 	uio->uio_rw = UIO_WRITE;
353 	uio->uio_segflg = UIO_USERSPACE;
354 	uio->uio_procp = p;
355 #ifdef KTRACE
356 	/*
357 	 * if tracing, save a copy of iovec
358 	 */
359 	if (KTRPOINT(p, KTR_GENIO)) {
360 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
361 		memcpy(ktriov, uio->uio_iov, iovlen);
362 	}
363 #endif
364 	cnt = uio->uio_resid;
365 	error = (*fp->f_ops->fo_write)(fp, uio, flags);
366 	if (error) {
367 		if (uio->uio_resid != cnt && (error == ERESTART ||
368 		    error == EINTR || error == EWOULDBLOCK))
369 			error = 0;
370 		if (error == EPIPE) {
371 			KERNEL_LOCK();
372 			ptsignal(p, SIGPIPE, STHREAD);
373 			KERNEL_UNLOCK();
374 		}
375 	}
376 	cnt -= uio->uio_resid;
377 
378 	mtx_enter(&fp->f_mtx);
379 	fp->f_wxfer++;
380 	fp->f_wbytes += cnt;
381 	mtx_leave(&fp->f_mtx);
382 #ifdef KTRACE
383 	if (ktriov != NULL) {
384 		if (error == 0)
385 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
386 		free(ktriov, M_TEMP, iovlen);
387 	}
388 #endif
389 	*retval = cnt;
390  done:
391 	FRELE(fp, p);
392 	return (error);
393 }
394 
395 /*
396  * Ioctl system call
397  */
398 int
399 sys_ioctl(struct proc *p, void *v, register_t *retval)
400 {
401 	struct sys_ioctl_args /* {
402 		syscallarg(int) fd;
403 		syscallarg(u_long) com;
404 		syscallarg(void *) data;
405 	} */ *uap = v;
406 	struct file *fp;
407 	struct filedesc *fdp = p->p_fd;
408 	u_long com = SCARG(uap, com);
409 	int error = 0;
410 	u_int size = 0;
411 	caddr_t data, memp = NULL;
412 	int tmp;
413 #define STK_PARAMS	128
414 	long long stkbuf[STK_PARAMS / sizeof(long long)];
415 
416 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
417 		return (EBADF);
418 
419 	if (fp->f_type == DTYPE_SOCKET) {
420 		struct socket *so = fp->f_data;
421 
422 		if (so->so_state & SS_DNS) {
423 			error = EINVAL;
424 			goto out;
425 		}
426 	}
427 
428 	error = pledge_ioctl(p, com, fp);
429 	if (error)
430 		goto out;
431 
432 	switch (com) {
433 	case FIONCLEX:
434 	case FIOCLEX:
435 		fdplock(fdp);
436 		if (com == FIONCLEX)
437 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
438 		else
439 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
440 		fdpunlock(fdp);
441 		goto out;
442 	}
443 
444 	/*
445 	 * Interpret high order word to find amount of data to be
446 	 * copied to/from the user's address space.
447 	 */
448 	size = IOCPARM_LEN(com);
449 	if (size > IOCPARM_MAX) {
450 		error = ENOTTY;
451 		goto out;
452 	}
453 	if (size > sizeof (stkbuf)) {
454 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
455 		data = memp;
456 	} else
457 		data = (caddr_t)stkbuf;
458 	if (com&IOC_IN) {
459 		if (size) {
460 			error = copyin(SCARG(uap, data), data, size);
461 			if (error) {
462 				goto out;
463 			}
464 		} else
465 			*(caddr_t *)data = SCARG(uap, data);
466 	} else if ((com&IOC_OUT) && size)
467 		/*
468 		 * Zero the buffer so the user always
469 		 * gets back something deterministic.
470 		 */
471 		memset(data, 0, size);
472 	else if (com&IOC_VOID)
473 		*(caddr_t *)data = SCARG(uap, data);
474 
475 	switch (com) {
476 
477 	case FIONBIO:
478 		if ((tmp = *(int *)data) != 0)
479 			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
480 		else
481 			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
482 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
483 		break;
484 
485 	case FIOASYNC:
486 		if ((tmp = *(int *)data) != 0)
487 			atomic_setbits_int(&fp->f_flag, FASYNC);
488 		else
489 			atomic_clearbits_int(&fp->f_flag, FASYNC);
490 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
491 		break;
492 
493 	default:
494 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
495 		break;
496 	}
497 	/*
498 	 * Copy any data to user, size was
499 	 * already set and checked above.
500 	 */
501 	if (error == 0 && (com&IOC_OUT) && size)
502 		error = copyout(data, SCARG(uap, data), size);
503 out:
504 	FRELE(fp, p);
505 	free(memp, M_IOCTLOPS, size);
506 	return (error);
507 }
508 
/*
 * selwait: its address is the wait channel that dopselect() and doppoll()
 * pass to tsleep_nsec() and that doselwakeup() wakes.
 * nselcoll: incremented by doselwakeup() whenever it handles a selinfo
 * flagged SI_COLL; sleepers compare it against a saved copy to decide
 * whether to rescan.
 */
int	selwait, nselcoll;
510 
511 /*
512  * Select system call.
513  */
514 int
515 sys_select(struct proc *p, void *v, register_t *retval)
516 {
517 	struct sys_select_args /* {
518 		syscallarg(int) nd;
519 		syscallarg(fd_set *) in;
520 		syscallarg(fd_set *) ou;
521 		syscallarg(fd_set *) ex;
522 		syscallarg(struct timeval *) tv;
523 	} */ *uap = v;
524 
525 	struct timespec ts, *tsp = NULL;
526 	int error;
527 
528 	if (SCARG(uap, tv) != NULL) {
529 		struct timeval tv;
530 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
531 			return (error);
532 #ifdef KTRACE
533 		if (KTRPOINT(p, KTR_STRUCT))
534 			ktrreltimeval(p, &tv);
535 #endif
536 		if (tv.tv_sec < 0 || !timerisvalid(&tv))
537 			return (EINVAL);
538 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
539 		tsp = &ts;
540 	}
541 
542 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
543 	    SCARG(uap, ex), tsp, NULL, retval));
544 }
545 
546 int
547 sys_pselect(struct proc *p, void *v, register_t *retval)
548 {
549 	struct sys_pselect_args /* {
550 		syscallarg(int) nd;
551 		syscallarg(fd_set *) in;
552 		syscallarg(fd_set *) ou;
553 		syscallarg(fd_set *) ex;
554 		syscallarg(const struct timespec *) ts;
555 		syscallarg(const sigset_t *) mask;
556 	} */ *uap = v;
557 
558 	struct timespec ts, *tsp = NULL;
559 	sigset_t ss, *ssp = NULL;
560 	int error;
561 
562 	if (SCARG(uap, ts) != NULL) {
563 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
564 			return (error);
565 #ifdef KTRACE
566 		if (KTRPOINT(p, KTR_STRUCT))
567 			ktrreltimespec(p, &ts);
568 #endif
569 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
570 			return (EINVAL);
571 		tsp = &ts;
572 	}
573 	if (SCARG(uap, mask) != NULL) {
574 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
575 			return (error);
576 		ssp = &ss;
577 	}
578 
579 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
580 	    SCARG(uap, ex), tsp, ssp, retval));
581 }
582 
583 int
584 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
585     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
586 {
587 	fd_mask bits[6];
588 	fd_set *pibits[3], *pobits[3];
589 	struct timespec elapsed, start, stop;
590 	uint64_t nsecs;
591 	int s, ncoll, error = 0;
592 	u_int ni;
593 
594 	if (nd < 0)
595 		return (EINVAL);
596 	if (nd > p->p_fd->fd_nfiles) {
597 		/* forgiving; slightly wrong */
598 		nd = p->p_fd->fd_nfiles;
599 	}
600 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
601 	if (ni > sizeof(bits[0])) {
602 		caddr_t mbits;
603 
604 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
605 		pibits[0] = (fd_set *)&mbits[ni * 0];
606 		pibits[1] = (fd_set *)&mbits[ni * 1];
607 		pibits[2] = (fd_set *)&mbits[ni * 2];
608 		pobits[0] = (fd_set *)&mbits[ni * 3];
609 		pobits[1] = (fd_set *)&mbits[ni * 4];
610 		pobits[2] = (fd_set *)&mbits[ni * 5];
611 	} else {
612 		memset(bits, 0, sizeof(bits));
613 		pibits[0] = (fd_set *)&bits[0];
614 		pibits[1] = (fd_set *)&bits[1];
615 		pibits[2] = (fd_set *)&bits[2];
616 		pobits[0] = (fd_set *)&bits[3];
617 		pobits[1] = (fd_set *)&bits[4];
618 		pobits[2] = (fd_set *)&bits[5];
619 	}
620 
621 #define	getbits(name, x) \
622 	if (name && (error = copyin(name, pibits[x], ni))) \
623 		goto done;
624 	getbits(in, 0);
625 	getbits(ou, 1);
626 	getbits(ex, 2);
627 #undef	getbits
628 #ifdef KTRACE
629 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
630 		if (in) ktrfdset(p, pibits[0], ni);
631 		if (ou) ktrfdset(p, pibits[1], ni);
632 		if (ex) ktrfdset(p, pibits[2], ni);
633 	}
634 #endif
635 
636 	if (sigmask)
637 		dosigsuspend(p, *sigmask &~ sigcantmask);
638 
639 retry:
640 	ncoll = nselcoll;
641 	atomic_setbits_int(&p->p_flag, P_SELECT);
642 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
643 	if (error || *retval)
644 		goto done;
645 	if (timeout == NULL || timespecisset(timeout)) {
646 		if (timeout != NULL) {
647 			getnanouptime(&start);
648 			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
649 		} else
650 			nsecs = INFSLP;
651 		s = splhigh();
652 		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
653 			splx(s);
654 			goto retry;
655 		}
656 		atomic_clearbits_int(&p->p_flag, P_SELECT);
657 		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
658 		splx(s);
659 		if (timeout != NULL) {
660 			getnanouptime(&stop);
661 			timespecsub(&stop, &start, &elapsed);
662 			timespecsub(timeout, &elapsed, timeout);
663 			if (timeout->tv_sec < 0)
664 				timespecclear(timeout);
665 		}
666 		if (error == 0 || error == EWOULDBLOCK)
667 			goto retry;
668 	}
669 done:
670 	atomic_clearbits_int(&p->p_flag, P_SELECT);
671 	/* select is not restarted after signals... */
672 	if (error == ERESTART)
673 		error = EINTR;
674 	if (error == EWOULDBLOCK)
675 		error = 0;
676 #define	putbits(name, x) \
677 	if (name && (error2 = copyout(pobits[x], name, ni))) \
678 		error = error2;
679 	if (error == 0) {
680 		int error2;
681 
682 		putbits(in, 0);
683 		putbits(ou, 1);
684 		putbits(ex, 2);
685 #undef putbits
686 #ifdef KTRACE
687 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
688 			if (in) ktrfdset(p, pobits[0], ni);
689 			if (ou) ktrfdset(p, pobits[1], ni);
690 			if (ex) ktrfdset(p, pobits[2], ni);
691 		}
692 #endif
693 	}
694 
695 	if (pibits[0] != (fd_set *)&bits[0])
696 		free(pibits[0], M_TEMP, 6 * ni);
697 	return (error);
698 }
699 
700 int
701 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
702     register_t *retval)
703 {
704 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
705 	struct filedesc *fdp = p->p_fd;
706 	int msk, i, j, fd;
707 	fd_mask bits;
708 	struct file *fp;
709 	int n = 0;
710 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
711 
712 	for (msk = 0; msk < 3; msk++) {
713 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
714 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
715 
716 		for (i = 0; i < nfd; i += NFDBITS) {
717 			bits = pibits->fds_bits[i/NFDBITS];
718 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
719 				bits &= ~(1 << j);
720 				if ((fp = fd_getfile(fdp, fd)) == NULL)
721 					return (EBADF);
722 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
723 					FD_SET(fd, pobits);
724 					n++;
725 				}
726 				FRELE(fp, p);
727 			}
728 		}
729 	}
730 	*retval = n;
731 	return (0);
732 }
733 
/*
 * Poll routine that reports every ordinary read or write event the
 * caller asked about: returns the requested events restricted to
 * POLLIN, POLLOUT, POLLRDNORM and POLLWRNORM.  dev and p are unused.
 */
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & ready);
}
740 
/*
 * Poll routine that never reports an event: always returns 0,
 * whatever events were requested.  All arguments are unused.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	return (0);
}
747 
748 /*
749  * Record a select request.
750  */
751 void
752 selrecord(struct proc *selector, struct selinfo *sip)
753 {
754 	struct proc *p;
755 	pid_t mytid;
756 
757 	KERNEL_ASSERT_LOCKED();
758 
759 	mytid = selector->p_tid;
760 	if (sip->si_seltid == mytid)
761 		return;
762 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
763 	    p->p_wchan == (caddr_t)&selwait)
764 		sip->si_flags |= SI_COLL;
765 	else
766 		sip->si_seltid = mytid;
767 }
768 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Takes the kernel lock, posts NOTE_SUBMIT to the knotes attached to
 * sip->si_note via KNOTE(), runs doselwakeup() on sip, and releases
 * the lock again.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}
780 
781 void
782 doselwakeup(struct selinfo *sip)
783 {
784 	struct proc *p;
785 
786 	KERNEL_ASSERT_LOCKED();
787 
788 	if (sip->si_seltid == 0)
789 		return;
790 	if (sip->si_flags & SI_COLL) {
791 		nselcoll++;
792 		sip->si_flags &= ~SI_COLL;
793 		wakeup(&selwait);
794 	}
795 	p = tfind(sip->si_seltid);
796 	sip->si_seltid = 0;
797 	if (p != NULL) {
798 		if (wakeup_proc(p, &selwait)) {
799 			/* nothing else to do */
800 		} else if (p->p_flag & P_SELECT)
801 			atomic_clearbits_int(&p->p_flag, P_SELECT);
802 	}
803 }
804 
805 void
806 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
807 {
808 	struct filedesc *fdp = p->p_fd;
809 	struct file *fp;
810 	u_int i;
811 	int n = 0;
812 
813 	for (i = 0; i < nfd; i++, pl++) {
814 		/* Check the file descriptor. */
815 		if (pl->fd < 0) {
816 			pl->revents = 0;
817 			continue;
818 		}
819 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
820 			pl->revents = POLLNVAL;
821 			n++;
822 			continue;
823 		}
824 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
825 		FRELE(fp, p);
826 		if (pl->revents != 0)
827 			n++;
828 	}
829 	*retval = n;
830 }
831 
832 /*
833  * Only copyout the revents field.
834  */
835 int
836 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
837 {
838 	int error = 0;
839 	u_int i = 0;
840 
841 	while (!error && i++ < nfds) {
842 		error = copyout(&pl->revents, &upl->revents,
843 		    sizeof(upl->revents));
844 		pl++;
845 		upl++;
846 	}
847 
848 	return (error);
849 }
850 
851 /*
852  * We are using the same mechanism as select only we encode/decode args
853  * differently.
854  */
855 int
856 sys_poll(struct proc *p, void *v, register_t *retval)
857 {
858 	struct sys_poll_args /* {
859 		syscallarg(struct pollfd *) fds;
860 		syscallarg(u_int) nfds;
861 		syscallarg(int) timeout;
862 	} */ *uap = v;
863 
864 	struct timespec ts, *tsp = NULL;
865 	int msec = SCARG(uap, timeout);
866 
867 	if (msec != INFTIM) {
868 		if (msec < 0)
869 			return (EINVAL);
870 		ts.tv_sec = msec / 1000;
871 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
872 		tsp = &ts;
873 	}
874 
875 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
876 	    retval));
877 }
878 
879 int
880 sys_ppoll(struct proc *p, void *v, register_t *retval)
881 {
882 	struct sys_ppoll_args /* {
883 		syscallarg(struct pollfd *) fds;
884 		syscallarg(u_int) nfds;
885 		syscallarg(const struct timespec *) ts;
886 		syscallarg(const sigset_t *) mask;
887 	} */ *uap = v;
888 
889 	int error;
890 	struct timespec ts, *tsp = NULL;
891 	sigset_t ss, *ssp = NULL;
892 
893 	if (SCARG(uap, ts) != NULL) {
894 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
895 			return (error);
896 #ifdef KTRACE
897 		if (KTRPOINT(p, KTR_STRUCT))
898 			ktrreltimespec(p, &ts);
899 #endif
900 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
901 			return (EINVAL);
902 		tsp = &ts;
903 	}
904 
905 	if (SCARG(uap, mask) != NULL) {
906 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
907 			return (error);
908 		ssp = &ss;
909 	}
910 
911 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
912 	    retval));
913 }
914 
915 int
916 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
917     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
918 {
919 	size_t sz;
920 	struct pollfd pfds[4], *pl = pfds;
921 	struct timespec elapsed, start, stop;
922 	uint64_t nsecs;
923 	int ncoll, i, s, error;
924 
925 	/* Standards say no more than MAX_OPEN; this is possibly better. */
926 	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
927 		return (EINVAL);
928 
929 	/* optimize for the default case, of a small nfds value */
930 	if (nfds > nitems(pfds)) {
931 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
932 		    M_WAITOK | M_CANFAIL);
933 		if (pl == NULL)
934 			return (EINVAL);
935 	}
936 
937 	sz = nfds * sizeof(*pl);
938 
939 	if ((error = copyin(fds, pl, sz)) != 0)
940 		goto bad;
941 
942 	for (i = 0; i < nfds; i++) {
943 		pl[i].events &= ~POLL_NOHUP;
944 		pl[i].revents = 0;
945 	}
946 
947 	if (sigmask)
948 		dosigsuspend(p, *sigmask &~ sigcantmask);
949 
950 retry:
951 	ncoll = nselcoll;
952 	atomic_setbits_int(&p->p_flag, P_SELECT);
953 	pollscan(p, pl, nfds, retval);
954 	if (*retval)
955 		goto done;
956 	if (timeout == NULL || timespecisset(timeout)) {
957 		if (timeout != NULL) {
958 			getnanouptime(&start);
959 			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
960 		} else
961 			nsecs = INFSLP;
962 		s = splhigh();
963 		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
964 			splx(s);
965 			goto retry;
966 		}
967 		atomic_clearbits_int(&p->p_flag, P_SELECT);
968 		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
969 		splx(s);
970 		if (timeout != NULL) {
971 			getnanouptime(&stop);
972 			timespecsub(&stop, &start, &elapsed);
973 			timespecsub(timeout, &elapsed, timeout);
974 			if (timeout->tv_sec < 0)
975 				timespecclear(timeout);
976 		}
977 		if (error == 0 || error == EWOULDBLOCK)
978 			goto retry;
979 	}
980 
981 done:
982 	atomic_clearbits_int(&p->p_flag, P_SELECT);
983 	/*
984 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
985 	 *       ignored (since the whole point is to see what would block).
986 	 */
987 	switch (error) {
988 	case ERESTART:
989 		error = pollout(pl, fds, nfds);
990 		if (error == 0)
991 			error = EINTR;
992 		break;
993 	case EWOULDBLOCK:
994 	case 0:
995 		error = pollout(pl, fds, nfds);
996 		break;
997 	}
998 #ifdef KTRACE
999 	if (KTRPOINT(p, KTR_STRUCT))
1000 		ktrpollfd(p, pl, nfds);
1001 #endif /* KTRACE */
1002 bad:
1003 	if (pl != pfds)
1004 		free(pl, M_TEMP, sz);
1005 	return (error);
1006 }
1007 
1008 /*
1009  * utrace system call
1010  */
1011 int
1012 sys_utrace(struct proc *curp, void *v, register_t *retval)
1013 {
1014 #ifdef KTRACE
1015 	struct sys_utrace_args /* {
1016 		syscallarg(const char *) label;
1017 		syscallarg(const void *) addr;
1018 		syscallarg(size_t) len;
1019 	} */ *uap = v;
1020 
1021 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1022 	    SCARG(uap, len)));
1023 #else
1024 	return (0);
1025 #endif
1026 }
1027