xref: /openbsd-src/sys/kern/sys_generic.c (revision d1df930ffab53da22f3324c32bed7ac5709915e6)
1 /*	$OpenBSD: sys_generic.c,v 1.122 2018/08/20 16:00:22 mpi Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 #include <sys/sched.h>
61 #include <sys/pledge.h>
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 #include <uvm/uvm_extern.h>
67 
68 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
69 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
70 int pollout(struct pollfd *, struct pollfd *, u_int);
71 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
72     const struct timespec *, const sigset_t *, register_t *);
73 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
74     const sigset_t *, register_t *);
75 
76 int
77 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
78     unsigned int iovcnt, size_t *residp)
79 {
80 #ifdef KTRACE
81 	struct proc *p = curproc;
82 #endif
83 	struct iovec *iov;
84 	int error, i;
85 	size_t resid = 0;
86 
87 	if (iovcnt > UIO_SMALLIOV) {
88 		if (iovcnt > IOV_MAX)
89 			return (EINVAL);
90 		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
91 	} else if (iovcnt > 0) {
92 		iov = aiov;
93 	} else {
94 		return (EINVAL);
95 	}
96 	*iovp = iov;
97 
98 	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
99 		return (error);
100 
101 #ifdef KTRACE
102 	if (KTRPOINT(p, KTR_STRUCT))
103 		ktriovec(p, iov, iovcnt);
104 #endif
105 
106 	for (i = 0; i < iovcnt; i++) {
107 		resid += iov->iov_len;
108 		/*
109 		 * Writes return ssize_t because -1 is returned on error.
110 		 * Therefore we must restrict the length to SSIZE_MAX to
111 		 * avoid garbage return values.  Note that the addition is
112 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
113 		 */
114 		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
115 			return (EINVAL);
116 		iov++;
117 	}
118 
119 	if (residp != NULL)
120 		*residp = resid;
121 
122 	return (0);
123 }
124 
125 void
126 iovec_free(struct iovec *iov, unsigned int iovcnt)
127 {
128 	if (iovcnt > UIO_SMALLIOV)
129 		free(iov, M_IOV, iovcnt * sizeof(*iov));
130 }
131 
132 /*
133  * Read system call.
134  */
135 int
136 sys_read(struct proc *p, void *v, register_t *retval)
137 {
138 	struct sys_read_args /* {
139 		syscallarg(int) fd;
140 		syscallarg(void *) buf;
141 		syscallarg(size_t) nbyte;
142 	} */ *uap = v;
143 	struct iovec iov;
144 	struct uio auio;
145 
146 	iov.iov_base = SCARG(uap, buf);
147 	iov.iov_len = SCARG(uap, nbyte);
148 	if (iov.iov_len > SSIZE_MAX)
149 		return (EINVAL);
150 
151 	auio.uio_iov = &iov;
152 	auio.uio_iovcnt = 1;
153 	auio.uio_resid = iov.iov_len;
154 
155 	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
156 }
157 
158 /*
159  * Scatter read system call.
160  */
161 int
162 sys_readv(struct proc *p, void *v, register_t *retval)
163 {
164 	struct sys_readv_args /* {
165 		syscallarg(int) fd;
166 		syscallarg(const struct iovec *) iovp;
167 		syscallarg(int) iovcnt;
168 	} */ *uap = v;
169 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
170 	int error, iovcnt = SCARG(uap, iovcnt);
171 	struct uio auio;
172 	size_t resid;
173 
174 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
175 	if (error)
176 		goto done;
177 
178 	auio.uio_iov = iov;
179 	auio.uio_iovcnt = iovcnt;
180 	auio.uio_resid = resid;
181 
182 	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
183  done:
184 	iovec_free(iov, iovcnt);
185 	return (error);
186 }
187 
188 int
189 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
190     register_t *retval)
191 {
192 	struct filedesc *fdp = p->p_fd;
193 	struct file *fp;
194 	long cnt, error = 0;
195 	u_int iovlen;
196 #ifdef KTRACE
197 	struct iovec *ktriov = NULL;
198 #endif
199 
200 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
201 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
202 
203 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
204 		return (EBADF);
205 
206 	/* Checks for positioned read. */
207 	if (flags & FO_POSITION) {
208 		struct vnode *vp = fp->f_data;
209 
210 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
211 		    (vp->v_flag & VISTTY)) {
212 			error = ESPIPE;
213 			goto done;
214 		}
215 
216 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
217 			error = EINVAL;
218 			goto done;
219 		}
220 	}
221 
222 	uio->uio_rw = UIO_READ;
223 	uio->uio_segflg = UIO_USERSPACE;
224 	uio->uio_procp = p;
225 #ifdef KTRACE
226 	/*
227 	 * if tracing, save a copy of iovec
228 	 */
229 	if (KTRPOINT(p, KTR_GENIO)) {
230 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
231 		memcpy(ktriov, uio->uio_iov, iovlen);
232 	}
233 #endif
234 	cnt = uio->uio_resid;
235 	error = (*fp->f_ops->fo_read)(fp, uio, flags);
236 	if (error) {
237 		if (uio->uio_resid != cnt && (error == ERESTART ||
238 		    error == EINTR || error == EWOULDBLOCK))
239 			error = 0;
240 	}
241 	cnt -= uio->uio_resid;
242 
243 	mtx_enter(&fp->f_mtx);
244 	fp->f_rxfer++;
245 	fp->f_rbytes += cnt;
246 	mtx_leave(&fp->f_mtx);
247 #ifdef KTRACE
248 	if (ktriov != NULL) {
249 		if (error == 0)
250 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
251 		free(ktriov, M_TEMP, iovlen);
252 	}
253 #endif
254 	*retval = cnt;
255  done:
256 	FRELE(fp, p);
257 	return (error);
258 }
259 
260 /*
261  * Write system call
262  */
263 int
264 sys_write(struct proc *p, void *v, register_t *retval)
265 {
266 	struct sys_write_args /* {
267 		syscallarg(int) fd;
268 		syscallarg(const void *) buf;
269 		syscallarg(size_t) nbyte;
270 	} */ *uap = v;
271 	struct iovec iov;
272 	struct uio auio;
273 
274 	iov.iov_base = (void *)SCARG(uap, buf);
275 	iov.iov_len = SCARG(uap, nbyte);
276 	if (iov.iov_len > SSIZE_MAX)
277 		return (EINVAL);
278 
279 	auio.uio_iov = &iov;
280 	auio.uio_iovcnt = 1;
281 	auio.uio_resid = iov.iov_len;
282 
283 	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
284 }
285 
286 /*
287  * Gather write system call
288  */
289 int
290 sys_writev(struct proc *p, void *v, register_t *retval)
291 {
292 	struct sys_writev_args /* {
293 		syscallarg(int) fd;
294 		syscallarg(const struct iovec *) iovp;
295 		syscallarg(int) iovcnt;
296 	} */ *uap = v;
297 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
298 	int error, iovcnt = SCARG(uap, iovcnt);
299 	struct uio auio;
300 	size_t resid;
301 
302 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
303 	if (error)
304 		goto done;
305 
306 	auio.uio_iov = iov;
307 	auio.uio_iovcnt = iovcnt;
308 	auio.uio_resid = resid;
309 
310 	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
311  done:
312 	iovec_free(iov, iovcnt);
313  	return (error);
314 }
315 
316 int
317 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
318     register_t *retval)
319 {
320 	struct filedesc *fdp = p->p_fd;
321 	struct file *fp;
322 	long cnt, error = 0;
323 	u_int iovlen;
324 #ifdef KTRACE
325 	struct iovec *ktriov = NULL;
326 #endif
327 
328 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
329 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
330 
331 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
332 		return (EBADF);
333 
334 	/* Checks for positioned write. */
335 	if (flags & FO_POSITION) {
336 		struct vnode *vp = fp->f_data;
337 
338 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
339 		    (vp->v_flag & VISTTY)) {
340 			error = ESPIPE;
341 			goto done;
342 		}
343 
344 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
345 			error = EINVAL;
346 			goto done;
347 		}
348 	}
349 
350 	uio->uio_rw = UIO_WRITE;
351 	uio->uio_segflg = UIO_USERSPACE;
352 	uio->uio_procp = p;
353 #ifdef KTRACE
354 	/*
355 	 * if tracing, save a copy of iovec
356 	 */
357 	if (KTRPOINT(p, KTR_GENIO)) {
358 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
359 		memcpy(ktriov, uio->uio_iov, iovlen);
360 	}
361 #endif
362 	cnt = uio->uio_resid;
363 	error = (*fp->f_ops->fo_write)(fp, uio, flags);
364 	if (error) {
365 		if (uio->uio_resid != cnt && (error == ERESTART ||
366 		    error == EINTR || error == EWOULDBLOCK))
367 			error = 0;
368 		if (error == EPIPE)
369 			ptsignal(p, SIGPIPE, STHREAD);
370 	}
371 	cnt -= uio->uio_resid;
372 
373 	mtx_enter(&fp->f_mtx);
374 	fp->f_wxfer++;
375 	fp->f_wbytes += cnt;
376 	mtx_leave(&fp->f_mtx);
377 #ifdef KTRACE
378 	if (ktriov != NULL) {
379 		if (error == 0)
380 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
381 		free(ktriov, M_TEMP, iovlen);
382 	}
383 #endif
384 	*retval = cnt;
385  done:
386 	FRELE(fp, p);
387 	return (error);
388 }
389 
390 /*
391  * Ioctl system call
392  */
393 int
394 sys_ioctl(struct proc *p, void *v, register_t *retval)
395 {
396 	struct sys_ioctl_args /* {
397 		syscallarg(int) fd;
398 		syscallarg(u_long) com;
399 		syscallarg(void *) data;
400 	} */ *uap = v;
401 	struct file *fp;
402 	struct filedesc *fdp = p->p_fd;
403 	u_long com = SCARG(uap, com);
404 	int error = 0;
405 	u_int size = 0;
406 	caddr_t data, memp = NULL;
407 	int tmp;
408 #define STK_PARAMS	128
409 	long long stkbuf[STK_PARAMS / sizeof(long long)];
410 
411 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
412 		return (EBADF);
413 
414 	if (fp->f_type == DTYPE_SOCKET) {
415 		struct socket *so = fp->f_data;
416 
417 		if (so->so_state & SS_DNS) {
418 			error = EINVAL;
419 			goto out;
420 		}
421 	}
422 
423 	error = pledge_ioctl(p, com, fp);
424 	if (error)
425 		goto out;
426 
427 	switch (com) {
428 	case FIONCLEX:
429 	case FIOCLEX:
430 		fdplock(fdp);
431 		if (com == FIONCLEX)
432 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
433 		else
434 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
435 		fdpunlock(fdp);
436 		goto out;
437 	}
438 
439 	/*
440 	 * Interpret high order word to find amount of data to be
441 	 * copied to/from the user's address space.
442 	 */
443 	size = IOCPARM_LEN(com);
444 	if (size > IOCPARM_MAX) {
445 		error = ENOTTY;
446 		goto out;
447 	}
448 	if (size > sizeof (stkbuf)) {
449 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
450 		data = memp;
451 	} else
452 		data = (caddr_t)stkbuf;
453 	if (com&IOC_IN) {
454 		if (size) {
455 			error = copyin(SCARG(uap, data), data, size);
456 			if (error) {
457 				goto out;
458 			}
459 		} else
460 			*(caddr_t *)data = SCARG(uap, data);
461 	} else if ((com&IOC_OUT) && size)
462 		/*
463 		 * Zero the buffer so the user always
464 		 * gets back something deterministic.
465 		 */
466 		memset(data, 0, size);
467 	else if (com&IOC_VOID)
468 		*(caddr_t *)data = SCARG(uap, data);
469 
470 	switch (com) {
471 
472 	case FIONBIO:
473 		if ((tmp = *(int *)data) != 0)
474 			fp->f_flag |= FNONBLOCK;
475 		else
476 			fp->f_flag &= ~FNONBLOCK;
477 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
478 		break;
479 
480 	case FIOASYNC:
481 		if ((tmp = *(int *)data) != 0)
482 			fp->f_flag |= FASYNC;
483 		else
484 			fp->f_flag &= ~FASYNC;
485 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
486 		break;
487 
488 	case FIOSETOWN:
489 		tmp = *(int *)data;
490 
491 		if (fp->f_type == DTYPE_SOCKET || fp->f_type == DTYPE_PIPE) {
492 			/* nothing */
493 		} else if (tmp <= 0) {
494 			tmp = -tmp;
495 		} else {
496 			struct process *pr = prfind(tmp);
497 			if (pr == NULL) {
498 				error = ESRCH;
499 				break;
500 			}
501 			tmp = pr->ps_pgrp->pg_id;
502 		}
503 		error = (*fp->f_ops->fo_ioctl)
504 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
505 		break;
506 
507 	case FIOGETOWN:
508 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
509 		*(int *)data = -*(int *)data;
510 		break;
511 
512 	default:
513 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
514 		break;
515 	}
516 	/*
517 	 * Copy any data to user, size was
518 	 * already set and checked above.
519 	 */
520 	if (error == 0 && (com&IOC_OUT) && size)
521 		error = copyout(data, SCARG(uap, data), size);
522 out:
523 	FRELE(fp, p);
524 	free(memp, M_IOCTLOPS, size);
525 	return (error);
526 }
527 
int	selwait;	/* its address is the sleep channel for select/poll */
int	nselcoll;	/* bumped by selwakeup() whenever SI_COLL was set */
529 
530 /*
531  * Select system call.
532  */
533 int
534 sys_select(struct proc *p, void *v, register_t *retval)
535 {
536 	struct sys_select_args /* {
537 		syscallarg(int) nd;
538 		syscallarg(fd_set *) in;
539 		syscallarg(fd_set *) ou;
540 		syscallarg(fd_set *) ex;
541 		syscallarg(struct timeval *) tv;
542 	} */ *uap = v;
543 
544 	struct timespec ts, *tsp = NULL;
545 	int error;
546 
547 	if (SCARG(uap, tv) != NULL) {
548 		struct timeval tv;
549 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
550 			return (error);
551 		if ((error = itimerfix(&tv)) != 0)
552 			return (error);
553 #ifdef KTRACE
554 		if (KTRPOINT(p, KTR_STRUCT))
555 			ktrreltimeval(p, &tv);
556 #endif
557 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
558 		tsp = &ts;
559 	}
560 
561 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
562 	    SCARG(uap, ex), tsp, NULL, retval));
563 }
564 
565 int
566 sys_pselect(struct proc *p, void *v, register_t *retval)
567 {
568 	struct sys_pselect_args /* {
569 		syscallarg(int) nd;
570 		syscallarg(fd_set *) in;
571 		syscallarg(fd_set *) ou;
572 		syscallarg(fd_set *) ex;
573 		syscallarg(const struct timespec *) ts;
574 		syscallarg(const sigset_t *) mask;
575 	} */ *uap = v;
576 
577 	struct timespec ts, *tsp = NULL;
578 	sigset_t ss, *ssp = NULL;
579 	int error;
580 
581 	if (SCARG(uap, ts) != NULL) {
582 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
583 			return (error);
584 		if ((error = timespecfix(&ts)) != 0)
585 			return (error);
586 #ifdef KTRACE
587 		if (KTRPOINT(p, KTR_STRUCT))
588 			ktrreltimespec(p, &ts);
589 #endif
590 		tsp = &ts;
591 	}
592 	if (SCARG(uap, mask) != NULL) {
593 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
594 			return (error);
595 		ssp = &ss;
596 	}
597 
598 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
599 	    SCARG(uap, ex), tsp, ssp, retval));
600 }
601 
602 int
603 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
604     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
605 {
606 	fd_mask bits[6];
607 	fd_set *pibits[3], *pobits[3];
608 	struct timespec ats, rts, tts;
609 	int s, ncoll, error = 0, timo;
610 	u_int ni;
611 
612 	if (nd < 0)
613 		return (EINVAL);
614 	if (nd > p->p_fd->fd_nfiles) {
615 		/* forgiving; slightly wrong */
616 		nd = p->p_fd->fd_nfiles;
617 	}
618 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
619 	if (ni > sizeof(bits[0])) {
620 		caddr_t mbits;
621 
622 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
623 		pibits[0] = (fd_set *)&mbits[ni * 0];
624 		pibits[1] = (fd_set *)&mbits[ni * 1];
625 		pibits[2] = (fd_set *)&mbits[ni * 2];
626 		pobits[0] = (fd_set *)&mbits[ni * 3];
627 		pobits[1] = (fd_set *)&mbits[ni * 4];
628 		pobits[2] = (fd_set *)&mbits[ni * 5];
629 	} else {
630 		memset(bits, 0, sizeof(bits));
631 		pibits[0] = (fd_set *)&bits[0];
632 		pibits[1] = (fd_set *)&bits[1];
633 		pibits[2] = (fd_set *)&bits[2];
634 		pobits[0] = (fd_set *)&bits[3];
635 		pobits[1] = (fd_set *)&bits[4];
636 		pobits[2] = (fd_set *)&bits[5];
637 	}
638 
639 #define	getbits(name, x) \
640 	if (name && (error = copyin(name, pibits[x], ni))) \
641 		goto done;
642 	getbits(in, 0);
643 	getbits(ou, 1);
644 	getbits(ex, 2);
645 #undef	getbits
646 #ifdef KTRACE
647 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
648 		if (in) ktrfdset(p, pibits[0], ni);
649 		if (ou) ktrfdset(p, pibits[1], ni);
650 		if (ex) ktrfdset(p, pibits[2], ni);
651 	}
652 #endif
653 
654 	if (tsp) {
655 		getnanouptime(&rts);
656 		timespecadd(tsp, &rts, &ats);
657 	} else {
658 		ats.tv_sec = 0;
659 		ats.tv_nsec = 0;
660 	}
661 	timo = 0;
662 
663 	if (sigmask)
664 		dosigsuspend(p, *sigmask &~ sigcantmask);
665 
666 retry:
667 	ncoll = nselcoll;
668 	atomic_setbits_int(&p->p_flag, P_SELECT);
669 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
670 	if (error || *retval)
671 		goto done;
672 	if (tsp) {
673 		getnanouptime(&rts);
674 		if (timespeccmp(&rts, &ats, >=))
675 			goto done;
676 		timespecsub(&ats, &rts, &tts);
677 		timo = tts.tv_sec > 24 * 60 * 60 ?
678 			24 * 60 * 60 * hz : tstohz(&tts);
679 	}
680 	s = splhigh();
681 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
682 		splx(s);
683 		goto retry;
684 	}
685 	atomic_clearbits_int(&p->p_flag, P_SELECT);
686 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
687 	splx(s);
688 	if (error == 0)
689 		goto retry;
690 done:
691 	atomic_clearbits_int(&p->p_flag, P_SELECT);
692 	/* select is not restarted after signals... */
693 	if (error == ERESTART)
694 		error = EINTR;
695 	if (error == EWOULDBLOCK)
696 		error = 0;
697 #define	putbits(name, x) \
698 	if (name && (error2 = copyout(pobits[x], name, ni))) \
699 		error = error2;
700 	if (error == 0) {
701 		int error2;
702 
703 		putbits(in, 0);
704 		putbits(ou, 1);
705 		putbits(ex, 2);
706 #undef putbits
707 #ifdef KTRACE
708 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
709 			if (in) ktrfdset(p, pobits[0], ni);
710 			if (ou) ktrfdset(p, pobits[1], ni);
711 			if (ex) ktrfdset(p, pobits[2], ni);
712 		}
713 #endif
714 	}
715 
716 	if (pibits[0] != (fd_set *)&bits[0])
717 		free(pibits[0], M_TEMP, 6 * ni);
718 	return (error);
719 }
720 
721 int
722 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
723     register_t *retval)
724 {
725 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
726 	struct filedesc *fdp = p->p_fd;
727 	int msk, i, j, fd;
728 	fd_mask bits;
729 	struct file *fp;
730 	int n = 0;
731 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
732 
733 	for (msk = 0; msk < 3; msk++) {
734 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
735 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
736 
737 		for (i = 0; i < nfd; i += NFDBITS) {
738 			bits = pibits->fds_bits[i/NFDBITS];
739 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
740 				bits &= ~(1 << j);
741 				if ((fp = fd_getfile(fdp, fd)) == NULL)
742 					return (EBADF);
743 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
744 					FD_SET(fd, pobits);
745 					n++;
746 				}
747 				FRELE(fp, p);
748 			}
749 		}
750 	}
751 	*retval = n;
752 	return (0);
753 }
754 
/*
 * Poll routine that always reports ordinary read and write readiness:
 * returns the subset of the requested events covered by POLLIN, POLLOUT,
 * POLLRDNORM and POLLWRNORM.  dev and p are not consulted.
 */
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
761 
/*
 * Poll routine that never reports any event: always returns 0, whatever
 * the arguments.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	const int revents = 0;

	return (revents);
}
768 
769 /*
770  * Record a select request.
771  */
772 void
773 selrecord(struct proc *selector, struct selinfo *sip)
774 {
775 	struct proc *p;
776 	pid_t mytid;
777 
778 	mytid = selector->p_tid;
779 	if (sip->si_seltid == mytid)
780 		return;
781 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
782 	    p->p_wchan == (caddr_t)&selwait)
783 		sip->si_flags |= SI_COLL;
784 	else
785 		sip->si_seltid = mytid;
786 }
787 
788 /*
789  * Do a wakeup when a selectable event occurs.
790  */
791 void
792 selwakeup(struct selinfo *sip)
793 {
794 	struct proc *p;
795 	int s;
796 
797 	KNOTE(&sip->si_note, NOTE_SUBMIT);
798 	if (sip->si_seltid == 0)
799 		return;
800 	if (sip->si_flags & SI_COLL) {
801 		nselcoll++;
802 		sip->si_flags &= ~SI_COLL;
803 		wakeup(&selwait);
804 	}
805 	p = tfind(sip->si_seltid);
806 	sip->si_seltid = 0;
807 	if (p != NULL) {
808 		SCHED_LOCK(s);
809 		if (p->p_wchan == (caddr_t)&selwait) {
810 			if (p->p_stat == SSLEEP)
811 				setrunnable(p);
812 			else
813 				unsleep(p);
814 		} else if (p->p_flag & P_SELECT)
815 			atomic_clearbits_int(&p->p_flag, P_SELECT);
816 		SCHED_UNLOCK(s);
817 	}
818 }
819 
820 void
821 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
822 {
823 	struct filedesc *fdp = p->p_fd;
824 	struct file *fp;
825 	u_int i;
826 	int n = 0;
827 
828 	for (i = 0; i < nfd; i++, pl++) {
829 		/* Check the file descriptor. */
830 		if (pl->fd < 0) {
831 			pl->revents = 0;
832 			continue;
833 		}
834 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
835 			pl->revents = POLLNVAL;
836 			n++;
837 			continue;
838 		}
839 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
840 		FRELE(fp, p);
841 		if (pl->revents != 0)
842 			n++;
843 	}
844 	*retval = n;
845 }
846 
847 /*
848  * Only copyout the revents field.
849  */
850 int
851 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
852 {
853 	int error = 0;
854 	u_int i = 0;
855 
856 	while (!error && i++ < nfds) {
857 		error = copyout(&pl->revents, &upl->revents,
858 		    sizeof(upl->revents));
859 		pl++;
860 		upl++;
861 	}
862 
863 	return (error);
864 }
865 
866 /*
867  * We are using the same mechanism as select only we encode/decode args
868  * differently.
869  */
870 int
871 sys_poll(struct proc *p, void *v, register_t *retval)
872 {
873 	struct sys_poll_args /* {
874 		syscallarg(struct pollfd *) fds;
875 		syscallarg(u_int) nfds;
876 		syscallarg(int) timeout;
877 	} */ *uap = v;
878 
879 	struct timespec ts, *tsp = NULL;
880 	int msec = SCARG(uap, timeout);
881 
882 	if (msec != INFTIM) {
883 		if (msec < 0)
884 			return (EINVAL);
885 		ts.tv_sec = msec / 1000;
886 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
887 		tsp = &ts;
888 	}
889 
890 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
891 	    retval));
892 }
893 
894 int
895 sys_ppoll(struct proc *p, void *v, register_t *retval)
896 {
897 	struct sys_ppoll_args /* {
898 		syscallarg(struct pollfd *) fds;
899 		syscallarg(u_int) nfds;
900 		syscallarg(const struct timespec *) ts;
901 		syscallarg(const sigset_t *) mask;
902 	} */ *uap = v;
903 
904 	int error;
905 	struct timespec ts, *tsp = NULL;
906 	sigset_t ss, *ssp = NULL;
907 
908 	if (SCARG(uap, ts) != NULL) {
909 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
910 			return (error);
911 		if ((error = timespecfix(&ts)) != 0)
912 			return (error);
913 #ifdef KTRACE
914 		if (KTRPOINT(p, KTR_STRUCT))
915 			ktrreltimespec(p, &ts);
916 #endif
917 		tsp = &ts;
918 	}
919 
920 	if (SCARG(uap, mask) != NULL) {
921 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
922 			return (error);
923 		ssp = &ss;
924 	}
925 
926 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
927 	    retval));
928 }
929 
930 int
931 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
932     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
933 {
934 	size_t sz;
935 	struct pollfd pfds[4], *pl = pfds;
936 	struct timespec ats, rts, tts;
937 	int timo, ncoll, i, s, error;
938 
939 	/* Standards say no more than MAX_OPEN; this is possibly better. */
940 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
941 		return (EINVAL);
942 
943 	/* optimize for the default case, of a small nfds value */
944 	if (nfds > nitems(pfds)) {
945 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
946 		    M_WAITOK | M_CANFAIL);
947 		if (pl == NULL)
948 			return (EINVAL);
949 	}
950 
951 	sz = nfds * sizeof(*pl);
952 
953 	if ((error = copyin(fds, pl, sz)) != 0)
954 		goto bad;
955 
956 	for (i = 0; i < nfds; i++) {
957 		pl[i].events &= ~POLL_NOHUP;
958 		pl[i].revents = 0;
959 	}
960 
961 	if (tsp != NULL) {
962 		getnanouptime(&rts);
963 		timespecadd(tsp, &rts, &ats);
964 	} else {
965 		ats.tv_sec = 0;
966 		ats.tv_nsec = 0;
967 	}
968 	timo = 0;
969 
970 	if (sigmask)
971 		dosigsuspend(p, *sigmask &~ sigcantmask);
972 
973 retry:
974 	ncoll = nselcoll;
975 	atomic_setbits_int(&p->p_flag, P_SELECT);
976 	pollscan(p, pl, nfds, retval);
977 	if (*retval)
978 		goto done;
979 	if (tsp != NULL) {
980 		getnanouptime(&rts);
981 		if (timespeccmp(&rts, &ats, >=))
982 			goto done;
983 		timespecsub(&ats, &rts, &tts);
984 		timo = tts.tv_sec > 24 * 60 * 60 ?
985 			24 * 60 * 60 * hz : tstohz(&tts);
986 	}
987 	s = splhigh();
988 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
989 		splx(s);
990 		goto retry;
991 	}
992 	atomic_clearbits_int(&p->p_flag, P_SELECT);
993 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
994 	splx(s);
995 	if (error == 0)
996 		goto retry;
997 
998 done:
999 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1000 	/*
1001 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1002 	 *       ignored (since the whole point is to see what would block).
1003 	 */
1004 	switch (error) {
1005 	case ERESTART:
1006 		error = pollout(pl, fds, nfds);
1007 		if (error == 0)
1008 			error = EINTR;
1009 		break;
1010 	case EWOULDBLOCK:
1011 	case 0:
1012 		error = pollout(pl, fds, nfds);
1013 		break;
1014 	}
1015 #ifdef KTRACE
1016 	if (KTRPOINT(p, KTR_STRUCT))
1017 		ktrpollfd(p, pl, nfds);
1018 #endif /* KTRACE */
1019 bad:
1020 	if (pl != pfds)
1021 		free(pl, M_TEMP, sz);
1022 	return (error);
1023 }
1024 
1025 /*
1026  * utrace system call
1027  */
1028 int
1029 sys_utrace(struct proc *curp, void *v, register_t *retval)
1030 {
1031 #ifdef KTRACE
1032 	struct sys_utrace_args /* {
1033 		syscallarg(const char *) label;
1034 		syscallarg(const void *) addr;
1035 		syscallarg(size_t) len;
1036 	} */ *uap = v;
1037 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1038 	    SCARG(uap, len)));
1039 #else
1040 	return (0);
1041 #endif
1042 }
1043