xref: /openbsd-src/sys/kern/sys_generic.c (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*	$OpenBSD: sys_generic.c,v 1.116 2018/01/02 06:38:45 guenther Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/resourcevar.h>
49 #include <sys/socketvar.h>
50 #include <sys/signalvar.h>
51 #include <sys/uio.h>
52 #include <sys/kernel.h>
53 #include <sys/stat.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #ifdef KTRACE
57 #include <sys/ktrace.h>
58 #endif
59 #include <sys/sched.h>
60 #include <sys/pledge.h>
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
68 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
69 int pollout(struct pollfd *, struct pollfd *, u_int);
70 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
71     const struct timespec *, const sigset_t *, register_t *);
72 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
73     const sigset_t *, register_t *);
74 
75 /*
76  * Read system call.
77  */
78 int
79 sys_read(struct proc *p, void *v, register_t *retval)
80 {
81 	struct sys_read_args /* {
82 		syscallarg(int) fd;
83 		syscallarg(void *) buf;
84 		syscallarg(size_t) nbyte;
85 	} */ *uap = v;
86 	struct iovec iov;
87 	int fd = SCARG(uap, fd);
88 	struct file *fp;
89 	struct filedesc *fdp = p->p_fd;
90 
91 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
92 		return (EBADF);
93 
94 	iov.iov_base = SCARG(uap, buf);
95 	iov.iov_len = SCARG(uap, nbyte);
96 
97 	FREF(fp);
98 
99 	/* dofilereadv() will FRELE the descriptor for us */
100 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
101 }
102 
103 /*
104  * Scatter read system call.
105  */
106 int
107 sys_readv(struct proc *p, void *v, register_t *retval)
108 {
109 	struct sys_readv_args /* {
110 		syscallarg(int) fd;
111 		syscallarg(const struct iovec *) iovp;
112 		syscallarg(int) iovcnt;
113 	} */ *uap = v;
114 	int fd = SCARG(uap, fd);
115 	struct file *fp;
116 	struct filedesc *fdp = p->p_fd;
117 
118 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
119 		return (EBADF);
120 	FREF(fp);
121 
122 	/* dofilereadv() will FRELE the descriptor for us */
123 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
124 	    &fp->f_offset, retval));
125 }
126 
127 int
128 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
129     int iovcnt, int userspace, off_t *offset, register_t *retval)
130 {
131 	struct iovec aiov[UIO_SMALLIOV];
132 	struct uio auio;
133 	struct iovec *iov;
134 	struct iovec *needfree = NULL;
135 	long i, cnt, error = 0;
136 	u_int iovlen;
137 #ifdef KTRACE
138 	struct iovec *ktriov = NULL;
139 #endif
140 
141 	/* note: can't use iovlen until iovcnt is validated */
142 	iovlen = iovcnt * sizeof(struct iovec);
143 
144 	/*
145 	 * If the iovec array exists in userspace, it needs to be copied in;
146 	 * otherwise, it can be used directly.
147 	 */
148 	if (userspace) {
149 		if ((u_int)iovcnt > UIO_SMALLIOV) {
150 			if ((u_int)iovcnt > IOV_MAX) {
151 				error = EINVAL;
152 				goto out;
153 			}
154 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
155 		} else if ((u_int)iovcnt > 0) {
156 			iov = aiov;
157 			needfree = NULL;
158 		} else {
159 			error = EINVAL;
160 			goto out;
161 		}
162 		if ((error = copyin(iovp, iov, iovlen)))
163 			goto done;
164 #ifdef KTRACE
165 		if (KTRPOINT(p, KTR_STRUCT))
166 			ktriovec(p, iov, iovcnt);
167 #endif
168 	} else {
169 		iov = (struct iovec *)iovp;		/* de-constify */
170 	}
171 
172 	auio.uio_iov = iov;
173 	auio.uio_iovcnt = iovcnt;
174 	auio.uio_rw = UIO_READ;
175 	auio.uio_segflg = UIO_USERSPACE;
176 	auio.uio_procp = p;
177 	auio.uio_resid = 0;
178 	for (i = 0; i < iovcnt; i++) {
179 		auio.uio_resid += iov->iov_len;
180 		/*
181 		 * Reads return ssize_t because -1 is returned on error.
182 		 * Therefore we must restrict the length to SSIZE_MAX to
183 		 * avoid garbage return values.  Note that the addition is
184 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
185 		 */
186 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
187 			error = EINVAL;
188 			goto done;
189 		}
190 		iov++;
191 	}
192 #ifdef KTRACE
193 	/*
194 	 * if tracing, save a copy of iovec
195 	 */
196 	if (KTRPOINT(p, KTR_GENIO)) {
197 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
198 		memcpy(ktriov, auio.uio_iov, iovlen);
199 	}
200 #endif
201 	cnt = auio.uio_resid;
202 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
203 	if (error)
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 
209 	fp->f_rxfer++;
210 	fp->f_rbytes += cnt;
211 #ifdef KTRACE
212 	if (ktriov != NULL) {
213 		if (error == 0)
214 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
215 		free(ktriov, M_TEMP, iovlen);
216 	}
217 #endif
218 	*retval = cnt;
219  done:
220 	if (needfree)
221 		free(needfree, M_IOV, iovlen);
222  out:
223 	FRELE(fp, p);
224 	return (error);
225 }
226 
227 /*
228  * Write system call
229  */
230 int
231 sys_write(struct proc *p, void *v, register_t *retval)
232 {
233 	struct sys_write_args /* {
234 		syscallarg(int) fd;
235 		syscallarg(const void *) buf;
236 		syscallarg(size_t) nbyte;
237 	} */ *uap = v;
238 	struct iovec iov;
239 	int fd = SCARG(uap, fd);
240 	struct file *fp;
241 	struct filedesc *fdp = p->p_fd;
242 
243 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
244 		return (EBADF);
245 
246 	iov.iov_base = (void *)SCARG(uap, buf);
247 	iov.iov_len = SCARG(uap, nbyte);
248 
249 	FREF(fp);
250 
251 	/* dofilewritev() will FRELE the descriptor for us */
252 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
253 }
254 
255 /*
256  * Gather write system call
257  */
258 int
259 sys_writev(struct proc *p, void *v, register_t *retval)
260 {
261 	struct sys_writev_args /* {
262 		syscallarg(int) fd;
263 		syscallarg(const struct iovec *) iovp;
264 		syscallarg(int) iovcnt;
265 	} */ *uap = v;
266 	int fd = SCARG(uap, fd);
267 	struct file *fp;
268 	struct filedesc *fdp = p->p_fd;
269 
270 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
271 		return (EBADF);
272 	FREF(fp);
273 
274 	/* dofilewritev() will FRELE the descriptor for us */
275 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
276 	    &fp->f_offset, retval));
277 }
278 
279 int
280 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
281     int iovcnt, int userspace, off_t *offset, register_t *retval)
282 {
283 	struct iovec aiov[UIO_SMALLIOV];
284 	struct uio auio;
285 	struct iovec *iov;
286 	struct iovec *needfree = NULL;
287 	long i, cnt, error = 0;
288 	u_int iovlen;
289 #ifdef KTRACE
290 	struct iovec *ktriov = NULL;
291 #endif
292 
293 	/* note: can't use iovlen until iovcnt is validated */
294 	iovlen = iovcnt * sizeof(struct iovec);
295 
296 	/*
297 	 * If the iovec array exists in userspace, it needs to be copied in;
298 	 * otherwise, it can be used directly.
299 	 */
300 	if (userspace) {
301 		if ((u_int)iovcnt > UIO_SMALLIOV) {
302 			if ((u_int)iovcnt > IOV_MAX) {
303 				error = EINVAL;
304 				goto out;
305 			}
306 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
307 		} else if ((u_int)iovcnt > 0) {
308 			iov = aiov;
309 			needfree = NULL;
310 		} else {
311 			error = EINVAL;
312 			goto out;
313 		}
314 		if ((error = copyin(iovp, iov, iovlen)))
315 			goto done;
316 #ifdef KTRACE
317 		if (KTRPOINT(p, KTR_STRUCT))
318 			ktriovec(p, iov, iovcnt);
319 #endif
320 	} else {
321 		iov = (struct iovec *)iovp;		/* de-constify */
322 	}
323 
324 	auio.uio_iov = iov;
325 	auio.uio_iovcnt = iovcnt;
326 	auio.uio_rw = UIO_WRITE;
327 	auio.uio_segflg = UIO_USERSPACE;
328 	auio.uio_procp = p;
329 	auio.uio_resid = 0;
330 	for (i = 0; i < iovcnt; i++) {
331 		auio.uio_resid += iov->iov_len;
332 		/*
333 		 * Writes return ssize_t because -1 is returned on error.
334 		 * Therefore we must restrict the length to SSIZE_MAX to
335 		 * avoid garbage return values.  Note that the addition is
336 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
337 		 */
338 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
339 			error = EINVAL;
340 			goto done;
341 		}
342 		iov++;
343 	}
344 #ifdef KTRACE
345 	/*
346 	 * if tracing, save a copy of iovec
347 	 */
348 	if (KTRPOINT(p, KTR_GENIO)) {
349 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
350 		memcpy(ktriov, auio.uio_iov, iovlen);
351 	}
352 #endif
353 	cnt = auio.uio_resid;
354 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
355 	if (error) {
356 		if (auio.uio_resid != cnt && (error == ERESTART ||
357 		    error == EINTR || error == EWOULDBLOCK))
358 			error = 0;
359 		if (error == EPIPE)
360 			ptsignal(p, SIGPIPE, STHREAD);
361 	}
362 	cnt -= auio.uio_resid;
363 
364 	fp->f_wxfer++;
365 	fp->f_wbytes += cnt;
366 #ifdef KTRACE
367 	if (ktriov != NULL) {
368 		if (error == 0)
369 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
370 		free(ktriov, M_TEMP, iovlen);
371 	}
372 #endif
373 	*retval = cnt;
374  done:
375 	if (needfree)
376 		free(needfree, M_IOV, iovlen);
377  out:
378 	FRELE(fp, p);
379 	return (error);
380 }
381 
382 /*
383  * Ioctl system call
384  */
385 int
386 sys_ioctl(struct proc *p, void *v, register_t *retval)
387 {
388 	struct sys_ioctl_args /* {
389 		syscallarg(int) fd;
390 		syscallarg(u_long) com;
391 		syscallarg(void *) data;
392 	} */ *uap = v;
393 	struct file *fp;
394 	struct filedesc *fdp;
395 	u_long com = SCARG(uap, com);
396 	int error;
397 	u_int size;
398 	caddr_t data, memp;
399 	int tmp;
400 #define STK_PARAMS	128
401 	long long stkbuf[STK_PARAMS / sizeof(long long)];
402 
403 	fdp = p->p_fd;
404 	fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE);
405 
406 	if (fp == NULL)
407 		return (EBADF);
408 
409 	if (fp->f_type == DTYPE_SOCKET) {
410 		struct socket *so = fp->f_data;
411 
412 		if (so->so_state & SS_DNS)
413 			return (EINVAL);
414 	}
415 
416 	error = pledge_ioctl(p, com, fp);
417 	if (error)
418 		return (error);
419 
420 	switch (com) {
421 	case FIONCLEX:
422 	case FIOCLEX:
423 		fdplock(fdp);
424 		if (com == FIONCLEX)
425 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
426 		else
427 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
428 		fdpunlock(fdp);
429 		return (0);
430 	}
431 
432 	/*
433 	 * Interpret high order word to find amount of data to be
434 	 * copied to/from the user's address space.
435 	 */
436 	size = IOCPARM_LEN(com);
437 	if (size > IOCPARM_MAX)
438 		return (ENOTTY);
439 	FREF(fp);
440 	memp = NULL;
441 	if (size > sizeof (stkbuf)) {
442 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
443 		data = memp;
444 	} else
445 		data = (caddr_t)stkbuf;
446 	if (com&IOC_IN) {
447 		if (size) {
448 			error = copyin(SCARG(uap, data), data, size);
449 			if (error) {
450 				goto out;
451 			}
452 		} else
453 			*(caddr_t *)data = SCARG(uap, data);
454 	} else if ((com&IOC_OUT) && size)
455 		/*
456 		 * Zero the buffer so the user always
457 		 * gets back something deterministic.
458 		 */
459 		memset(data, 0, size);
460 	else if (com&IOC_VOID)
461 		*(caddr_t *)data = SCARG(uap, data);
462 
463 	switch (com) {
464 
465 	case FIONBIO:
466 		if ((tmp = *(int *)data) != 0)
467 			fp->f_flag |= FNONBLOCK;
468 		else
469 			fp->f_flag &= ~FNONBLOCK;
470 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
471 		break;
472 
473 	case FIOASYNC:
474 		if ((tmp = *(int *)data) != 0)
475 			fp->f_flag |= FASYNC;
476 		else
477 			fp->f_flag &= ~FASYNC;
478 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
479 		break;
480 
481 	case FIOSETOWN:
482 		tmp = *(int *)data;
483 		if (fp->f_type == DTYPE_SOCKET) {
484 			struct socket *so = fp->f_data;
485 
486 			so->so_pgid = tmp;
487 			so->so_siguid = p->p_ucred->cr_ruid;
488 			so->so_sigeuid = p->p_ucred->cr_uid;
489 			error = 0;
490 			break;
491 		}
492 		if (tmp <= 0) {
493 			tmp = -tmp;
494 		} else {
495 			struct process *pr = prfind(tmp);
496 			if (pr == NULL) {
497 				error = ESRCH;
498 				break;
499 			}
500 			tmp = pr->ps_pgrp->pg_id;
501 		}
502 		error = (*fp->f_ops->fo_ioctl)
503 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
504 		break;
505 
506 	case FIOGETOWN:
507 		if (fp->f_type == DTYPE_SOCKET) {
508 			error = 0;
509 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
510 			break;
511 		}
512 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
513 		*(int *)data = -*(int *)data;
514 		break;
515 
516 	default:
517 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
518 		break;
519 	}
520 	/*
521 	 * Copy any data to user, size was
522 	 * already set and checked above.
523 	 */
524 	if (error == 0 && (com&IOC_OUT) && size)
525 		error = copyout(data, SCARG(uap, data), size);
526 out:
527 	FRELE(fp, p);
528 	if (memp)
529 		free(memp, M_IOCTLOPS, size);
530 	return (error);
531 }
532 
/*
 * selwait's address is the wait channel select/poll sleepers block on;
 * nselcoll is bumped by selwakeup() whenever a collision is resolved.
 */
int	selwait;
int	nselcoll;
534 
535 /*
536  * Select system call.
537  */
538 int
539 sys_select(struct proc *p, void *v, register_t *retval)
540 {
541 	struct sys_select_args /* {
542 		syscallarg(int) nd;
543 		syscallarg(fd_set *) in;
544 		syscallarg(fd_set *) ou;
545 		syscallarg(fd_set *) ex;
546 		syscallarg(struct timeval *) tv;
547 	} */ *uap = v;
548 
549 	struct timespec ts, *tsp = NULL;
550 	int error;
551 
552 	if (SCARG(uap, tv) != NULL) {
553 		struct timeval tv;
554 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
555 			return (error);
556 		if ((error = itimerfix(&tv)) != 0)
557 			return (error);
558 #ifdef KTRACE
559 		if (KTRPOINT(p, KTR_STRUCT))
560 			ktrreltimeval(p, &tv);
561 #endif
562 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
563 		tsp = &ts;
564 	}
565 
566 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
567 	    SCARG(uap, ex), tsp, NULL, retval));
568 }
569 
570 int
571 sys_pselect(struct proc *p, void *v, register_t *retval)
572 {
573 	struct sys_pselect_args /* {
574 		syscallarg(int) nd;
575 		syscallarg(fd_set *) in;
576 		syscallarg(fd_set *) ou;
577 		syscallarg(fd_set *) ex;
578 		syscallarg(const struct timespec *) ts;
579 		syscallarg(const sigset_t *) mask;
580 	} */ *uap = v;
581 
582 	struct timespec ts, *tsp = NULL;
583 	sigset_t ss, *ssp = NULL;
584 	int error;
585 
586 	if (SCARG(uap, ts) != NULL) {
587 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
588 			return (error);
589 		if ((error = timespecfix(&ts)) != 0)
590 			return (error);
591 #ifdef KTRACE
592 		if (KTRPOINT(p, KTR_STRUCT))
593 			ktrreltimespec(p, &ts);
594 #endif
595 		tsp = &ts;
596 	}
597 	if (SCARG(uap, mask) != NULL) {
598 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
599 			return (error);
600 		ssp = &ss;
601 	}
602 
603 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
604 	    SCARG(uap, ex), tsp, ssp, retval));
605 }
606 
607 int
608 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
609     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
610 {
611 	fd_mask bits[6];
612 	fd_set *pibits[3], *pobits[3];
613 	struct timespec ats, rts, tts;
614 	int s, ncoll, error = 0, timo;
615 	u_int ni;
616 
617 	if (nd < 0)
618 		return (EINVAL);
619 	if (nd > p->p_fd->fd_nfiles) {
620 		/* forgiving; slightly wrong */
621 		nd = p->p_fd->fd_nfiles;
622 	}
623 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
624 	if (ni > sizeof(bits[0])) {
625 		caddr_t mbits;
626 
627 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
628 		pibits[0] = (fd_set *)&mbits[ni * 0];
629 		pibits[1] = (fd_set *)&mbits[ni * 1];
630 		pibits[2] = (fd_set *)&mbits[ni * 2];
631 		pobits[0] = (fd_set *)&mbits[ni * 3];
632 		pobits[1] = (fd_set *)&mbits[ni * 4];
633 		pobits[2] = (fd_set *)&mbits[ni * 5];
634 	} else {
635 		memset(bits, 0, sizeof(bits));
636 		pibits[0] = (fd_set *)&bits[0];
637 		pibits[1] = (fd_set *)&bits[1];
638 		pibits[2] = (fd_set *)&bits[2];
639 		pobits[0] = (fd_set *)&bits[3];
640 		pobits[1] = (fd_set *)&bits[4];
641 		pobits[2] = (fd_set *)&bits[5];
642 	}
643 
644 #define	getbits(name, x) \
645 	if (name && (error = copyin(name, pibits[x], ni))) \
646 		goto done;
647 	getbits(in, 0);
648 	getbits(ou, 1);
649 	getbits(ex, 2);
650 #undef	getbits
651 #ifdef KTRACE
652 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
653 		if (in) ktrfdset(p, pibits[0], ni);
654 		if (ou) ktrfdset(p, pibits[1], ni);
655 		if (ex) ktrfdset(p, pibits[2], ni);
656 	}
657 #endif
658 
659 	if (tsp) {
660 		getnanouptime(&rts);
661 		timespecadd(tsp, &rts, &ats);
662 	} else {
663 		ats.tv_sec = 0;
664 		ats.tv_nsec = 0;
665 	}
666 	timo = 0;
667 
668 	if (sigmask)
669 		dosigsuspend(p, *sigmask &~ sigcantmask);
670 
671 retry:
672 	ncoll = nselcoll;
673 	atomic_setbits_int(&p->p_flag, P_SELECT);
674 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
675 	if (error || *retval)
676 		goto done;
677 	if (tsp) {
678 		getnanouptime(&rts);
679 		if (timespeccmp(&rts, &ats, >=))
680 			goto done;
681 		timespecsub(&ats, &rts, &tts);
682 		timo = tts.tv_sec > 24 * 60 * 60 ?
683 			24 * 60 * 60 * hz : tstohz(&tts);
684 	}
685 	s = splhigh();
686 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
687 		splx(s);
688 		goto retry;
689 	}
690 	atomic_clearbits_int(&p->p_flag, P_SELECT);
691 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
692 	splx(s);
693 	if (error == 0)
694 		goto retry;
695 done:
696 	atomic_clearbits_int(&p->p_flag, P_SELECT);
697 	/* select is not restarted after signals... */
698 	if (error == ERESTART)
699 		error = EINTR;
700 	if (error == EWOULDBLOCK)
701 		error = 0;
702 #define	putbits(name, x) \
703 	if (name && (error2 = copyout(pobits[x], name, ni))) \
704 		error = error2;
705 	if (error == 0) {
706 		int error2;
707 
708 		putbits(in, 0);
709 		putbits(ou, 1);
710 		putbits(ex, 2);
711 #undef putbits
712 #ifdef KTRACE
713 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
714 			if (in) ktrfdset(p, pobits[0], ni);
715 			if (ou) ktrfdset(p, pobits[1], ni);
716 			if (ex) ktrfdset(p, pobits[2], ni);
717 		}
718 #endif
719 	}
720 
721 	if (pibits[0] != (fd_set *)&bits[0])
722 		free(pibits[0], M_TEMP, 6 * ni);
723 	return (error);
724 }
725 
726 int
727 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
728     register_t *retval)
729 {
730 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
731 	struct filedesc *fdp = p->p_fd;
732 	int msk, i, j, fd;
733 	fd_mask bits;
734 	struct file *fp;
735 	int n = 0;
736 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
737 
738 	for (msk = 0; msk < 3; msk++) {
739 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
740 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
741 
742 		for (i = 0; i < nfd; i += NFDBITS) {
743 			bits = pibits->fds_bits[i/NFDBITS];
744 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
745 				bits &= ~(1 << j);
746 				if ((fp = fd_getfile(fdp, fd)) == NULL)
747 					return (EBADF);
748 				FREF(fp);
749 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
750 					FD_SET(fd, pobits);
751 					n++;
752 				}
753 				FRELE(fp, p);
754 			}
755 		}
756 	}
757 	*retval = n;
758 	return (0);
759 }
760 
/*
 * Poll routine that reports every ordinary read and write event that was
 * asked for; dev and p are ignored.
 */
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always);
}
767 
/*
 * Poll routine that never reports any event; all arguments are ignored.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	(void)dev;
	(void)events;
	(void)p;

	return (0);
}
774 
775 /*
776  * Record a select request.
777  */
778 void
779 selrecord(struct proc *selector, struct selinfo *sip)
780 {
781 	struct proc *p;
782 	pid_t mytid;
783 
784 	mytid = selector->p_tid;
785 	if (sip->si_seltid == mytid)
786 		return;
787 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
788 	    p->p_wchan == (caddr_t)&selwait)
789 		sip->si_flags |= SI_COLL;
790 	else
791 		sip->si_seltid = mytid;
792 }
793 
794 /*
795  * Do a wakeup when a selectable event occurs.
796  */
797 void
798 selwakeup(struct selinfo *sip)
799 {
800 	struct proc *p;
801 	int s;
802 
803 	KNOTE(&sip->si_note, NOTE_SUBMIT);
804 	if (sip->si_seltid == 0)
805 		return;
806 	if (sip->si_flags & SI_COLL) {
807 		nselcoll++;
808 		sip->si_flags &= ~SI_COLL;
809 		wakeup(&selwait);
810 	}
811 	p = tfind(sip->si_seltid);
812 	sip->si_seltid = 0;
813 	if (p != NULL) {
814 		SCHED_LOCK(s);
815 		if (p->p_wchan == (caddr_t)&selwait) {
816 			if (p->p_stat == SSLEEP)
817 				setrunnable(p);
818 			else
819 				unsleep(p);
820 		} else if (p->p_flag & P_SELECT)
821 			atomic_clearbits_int(&p->p_flag, P_SELECT);
822 		SCHED_UNLOCK(s);
823 	}
824 }
825 
826 void
827 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
828 {
829 	struct filedesc *fdp = p->p_fd;
830 	struct file *fp;
831 	u_int i;
832 	int n = 0;
833 
834 	for (i = 0; i < nfd; i++, pl++) {
835 		/* Check the file descriptor. */
836 		if (pl->fd < 0) {
837 			pl->revents = 0;
838 			continue;
839 		}
840 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
841 			pl->revents = POLLNVAL;
842 			n++;
843 			continue;
844 		}
845 		FREF(fp);
846 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
847 		FRELE(fp, p);
848 		if (pl->revents != 0)
849 			n++;
850 	}
851 	*retval = n;
852 }
853 
854 /*
855  * Only copyout the revents field.
856  */
857 int
858 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
859 {
860 	int error = 0;
861 	u_int i = 0;
862 
863 	while (!error && i++ < nfds) {
864 		error = copyout(&pl->revents, &upl->revents,
865 		    sizeof(upl->revents));
866 		pl++;
867 		upl++;
868 	}
869 
870 	return (error);
871 }
872 
873 /*
874  * We are using the same mechanism as select only we encode/decode args
875  * differently.
876  */
877 int
878 sys_poll(struct proc *p, void *v, register_t *retval)
879 {
880 	struct sys_poll_args /* {
881 		syscallarg(struct pollfd *) fds;
882 		syscallarg(u_int) nfds;
883 		syscallarg(int) timeout;
884 	} */ *uap = v;
885 
886 	struct timespec ts, *tsp = NULL;
887 	int msec = SCARG(uap, timeout);
888 
889 	if (msec != INFTIM) {
890 		if (msec < 0)
891 			return (EINVAL);
892 		ts.tv_sec = msec / 1000;
893 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
894 		tsp = &ts;
895 	}
896 
897 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
898 	    retval));
899 }
900 
901 int
902 sys_ppoll(struct proc *p, void *v, register_t *retval)
903 {
904 	struct sys_ppoll_args /* {
905 		syscallarg(struct pollfd *) fds;
906 		syscallarg(u_int) nfds;
907 		syscallarg(const struct timespec *) ts;
908 		syscallarg(const sigset_t *) mask;
909 	} */ *uap = v;
910 
911 	int error;
912 	struct timespec ts, *tsp = NULL;
913 	sigset_t ss, *ssp = NULL;
914 
915 	if (SCARG(uap, ts) != NULL) {
916 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
917 			return (error);
918 		if ((error = timespecfix(&ts)) != 0)
919 			return (error);
920 #ifdef KTRACE
921 		if (KTRPOINT(p, KTR_STRUCT))
922 			ktrreltimespec(p, &ts);
923 #endif
924 		tsp = &ts;
925 	}
926 
927 	if (SCARG(uap, mask) != NULL) {
928 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
929 			return (error);
930 		ssp = &ss;
931 	}
932 
933 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
934 	    retval));
935 }
936 
937 int
938 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
939     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
940 {
941 	size_t sz;
942 	struct pollfd pfds[4], *pl = pfds;
943 	struct timespec ats, rts, tts;
944 	int timo, ncoll, i, s, error;
945 
946 	/* Standards say no more than MAX_OPEN; this is possibly better. */
947 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
948 		return (EINVAL);
949 
950 	/* optimize for the default case, of a small nfds value */
951 	if (nfds > nitems(pfds)) {
952 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
953 		    M_WAITOK | M_CANFAIL);
954 		if (pl == NULL)
955 			return (EINVAL);
956 	}
957 
958 	sz = nfds * sizeof(*pl);
959 
960 	if ((error = copyin(fds, pl, sz)) != 0)
961 		goto bad;
962 
963 	for (i = 0; i < nfds; i++) {
964 		pl[i].events &= ~POLL_NOHUP;
965 		pl[i].revents = 0;
966 	}
967 
968 	if (tsp != NULL) {
969 		getnanouptime(&rts);
970 		timespecadd(tsp, &rts, &ats);
971 	} else {
972 		ats.tv_sec = 0;
973 		ats.tv_nsec = 0;
974 	}
975 	timo = 0;
976 
977 	if (sigmask)
978 		dosigsuspend(p, *sigmask &~ sigcantmask);
979 
980 retry:
981 	ncoll = nselcoll;
982 	atomic_setbits_int(&p->p_flag, P_SELECT);
983 	pollscan(p, pl, nfds, retval);
984 	if (*retval)
985 		goto done;
986 	if (tsp != NULL) {
987 		getnanouptime(&rts);
988 		if (timespeccmp(&rts, &ats, >=))
989 			goto done;
990 		timespecsub(&ats, &rts, &tts);
991 		timo = tts.tv_sec > 24 * 60 * 60 ?
992 			24 * 60 * 60 * hz : tstohz(&tts);
993 	}
994 	s = splhigh();
995 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
996 		splx(s);
997 		goto retry;
998 	}
999 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1000 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
1001 	splx(s);
1002 	if (error == 0)
1003 		goto retry;
1004 
1005 done:
1006 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1007 	/*
1008 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1009 	 *       ignored (since the whole point is to see what would block).
1010 	 */
1011 	switch (error) {
1012 	case ERESTART:
1013 		error = pollout(pl, fds, nfds);
1014 		if (error == 0)
1015 			error = EINTR;
1016 		break;
1017 	case EWOULDBLOCK:
1018 	case 0:
1019 		error = pollout(pl, fds, nfds);
1020 		break;
1021 	}
1022 #ifdef KTRACE
1023 	if (KTRPOINT(p, KTR_STRUCT))
1024 		ktrpollfd(p, pl, nfds);
1025 #endif /* KTRACE */
1026 bad:
1027 	if (pl != pfds)
1028 		free(pl, M_TEMP, sz);
1029 	return (error);
1030 }
1031 
1032 /*
1033  * utrace system call
1034  */
1035 int
1036 sys_utrace(struct proc *curp, void *v, register_t *retval)
1037 {
1038 #ifdef KTRACE
1039 	struct sys_utrace_args /* {
1040 		syscallarg(const char *) label;
1041 		syscallarg(const void *) addr;
1042 		syscallarg(size_t) len;
1043 	} */ *uap = v;
1044 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1045 	    SCARG(uap, len)));
1046 #else
1047 	return (0);
1048 #endif
1049 }
1050