xref: /openbsd-src/sys/kern/sys_generic.c (revision 5b859c19fe53bbea08f5c342e0a4470e99f883e1)
1 /*	$OpenBSD: sys_generic.c,v 1.94 2014/11/03 03:08:00 deraadt Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
67 int pollout(struct pollfd *, struct pollfd *, u_int);
68 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
69     const struct timespec *, const sigset_t *, register_t *);
70 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
71     const sigset_t *, register_t *);
72 
73 /*
74  * Read system call.
75  */
76 /* ARGSUSED */
77 int
78 sys_read(struct proc *p, void *v, register_t *retval)
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	struct iovec iov;
86 	int fd = SCARG(uap, fd);
87 	struct file *fp;
88 	struct filedesc *fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 	if ((fp->f_flag & FREAD) == 0)
93 		return (EBADF);
94 
95 	iov.iov_base = SCARG(uap, buf);
96 	iov.iov_len = SCARG(uap, nbyte);
97 
98 	FREF(fp);
99 
100 	/* dofilereadv() will FRELE the descriptor for us */
101 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
102 }
103 
104 /*
105  * Scatter read system call.
106  */
107 int
108 sys_readv(struct proc *p, void *v, register_t *retval)
109 {
110 	struct sys_readv_args /* {
111 		syscallarg(int) fd;
112 		syscallarg(const struct iovec *) iovp;
113 		syscallarg(int) iovcnt;
114 	} */ *uap = v;
115 	int fd = SCARG(uap, fd);
116 	struct file *fp;
117 	struct filedesc *fdp = p->p_fd;
118 
119 	if ((fp = fd_getfile(fdp, fd)) == NULL)
120 		return (EBADF);
121 	if ((fp->f_flag & FREAD) == 0)
122 		return (EBADF);
123 
124 	FREF(fp);
125 
126 	/* dofilereadv() will FRELE the descriptor for us */
127 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
128 	    &fp->f_offset, retval));
129 }
130 
131 int
132 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
133     int iovcnt, int userspace, off_t *offset, register_t *retval)
134 {
135 	struct iovec aiov[UIO_SMALLIOV];
136 	struct uio auio;
137 	struct iovec *iov;
138 	struct iovec *needfree = NULL;
139 	long i, cnt, error = 0;
140 	u_int iovlen;
141 #ifdef KTRACE
142 	struct iovec *ktriov = NULL;
143 #endif
144 
145 	/* note: can't use iovlen until iovcnt is validated */
146 	iovlen = iovcnt * sizeof(struct iovec);
147 
148 	/*
149 	 * If the iovec array exists in userspace, it needs to be copied in;
150 	 * otherwise, it can be used directly.
151 	 */
152 	if (userspace) {
153 		if ((u_int)iovcnt > UIO_SMALLIOV) {
154 			if ((u_int)iovcnt > IOV_MAX) {
155 				error = EINVAL;
156 				goto out;
157 			}
158 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
159 		} else if ((u_int)iovcnt > 0) {
160 			iov = aiov;
161 			needfree = NULL;
162 		} else {
163 			error = EINVAL;
164 			goto out;
165 		}
166 		if ((error = copyin(iovp, iov, iovlen)))
167 			goto done;
168 	} else {
169 		iov = (struct iovec *)iovp;		/* de-constify */
170 	}
171 
172 	auio.uio_iov = iov;
173 	auio.uio_iovcnt = iovcnt;
174 	auio.uio_rw = UIO_READ;
175 	auio.uio_segflg = UIO_USERSPACE;
176 	auio.uio_procp = p;
177 	auio.uio_resid = 0;
178 	for (i = 0; i < iovcnt; i++) {
179 		auio.uio_resid += iov->iov_len;
180 		/*
181 		 * Reads return ssize_t because -1 is returned on error.
182 		 * Therefore we must restrict the length to SSIZE_MAX to
183 		 * avoid garbage return values.  Note that the addition is
184 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
185 		 */
186 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
187 			error = EINVAL;
188 			goto done;
189 		}
190 		iov++;
191 	}
192 #ifdef KTRACE
193 	/*
194 	 * if tracing, save a copy of iovec
195 	 */
196 	if (KTRPOINT(p, KTR_GENIO)) {
197 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
198 		bcopy(auio.uio_iov, ktriov, iovlen);
199 	}
200 #endif
201 	cnt = auio.uio_resid;
202 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
203 	if (error)
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 
209 	fp->f_rxfer++;
210 	fp->f_rbytes += cnt;
211 #ifdef KTRACE
212 	if (ktriov != NULL) {
213 		if (error == 0)
214 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
215 		free(ktriov, M_TEMP, iovlen);
216 	}
217 #endif
218 	*retval = cnt;
219  done:
220 	if (needfree)
221 		free(needfree, M_IOV, iovlen);
222  out:
223 	FRELE(fp, p);
224 	return (error);
225 }
226 
227 /*
228  * Write system call
229  */
230 int
231 sys_write(struct proc *p, void *v, register_t *retval)
232 {
233 	struct sys_write_args /* {
234 		syscallarg(int) fd;
235 		syscallarg(const void *) buf;
236 		syscallarg(size_t) nbyte;
237 	} */ *uap = v;
238 	struct iovec iov;
239 	int fd = SCARG(uap, fd);
240 	struct file *fp;
241 	struct filedesc *fdp = p->p_fd;
242 
243 	if ((fp = fd_getfile(fdp, fd)) == NULL)
244 		return (EBADF);
245 	if ((fp->f_flag & FWRITE) == 0)
246 		return (EBADF);
247 
248 	iov.iov_base = (void *)SCARG(uap, buf);
249 	iov.iov_len = SCARG(uap, nbyte);
250 
251 	FREF(fp);
252 
253 	/* dofilewritev() will FRELE the descriptor for us */
254 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
255 }
256 
257 /*
258  * Gather write system call
259  */
260 int
261 sys_writev(struct proc *p, void *v, register_t *retval)
262 {
263 	struct sys_writev_args /* {
264 		syscallarg(int) fd;
265 		syscallarg(const struct iovec *) iovp;
266 		syscallarg(int) iovcnt;
267 	} */ *uap = v;
268 	int fd = SCARG(uap, fd);
269 	struct file *fp;
270 	struct filedesc *fdp = p->p_fd;
271 
272 	if ((fp = fd_getfile(fdp, fd)) == NULL)
273 		return (EBADF);
274 	if ((fp->f_flag & FWRITE) == 0)
275 		return (EBADF);
276 
277 	FREF(fp);
278 
279 	/* dofilewritev() will FRELE the descriptor for us */
280 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
281 	    &fp->f_offset, retval));
282 }
283 
284 int
285 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
286     int iovcnt, int userspace, off_t *offset, register_t *retval)
287 {
288 	struct iovec aiov[UIO_SMALLIOV];
289 	struct uio auio;
290 	struct iovec *iov;
291 	struct iovec *needfree = NULL;
292 	long i, cnt, error = 0;
293 	u_int iovlen;
294 #ifdef KTRACE
295 	struct iovec *ktriov = NULL;
296 #endif
297 
298 	/* note: can't use iovlen until iovcnt is validated */
299 	iovlen = iovcnt * sizeof(struct iovec);
300 
301 	/*
302 	 * If the iovec array exists in userspace, it needs to be copied in;
303 	 * otherwise, it can be used directly.
304 	 */
305 	if (userspace) {
306 		if ((u_int)iovcnt > UIO_SMALLIOV) {
307 			if ((u_int)iovcnt > IOV_MAX) {
308 				error = EINVAL;
309 				goto out;
310 			}
311 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
312 		} else if ((u_int)iovcnt > 0) {
313 			iov = aiov;
314 			needfree = NULL;
315 		} else {
316 			error = EINVAL;
317 			goto out;
318 		}
319 		if ((error = copyin(iovp, iov, iovlen)))
320 			goto done;
321 	} else {
322 		iov = (struct iovec *)iovp;		/* de-constify */
323 	}
324 
325 	auio.uio_iov = iov;
326 	auio.uio_iovcnt = iovcnt;
327 	auio.uio_rw = UIO_WRITE;
328 	auio.uio_segflg = UIO_USERSPACE;
329 	auio.uio_procp = p;
330 	auio.uio_resid = 0;
331 	for (i = 0; i < iovcnt; i++) {
332 		auio.uio_resid += iov->iov_len;
333 		/*
334 		 * Writes return ssize_t because -1 is returned on error.
335 		 * Therefore we must restrict the length to SSIZE_MAX to
336 		 * avoid garbage return values.  Note that the addition is
337 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
338 		 */
339 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
340 			error = EINVAL;
341 			goto done;
342 		}
343 		iov++;
344 	}
345 #ifdef KTRACE
346 	/*
347 	 * if tracing, save a copy of iovec
348 	 */
349 	if (KTRPOINT(p, KTR_GENIO)) {
350 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
351 		bcopy(auio.uio_iov, ktriov, iovlen);
352 	}
353 #endif
354 	cnt = auio.uio_resid;
355 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
356 	if (error) {
357 		if (auio.uio_resid != cnt && (error == ERESTART ||
358 		    error == EINTR || error == EWOULDBLOCK))
359 			error = 0;
360 		if (error == EPIPE)
361 			ptsignal(p, SIGPIPE, STHREAD);
362 	}
363 	cnt -= auio.uio_resid;
364 
365 	fp->f_wxfer++;
366 	fp->f_wbytes += cnt;
367 #ifdef KTRACE
368 	if (ktriov != NULL) {
369 		if (error == 0)
370 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
371 		free(ktriov, M_TEMP, iovlen);
372 	}
373 #endif
374 	*retval = cnt;
375  done:
376 	if (needfree)
377 		free(needfree, M_IOV, iovlen);
378  out:
379 	FRELE(fp, p);
380 	return (error);
381 }
382 
383 /*
384  * Ioctl system call
385  */
386 /* ARGSUSED */
387 int
388 sys_ioctl(struct proc *p, void *v, register_t *retval)
389 {
390 	struct sys_ioctl_args /* {
391 		syscallarg(int) fd;
392 		syscallarg(u_long) com;
393 		syscallarg(void *) data;
394 	} */ *uap = v;
395 	struct file *fp;
396 	struct filedesc *fdp;
397 	u_long com;
398 	int error;
399 	u_int size;
400 	caddr_t data, memp;
401 	int tmp;
402 #define STK_PARAMS	128
403 	long long stkbuf[STK_PARAMS / sizeof(long long)];
404 
405 	fdp = p->p_fd;
406 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
407 		return (EBADF);
408 
409 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
410 		return (EBADF);
411 
412 	switch (com = SCARG(uap, com)) {
413 	case FIONCLEX:
414 	case FIOCLEX:
415 		fdplock(fdp);
416 		if (com == FIONCLEX)
417 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
418 		else
419 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
420 		fdpunlock(fdp);
421 		return (0);
422 	}
423 
424 	/*
425 	 * Interpret high order word to find amount of data to be
426 	 * copied to/from the user's address space.
427 	 */
428 	size = IOCPARM_LEN(com);
429 	if (size > IOCPARM_MAX)
430 		return (ENOTTY);
431 	FREF(fp);
432 	memp = NULL;
433 	if (size > sizeof (stkbuf)) {
434 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
435 		data = memp;
436 	} else
437 		data = (caddr_t)stkbuf;
438 	if (com&IOC_IN) {
439 		if (size) {
440 			error = copyin(SCARG(uap, data), data, (u_int)size);
441 			if (error) {
442 				goto out;
443 			}
444 		} else
445 			*(caddr_t *)data = SCARG(uap, data);
446 	} else if ((com&IOC_OUT) && size)
447 		/*
448 		 * Zero the buffer so the user always
449 		 * gets back something deterministic.
450 		 */
451 		memset(data, 0, size);
452 	else if (com&IOC_VOID)
453 		*(caddr_t *)data = SCARG(uap, data);
454 
455 	switch (com) {
456 
457 	case FIONBIO:
458 		if ((tmp = *(int *)data) != 0)
459 			fp->f_flag |= FNONBLOCK;
460 		else
461 			fp->f_flag &= ~FNONBLOCK;
462 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
463 		break;
464 
465 	case FIOASYNC:
466 		if ((tmp = *(int *)data) != 0)
467 			fp->f_flag |= FASYNC;
468 		else
469 			fp->f_flag &= ~FASYNC;
470 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
471 		break;
472 
473 	case FIOSETOWN:
474 		tmp = *(int *)data;
475 		if (fp->f_type == DTYPE_SOCKET) {
476 			struct socket *so = (struct socket *)fp->f_data;
477 
478 			so->so_pgid = tmp;
479 			so->so_siguid = p->p_ucred->cr_ruid;
480 			so->so_sigeuid = p->p_ucred->cr_uid;
481 			error = 0;
482 			break;
483 		}
484 		if (tmp <= 0) {
485 			tmp = -tmp;
486 		} else {
487 			struct process *pr = prfind(tmp);
488 			if (pr == NULL) {
489 				error = ESRCH;
490 				break;
491 			}
492 			tmp = pr->ps_pgrp->pg_id;
493 		}
494 		error = (*fp->f_ops->fo_ioctl)
495 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
496 		break;
497 
498 	case FIOGETOWN:
499 		if (fp->f_type == DTYPE_SOCKET) {
500 			error = 0;
501 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
502 			break;
503 		}
504 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
505 		*(int *)data = -*(int *)data;
506 		break;
507 
508 	default:
509 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
510 		break;
511 	}
512 	/*
513 	 * Copy any data to user, size was
514 	 * already set and checked above.
515 	 */
516 	if (error == 0 && (com&IOC_OUT) && size)
517 		error = copyout(data, SCARG(uap, data), (u_int)size);
518 out:
519 	FRELE(fp, p);
520 	if (memp)
521 		free(memp, M_IOCTLOPS, size);
522 	return (error);
523 }
524 
/*
 * selwait:  its address is the wait channel that select/poll sleep on.
 * nselcoll: bumped by selwakeup() whenever a collision is flagged.
 */
int	selwait;
int	nselcoll;
526 
527 /*
528  * Select system call.
529  */
530 int
531 sys_select(struct proc *p, void *v, register_t *retval)
532 {
533 	struct sys_select_args /* {
534 		syscallarg(int) nd;
535 		syscallarg(fd_set *) in;
536 		syscallarg(fd_set *) ou;
537 		syscallarg(fd_set *) ex;
538 		syscallarg(struct timeval *) tv;
539 	} */ *uap = v;
540 
541 	struct timespec ts, *tsp = NULL;
542 	int error;
543 
544 	if (SCARG(uap, tv) != NULL) {
545 		struct timeval tv;
546 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
547 			return (error);
548 		if ((error = itimerfix(&tv)) != 0)
549 			return (error);
550 #ifdef KTRACE
551 		if (KTRPOINT(p, KTR_STRUCT))
552 			ktrreltimeval(p, &tv);
553 #endif
554 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
555 		tsp = &ts;
556 	}
557 
558 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
559 	    SCARG(uap, ex), tsp, NULL, retval));
560 }
561 
562 int
563 sys_pselect(struct proc *p, void *v, register_t *retval)
564 {
565 	struct sys_pselect_args /* {
566 		syscallarg(int) nd;
567 		syscallarg(fd_set *) in;
568 		syscallarg(fd_set *) ou;
569 		syscallarg(fd_set *) ex;
570 		syscallarg(const struct timespec *) ts;
571 		syscallarg(const sigset_t *) mask;
572 	} */ *uap = v;
573 
574 	struct timespec ts, *tsp = NULL;
575 	sigset_t ss, *ssp = NULL;
576 	int error;
577 
578 	if (SCARG(uap, ts) != NULL) {
579 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
580 			return (error);
581 		if ((error = timespecfix(&ts)) != 0)
582 			return (error);
583 #ifdef KTRACE
584 		if (KTRPOINT(p, KTR_STRUCT))
585 			ktrreltimespec(p, &ts);
586 #endif
587 		tsp = &ts;
588 	}
589 	if (SCARG(uap, mask) != NULL) {
590 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
591 			return (error);
592 		ssp = &ss;
593 	}
594 
595 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
596 	    SCARG(uap, ex), tsp, ssp, retval));
597 }
598 
599 int
600 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
601     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
602 {
603 	fd_mask bits[6];
604 	fd_set *pibits[3], *pobits[3];
605 	struct timespec ats, rts, tts;
606 	int s, ncoll, error = 0, timo;
607 	u_int ni;
608 
609 	if (nd < 0)
610 		return (EINVAL);
611 	if (nd > p->p_fd->fd_nfiles) {
612 		/* forgiving; slightly wrong */
613 		nd = p->p_fd->fd_nfiles;
614 	}
615 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
616 	if (ni > sizeof(bits[0])) {
617 		caddr_t mbits;
618 
619 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
620 		pibits[0] = (fd_set *)&mbits[ni * 0];
621 		pibits[1] = (fd_set *)&mbits[ni * 1];
622 		pibits[2] = (fd_set *)&mbits[ni * 2];
623 		pobits[0] = (fd_set *)&mbits[ni * 3];
624 		pobits[1] = (fd_set *)&mbits[ni * 4];
625 		pobits[2] = (fd_set *)&mbits[ni * 5];
626 	} else {
627 		memset(bits, 0, sizeof(bits));
628 		pibits[0] = (fd_set *)&bits[0];
629 		pibits[1] = (fd_set *)&bits[1];
630 		pibits[2] = (fd_set *)&bits[2];
631 		pobits[0] = (fd_set *)&bits[3];
632 		pobits[1] = (fd_set *)&bits[4];
633 		pobits[2] = (fd_set *)&bits[5];
634 	}
635 
636 #define	getbits(name, x) \
637 	if (name && (error = copyin(name, pibits[x], ni))) \
638 		goto done;
639 	getbits(in, 0);
640 	getbits(ou, 1);
641 	getbits(ex, 2);
642 #undef	getbits
643 #ifdef KTRACE
644 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
645 		if (in) ktrfdset(p, pibits[0], ni);
646 		if (ou) ktrfdset(p, pibits[1], ni);
647 		if (ex) ktrfdset(p, pibits[2], ni);
648 	}
649 #endif
650 
651 	if (tsp) {
652 		getnanouptime(&rts);
653 		timespecadd(tsp, &rts, &ats);
654 	} else {
655 		ats.tv_sec = 0;
656 		ats.tv_nsec = 0;
657 	}
658 	timo = 0;
659 
660 	if (sigmask)
661 		dosigsuspend(p, *sigmask &~ sigcantmask);
662 
663 retry:
664 	ncoll = nselcoll;
665 	atomic_setbits_int(&p->p_flag, P_SELECT);
666 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
667 	if (error || *retval)
668 		goto done;
669 	if (tsp) {
670 		getnanouptime(&rts);
671 		if (timespeccmp(&rts, &ats, >=))
672 			goto done;
673 		timespecsub(&ats, &rts, &tts);
674 		timo = tts.tv_sec > 24 * 60 * 60 ?
675 			24 * 60 * 60 * hz : tstohz(&tts);
676 	}
677 	s = splhigh();
678 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
679 		splx(s);
680 		goto retry;
681 	}
682 	atomic_clearbits_int(&p->p_flag, P_SELECT);
683 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
684 	splx(s);
685 	if (error == 0)
686 		goto retry;
687 done:
688 	atomic_clearbits_int(&p->p_flag, P_SELECT);
689 	/* select is not restarted after signals... */
690 	if (error == ERESTART)
691 		error = EINTR;
692 	if (error == EWOULDBLOCK)
693 		error = 0;
694 #define	putbits(name, x) \
695 	if (name && (error2 = copyout(pobits[x], name, ni))) \
696 		error = error2;
697 	if (error == 0) {
698 		int error2;
699 
700 		putbits(in, 0);
701 		putbits(ou, 1);
702 		putbits(ex, 2);
703 #undef putbits
704 #ifdef KTRACE
705 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
706 			if (in) ktrfdset(p, pobits[0], ni);
707 			if (ou) ktrfdset(p, pobits[1], ni);
708 			if (ex) ktrfdset(p, pobits[2], ni);
709 		}
710 #endif
711 	}
712 
713 	if (pibits[0] != (fd_set *)&bits[0])
714 		free(pibits[0], M_TEMP, 6 * ni);
715 	return (error);
716 }
717 
718 int
719 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
720     register_t *retval)
721 {
722 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
723 	struct filedesc *fdp = p->p_fd;
724 	int msk, i, j, fd;
725 	fd_mask bits;
726 	struct file *fp;
727 	int n = 0;
728 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
729 
730 	for (msk = 0; msk < 3; msk++) {
731 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
732 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
733 
734 		for (i = 0; i < nfd; i += NFDBITS) {
735 			bits = pibits->fds_bits[i/NFDBITS];
736 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
737 				bits &= ~(1 << j);
738 				if ((fp = fd_getfile(fdp, fd)) == NULL)
739 					return (EBADF);
740 				FREF(fp);
741 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
742 					FD_SET(fd, pobits);
743 					n++;
744 				}
745 				FRELE(fp, p);
746 			}
747 		}
748 	}
749 	*retval = n;
750 	return (0);
751 }
752 
/*
 * Poll routine that reports normal read and write readiness
 * unconditionally: returns the subset of `events' made up of POLLIN,
 * POLLOUT, POLLRDNORM and POLLWRNORM.  `dev' and `p' are unused.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
760 
/*
 * Poll routine that never reports readiness: always returns 0.
 * All arguments are unused.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	return (0);
}
767 
768 /*
769  * Record a select request.
770  */
771 void
772 selrecord(struct proc *selector, struct selinfo *sip)
773 {
774 	struct proc *p;
775 	pid_t mypid;
776 
777 	mypid = selector->p_pid;
778 	if (sip->si_selpid == mypid)
779 		return;
780 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
781 	    p->p_wchan == (caddr_t)&selwait)
782 		sip->si_flags |= SI_COLL;
783 	else
784 		sip->si_selpid = mypid;
785 }
786 
787 /*
788  * Do a wakeup when a selectable event occurs.
789  */
790 void
791 selwakeup(struct selinfo *sip)
792 {
793 	struct proc *p;
794 	int s;
795 
796 	KNOTE(&sip->si_note, 0);
797 	if (sip->si_selpid == 0)
798 		return;
799 	if (sip->si_flags & SI_COLL) {
800 		nselcoll++;
801 		sip->si_flags &= ~SI_COLL;
802 		wakeup(&selwait);
803 	}
804 	p = pfind(sip->si_selpid);
805 	sip->si_selpid = 0;
806 	if (p != NULL) {
807 		SCHED_LOCK(s);
808 		if (p->p_wchan == (caddr_t)&selwait) {
809 			if (p->p_stat == SSLEEP)
810 				setrunnable(p);
811 			else
812 				unsleep(p);
813 		} else if (p->p_flag & P_SELECT)
814 			atomic_clearbits_int(&p->p_flag, P_SELECT);
815 		SCHED_UNLOCK(s);
816 	}
817 }
818 
819 void
820 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
821 {
822 	struct filedesc *fdp = p->p_fd;
823 	struct file *fp;
824 	u_int i;
825 	int n = 0;
826 
827 	for (i = 0; i < nfd; i++, pl++) {
828 		/* Check the file descriptor. */
829 		if (pl->fd < 0) {
830 			pl->revents = 0;
831 			continue;
832 		}
833 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
834 			pl->revents = POLLNVAL;
835 			n++;
836 			continue;
837 		}
838 		FREF(fp);
839 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
840 		FRELE(fp, p);
841 		if (pl->revents != 0)
842 			n++;
843 	}
844 	*retval = n;
845 }
846 
847 /*
848  * Only copyout the revents field.
849  */
850 int
851 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
852 {
853 	int error = 0;
854 	u_int i = 0;
855 
856 	while (!error && i++ < nfds) {
857 		error = copyout(&pl->revents, &upl->revents,
858 		    sizeof(upl->revents));
859 		pl++;
860 		upl++;
861 	}
862 
863 	return (error);
864 }
865 
866 /*
867  * We are using the same mechanism as select only we encode/decode args
868  * differently.
869  */
870 int
871 sys_poll(struct proc *p, void *v, register_t *retval)
872 {
873 	struct sys_poll_args /* {
874 		syscallarg(struct pollfd *) fds;
875 		syscallarg(u_int) nfds;
876 		syscallarg(int) timeout;
877 	} */ *uap = v;
878 
879 	struct timespec ts, *tsp = NULL;
880 	int msec = SCARG(uap, timeout);
881 
882 	if (msec != INFTIM) {
883 		if (msec < 0)
884 			return (EINVAL);
885 		ts.tv_sec = msec / 1000;
886 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
887 		tsp = &ts;
888 	}
889 
890 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
891 	    retval));
892 }
893 
894 int
895 sys_ppoll(struct proc *p, void *v, register_t *retval)
896 {
897 	struct sys_ppoll_args /* {
898 		syscallarg(struct pollfd *) fds;
899 		syscallarg(u_int) nfds;
900 		syscallarg(const struct timespec *) ts;
901 		syscallarg(const sigset_t *) mask;
902 	} */ *uap = v;
903 
904 	int error;
905 	struct timespec ts, *tsp = NULL;
906 	sigset_t ss, *ssp = NULL;
907 
908 	if (SCARG(uap, ts) != NULL) {
909 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
910 			return (error);
911 		if ((error = timespecfix(&ts)) != 0)
912 			return (error);
913 #ifdef KTRACE
914 		if (KTRPOINT(p, KTR_STRUCT))
915 			ktrreltimespec(p, &ts);
916 #endif
917 		tsp = &ts;
918 	}
919 
920 	if (SCARG(uap, mask) != NULL) {
921 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
922 			return (error);
923 		ssp = &ss;
924 	}
925 
926 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
927 	    retval));
928 }
929 
930 int
931 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
932     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
933 {
934 	size_t sz;
935 	struct pollfd pfds[4], *pl = pfds;
936 	struct timespec ats, rts, tts;
937 	int timo, ncoll, i, s, error;
938 	extern int nselcoll, selwait;
939 
940 	/* Standards say no more than MAX_OPEN; this is possibly better. */
941 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
942 		return (EINVAL);
943 
944 	/* optimize for the default case, of a small nfds value */
945 	if (nfds > nitems(pfds)) {
946 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
947 		    M_WAITOK | M_CANFAIL);
948 		if (pl == NULL)
949 			return (EINVAL);
950 	}
951 
952 	sz = nfds * sizeof(*pl);
953 
954 	if ((error = copyin(fds, pl, sz)) != 0)
955 		goto bad;
956 
957 	for (i = 0; i < nfds; i++)
958 		pl[i].revents = 0;
959 
960 	if (tsp != NULL) {
961 		getnanouptime(&rts);
962 		timespecadd(tsp, &rts, &ats);
963 	} else {
964 		ats.tv_sec = 0;
965 		ats.tv_nsec = 0;
966 	}
967 	timo = 0;
968 
969 	if (sigmask)
970 		dosigsuspend(p, *sigmask &~ sigcantmask);
971 
972 retry:
973 	ncoll = nselcoll;
974 	atomic_setbits_int(&p->p_flag, P_SELECT);
975 	pollscan(p, pl, nfds, retval);
976 	if (*retval)
977 		goto done;
978 	if (tsp != NULL) {
979 		getnanouptime(&rts);
980 		if (timespeccmp(&rts, &ats, >=))
981 			goto done;
982 		timespecsub(&ats, &rts, &tts);
983 		timo = tts.tv_sec > 24 * 60 * 60 ?
984 			24 * 60 * 60 * hz : tstohz(&tts);
985 	}
986 	s = splhigh();
987 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
988 		splx(s);
989 		goto retry;
990 	}
991 	atomic_clearbits_int(&p->p_flag, P_SELECT);
992 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
993 	splx(s);
994 	if (error == 0)
995 		goto retry;
996 
997 done:
998 	atomic_clearbits_int(&p->p_flag, P_SELECT);
999 	/*
1000 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1001 	 *       ignored (since the whole point is to see what would block).
1002 	 */
1003 	switch (error) {
1004 	case ERESTART:
1005 		error = pollout(pl, fds, nfds);
1006 		if (error == 0)
1007 			error = EINTR;
1008 		break;
1009 	case EWOULDBLOCK:
1010 	case 0:
1011 		error = pollout(pl, fds, nfds);
1012 		break;
1013 	}
1014 bad:
1015 	if (pl != pfds)
1016 		free(pl, M_TEMP, sz);
1017 	return (error);
1018 }
1019 
1020 /*
1021  * utrace system call
1022  */
1023 /* ARGSUSED */
1024 int
1025 sys_utrace(struct proc *curp, void *v, register_t *retval)
1026 {
1027 #ifdef KTRACE
1028 	struct sys_utrace_args /* {
1029 		syscallarg(const char *) label;
1030 		syscallarg(const void *) addr;
1031 		syscallarg(size_t) len;
1032 	} */ *uap = v;
1033 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1034 	    SCARG(uap, len)));
1035 #else
1036 	return (0);
1037 #endif
1038 }
1039