xref: /openbsd-src/sys/kern/sys_generic.c (revision bae06bfd75bcb8885bc04d9fe10305e524033034)
1 /*	$OpenBSD: sys_generic.c,v 1.118 2018/04/27 10:13:37 mpi Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/resourcevar.h>
49 #include <sys/socketvar.h>
50 #include <sys/signalvar.h>
51 #include <sys/uio.h>
52 #include <sys/kernel.h>
53 #include <sys/stat.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #ifdef KTRACE
57 #include <sys/ktrace.h>
58 #endif
59 #include <sys/sched.h>
60 #include <sys/pledge.h>
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
68 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
69 int pollout(struct pollfd *, struct pollfd *, u_int);
70 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
71     const struct timespec *, const sigset_t *, register_t *);
72 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
73     const sigset_t *, register_t *);
74 
75 /*
76  * Read system call.
77  */
78 int
79 sys_read(struct proc *p, void *v, register_t *retval)
80 {
81 	struct sys_read_args /* {
82 		syscallarg(int) fd;
83 		syscallarg(void *) buf;
84 		syscallarg(size_t) nbyte;
85 	} */ *uap = v;
86 	struct iovec iov;
87 	int fd = SCARG(uap, fd);
88 	struct file *fp;
89 	struct filedesc *fdp = p->p_fd;
90 
91 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
92 		return (EBADF);
93 
94 	iov.iov_base = SCARG(uap, buf);
95 	iov.iov_len = SCARG(uap, nbyte);
96 
97 	/* dofilereadv() will FRELE the descriptor for us */
98 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
99 }
100 
101 /*
102  * Scatter read system call.
103  */
104 int
105 sys_readv(struct proc *p, void *v, register_t *retval)
106 {
107 	struct sys_readv_args /* {
108 		syscallarg(int) fd;
109 		syscallarg(const struct iovec *) iovp;
110 		syscallarg(int) iovcnt;
111 	} */ *uap = v;
112 	int fd = SCARG(uap, fd);
113 	struct file *fp;
114 	struct filedesc *fdp = p->p_fd;
115 
116 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
117 		return (EBADF);
118 
119 	/* dofilereadv() will FRELE the descriptor for us */
120 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
121 	    &fp->f_offset, retval));
122 }
123 
124 int
125 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
126     int iovcnt, int userspace, off_t *offset, register_t *retval)
127 {
128 	struct iovec aiov[UIO_SMALLIOV];
129 	struct uio auio;
130 	struct iovec *iov;
131 	struct iovec *needfree = NULL;
132 	long i, cnt, error = 0;
133 	u_int iovlen;
134 #ifdef KTRACE
135 	struct iovec *ktriov = NULL;
136 #endif
137 
138 	/* note: can't use iovlen until iovcnt is validated */
139 	iovlen = iovcnt * sizeof(struct iovec);
140 
141 	/*
142 	 * If the iovec array exists in userspace, it needs to be copied in;
143 	 * otherwise, it can be used directly.
144 	 */
145 	if (userspace) {
146 		if ((u_int)iovcnt > UIO_SMALLIOV) {
147 			if ((u_int)iovcnt > IOV_MAX) {
148 				error = EINVAL;
149 				goto out;
150 			}
151 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
152 		} else if ((u_int)iovcnt > 0) {
153 			iov = aiov;
154 			needfree = NULL;
155 		} else {
156 			error = EINVAL;
157 			goto out;
158 		}
159 		if ((error = copyin(iovp, iov, iovlen)))
160 			goto done;
161 #ifdef KTRACE
162 		if (KTRPOINT(p, KTR_STRUCT))
163 			ktriovec(p, iov, iovcnt);
164 #endif
165 	} else {
166 		iov = (struct iovec *)iovp;		/* de-constify */
167 	}
168 
169 	auio.uio_iov = iov;
170 	auio.uio_iovcnt = iovcnt;
171 	auio.uio_rw = UIO_READ;
172 	auio.uio_segflg = UIO_USERSPACE;
173 	auio.uio_procp = p;
174 	auio.uio_resid = 0;
175 	for (i = 0; i < iovcnt; i++) {
176 		auio.uio_resid += iov->iov_len;
177 		/*
178 		 * Reads return ssize_t because -1 is returned on error.
179 		 * Therefore we must restrict the length to SSIZE_MAX to
180 		 * avoid garbage return values.  Note that the addition is
181 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
182 		 */
183 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
184 			error = EINVAL;
185 			goto done;
186 		}
187 		iov++;
188 	}
189 #ifdef KTRACE
190 	/*
191 	 * if tracing, save a copy of iovec
192 	 */
193 	if (KTRPOINT(p, KTR_GENIO)) {
194 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
195 		memcpy(ktriov, auio.uio_iov, iovlen);
196 	}
197 #endif
198 	cnt = auio.uio_resid;
199 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
200 	if (error)
201 		if (auio.uio_resid != cnt && (error == ERESTART ||
202 		    error == EINTR || error == EWOULDBLOCK))
203 			error = 0;
204 	cnt -= auio.uio_resid;
205 
206 	fp->f_rxfer++;
207 	fp->f_rbytes += cnt;
208 #ifdef KTRACE
209 	if (ktriov != NULL) {
210 		if (error == 0)
211 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
212 		free(ktriov, M_TEMP, iovlen);
213 	}
214 #endif
215 	*retval = cnt;
216  done:
217 	if (needfree)
218 		free(needfree, M_IOV, iovlen);
219  out:
220 	FRELE(fp, p);
221 	return (error);
222 }
223 
224 /*
225  * Write system call
226  */
227 int
228 sys_write(struct proc *p, void *v, register_t *retval)
229 {
230 	struct sys_write_args /* {
231 		syscallarg(int) fd;
232 		syscallarg(const void *) buf;
233 		syscallarg(size_t) nbyte;
234 	} */ *uap = v;
235 	struct iovec iov;
236 	int fd = SCARG(uap, fd);
237 	struct file *fp;
238 	struct filedesc *fdp = p->p_fd;
239 
240 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
241 		return (EBADF);
242 
243 	iov.iov_base = (void *)SCARG(uap, buf);
244 	iov.iov_len = SCARG(uap, nbyte);
245 
246 	/* dofilewritev() will FRELE the descriptor for us */
247 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
248 }
249 
250 /*
251  * Gather write system call
252  */
253 int
254 sys_writev(struct proc *p, void *v, register_t *retval)
255 {
256 	struct sys_writev_args /* {
257 		syscallarg(int) fd;
258 		syscallarg(const struct iovec *) iovp;
259 		syscallarg(int) iovcnt;
260 	} */ *uap = v;
261 	int fd = SCARG(uap, fd);
262 	struct file *fp;
263 	struct filedesc *fdp = p->p_fd;
264 
265 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
266 		return (EBADF);
267 
268 	/* dofilewritev() will FRELE the descriptor for us */
269 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
270 	    &fp->f_offset, retval));
271 }
272 
273 int
274 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
275     int iovcnt, int userspace, off_t *offset, register_t *retval)
276 {
277 	struct iovec aiov[UIO_SMALLIOV];
278 	struct uio auio;
279 	struct iovec *iov;
280 	struct iovec *needfree = NULL;
281 	long i, cnt, error = 0;
282 	u_int iovlen;
283 #ifdef KTRACE
284 	struct iovec *ktriov = NULL;
285 #endif
286 
287 	/* note: can't use iovlen until iovcnt is validated */
288 	iovlen = iovcnt * sizeof(struct iovec);
289 
290 	/*
291 	 * If the iovec array exists in userspace, it needs to be copied in;
292 	 * otherwise, it can be used directly.
293 	 */
294 	if (userspace) {
295 		if ((u_int)iovcnt > UIO_SMALLIOV) {
296 			if ((u_int)iovcnt > IOV_MAX) {
297 				error = EINVAL;
298 				goto out;
299 			}
300 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
301 		} else if ((u_int)iovcnt > 0) {
302 			iov = aiov;
303 			needfree = NULL;
304 		} else {
305 			error = EINVAL;
306 			goto out;
307 		}
308 		if ((error = copyin(iovp, iov, iovlen)))
309 			goto done;
310 #ifdef KTRACE
311 		if (KTRPOINT(p, KTR_STRUCT))
312 			ktriovec(p, iov, iovcnt);
313 #endif
314 	} else {
315 		iov = (struct iovec *)iovp;		/* de-constify */
316 	}
317 
318 	auio.uio_iov = iov;
319 	auio.uio_iovcnt = iovcnt;
320 	auio.uio_rw = UIO_WRITE;
321 	auio.uio_segflg = UIO_USERSPACE;
322 	auio.uio_procp = p;
323 	auio.uio_resid = 0;
324 	for (i = 0; i < iovcnt; i++) {
325 		auio.uio_resid += iov->iov_len;
326 		/*
327 		 * Writes return ssize_t because -1 is returned on error.
328 		 * Therefore we must restrict the length to SSIZE_MAX to
329 		 * avoid garbage return values.  Note that the addition is
330 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
331 		 */
332 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
333 			error = EINVAL;
334 			goto done;
335 		}
336 		iov++;
337 	}
338 #ifdef KTRACE
339 	/*
340 	 * if tracing, save a copy of iovec
341 	 */
342 	if (KTRPOINT(p, KTR_GENIO)) {
343 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
344 		memcpy(ktriov, auio.uio_iov, iovlen);
345 	}
346 #endif
347 	cnt = auio.uio_resid;
348 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
349 	if (error) {
350 		if (auio.uio_resid != cnt && (error == ERESTART ||
351 		    error == EINTR || error == EWOULDBLOCK))
352 			error = 0;
353 		if (error == EPIPE)
354 			ptsignal(p, SIGPIPE, STHREAD);
355 	}
356 	cnt -= auio.uio_resid;
357 
358 	fp->f_wxfer++;
359 	fp->f_wbytes += cnt;
360 #ifdef KTRACE
361 	if (ktriov != NULL) {
362 		if (error == 0)
363 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
364 		free(ktriov, M_TEMP, iovlen);
365 	}
366 #endif
367 	*retval = cnt;
368  done:
369 	if (needfree)
370 		free(needfree, M_IOV, iovlen);
371  out:
372 	FRELE(fp, p);
373 	return (error);
374 }
375 
376 /*
377  * Ioctl system call
378  */
379 int
380 sys_ioctl(struct proc *p, void *v, register_t *retval)
381 {
382 	struct sys_ioctl_args /* {
383 		syscallarg(int) fd;
384 		syscallarg(u_long) com;
385 		syscallarg(void *) data;
386 	} */ *uap = v;
387 	struct file *fp;
388 	struct filedesc *fdp;
389 	u_long com = SCARG(uap, com);
390 	int error = 0;
391 	u_int size;
392 	caddr_t data, memp = NULL;
393 	int tmp;
394 #define STK_PARAMS	128
395 	long long stkbuf[STK_PARAMS / sizeof(long long)];
396 
397 	fdp = p->p_fd;
398 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
399 		return (EBADF);
400 
401 	if (fp->f_type == DTYPE_SOCKET) {
402 		struct socket *so = fp->f_data;
403 
404 		if (so->so_state & SS_DNS) {
405 			error = EINVAL;
406 			goto out;
407 		}
408 	}
409 
410 	error = pledge_ioctl(p, com, fp);
411 	if (error)
412 		goto out;
413 
414 	switch (com) {
415 	case FIONCLEX:
416 	case FIOCLEX:
417 		fdplock(fdp);
418 		if (com == FIONCLEX)
419 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
420 		else
421 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
422 		fdpunlock(fdp);
423 		goto out;
424 	}
425 
426 	/*
427 	 * Interpret high order word to find amount of data to be
428 	 * copied to/from the user's address space.
429 	 */
430 	size = IOCPARM_LEN(com);
431 	if (size > IOCPARM_MAX) {
432 		error = ENOTTY;
433 		goto out;
434 	}
435 	if (size > sizeof (stkbuf)) {
436 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
437 		data = memp;
438 	} else
439 		data = (caddr_t)stkbuf;
440 	if (com&IOC_IN) {
441 		if (size) {
442 			error = copyin(SCARG(uap, data), data, size);
443 			if (error) {
444 				goto out;
445 			}
446 		} else
447 			*(caddr_t *)data = SCARG(uap, data);
448 	} else if ((com&IOC_OUT) && size)
449 		/*
450 		 * Zero the buffer so the user always
451 		 * gets back something deterministic.
452 		 */
453 		memset(data, 0, size);
454 	else if (com&IOC_VOID)
455 		*(caddr_t *)data = SCARG(uap, data);
456 
457 	switch (com) {
458 
459 	case FIONBIO:
460 		if ((tmp = *(int *)data) != 0)
461 			fp->f_flag |= FNONBLOCK;
462 		else
463 			fp->f_flag &= ~FNONBLOCK;
464 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
465 		break;
466 
467 	case FIOASYNC:
468 		if ((tmp = *(int *)data) != 0)
469 			fp->f_flag |= FASYNC;
470 		else
471 			fp->f_flag &= ~FASYNC;
472 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
473 		break;
474 
475 	case FIOSETOWN:
476 		tmp = *(int *)data;
477 		if (fp->f_type == DTYPE_SOCKET) {
478 			struct socket *so = fp->f_data;
479 
480 			so->so_pgid = tmp;
481 			so->so_siguid = p->p_ucred->cr_ruid;
482 			so->so_sigeuid = p->p_ucred->cr_uid;
483 			error = 0;
484 			break;
485 		}
486 		if (tmp <= 0) {
487 			tmp = -tmp;
488 		} else {
489 			struct process *pr = prfind(tmp);
490 			if (pr == NULL) {
491 				error = ESRCH;
492 				break;
493 			}
494 			tmp = pr->ps_pgrp->pg_id;
495 		}
496 		error = (*fp->f_ops->fo_ioctl)
497 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
498 		break;
499 
500 	case FIOGETOWN:
501 		if (fp->f_type == DTYPE_SOCKET) {
502 			error = 0;
503 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
504 			break;
505 		}
506 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
507 		*(int *)data = -*(int *)data;
508 		break;
509 
510 	default:
511 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
512 		break;
513 	}
514 	/*
515 	 * Copy any data to user, size was
516 	 * already set and checked above.
517 	 */
518 	if (error == 0 && (com&IOC_OUT) && size)
519 		error = copyout(data, SCARG(uap, data), size);
520 out:
521 	FRELE(fp, p);
522 	free(memp, M_IOCTLOPS, size);
523 	return (error);
524 }
525 
526 int	selwait, nselcoll;
527 
528 /*
529  * Select system call.
530  */
531 int
532 sys_select(struct proc *p, void *v, register_t *retval)
533 {
534 	struct sys_select_args /* {
535 		syscallarg(int) nd;
536 		syscallarg(fd_set *) in;
537 		syscallarg(fd_set *) ou;
538 		syscallarg(fd_set *) ex;
539 		syscallarg(struct timeval *) tv;
540 	} */ *uap = v;
541 
542 	struct timespec ts, *tsp = NULL;
543 	int error;
544 
545 	if (SCARG(uap, tv) != NULL) {
546 		struct timeval tv;
547 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
548 			return (error);
549 		if ((error = itimerfix(&tv)) != 0)
550 			return (error);
551 #ifdef KTRACE
552 		if (KTRPOINT(p, KTR_STRUCT))
553 			ktrreltimeval(p, &tv);
554 #endif
555 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
556 		tsp = &ts;
557 	}
558 
559 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
560 	    SCARG(uap, ex), tsp, NULL, retval));
561 }
562 
563 int
564 sys_pselect(struct proc *p, void *v, register_t *retval)
565 {
566 	struct sys_pselect_args /* {
567 		syscallarg(int) nd;
568 		syscallarg(fd_set *) in;
569 		syscallarg(fd_set *) ou;
570 		syscallarg(fd_set *) ex;
571 		syscallarg(const struct timespec *) ts;
572 		syscallarg(const sigset_t *) mask;
573 	} */ *uap = v;
574 
575 	struct timespec ts, *tsp = NULL;
576 	sigset_t ss, *ssp = NULL;
577 	int error;
578 
579 	if (SCARG(uap, ts) != NULL) {
580 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
581 			return (error);
582 		if ((error = timespecfix(&ts)) != 0)
583 			return (error);
584 #ifdef KTRACE
585 		if (KTRPOINT(p, KTR_STRUCT))
586 			ktrreltimespec(p, &ts);
587 #endif
588 		tsp = &ts;
589 	}
590 	if (SCARG(uap, mask) != NULL) {
591 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
592 			return (error);
593 		ssp = &ss;
594 	}
595 
596 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
597 	    SCARG(uap, ex), tsp, ssp, retval));
598 }
599 
600 int
601 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
602     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
603 {
604 	fd_mask bits[6];
605 	fd_set *pibits[3], *pobits[3];
606 	struct timespec ats, rts, tts;
607 	int s, ncoll, error = 0, timo;
608 	u_int ni;
609 
610 	if (nd < 0)
611 		return (EINVAL);
612 	if (nd > p->p_fd->fd_nfiles) {
613 		/* forgiving; slightly wrong */
614 		nd = p->p_fd->fd_nfiles;
615 	}
616 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
617 	if (ni > sizeof(bits[0])) {
618 		caddr_t mbits;
619 
620 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
621 		pibits[0] = (fd_set *)&mbits[ni * 0];
622 		pibits[1] = (fd_set *)&mbits[ni * 1];
623 		pibits[2] = (fd_set *)&mbits[ni * 2];
624 		pobits[0] = (fd_set *)&mbits[ni * 3];
625 		pobits[1] = (fd_set *)&mbits[ni * 4];
626 		pobits[2] = (fd_set *)&mbits[ni * 5];
627 	} else {
628 		memset(bits, 0, sizeof(bits));
629 		pibits[0] = (fd_set *)&bits[0];
630 		pibits[1] = (fd_set *)&bits[1];
631 		pibits[2] = (fd_set *)&bits[2];
632 		pobits[0] = (fd_set *)&bits[3];
633 		pobits[1] = (fd_set *)&bits[4];
634 		pobits[2] = (fd_set *)&bits[5];
635 	}
636 
637 #define	getbits(name, x) \
638 	if (name && (error = copyin(name, pibits[x], ni))) \
639 		goto done;
640 	getbits(in, 0);
641 	getbits(ou, 1);
642 	getbits(ex, 2);
643 #undef	getbits
644 #ifdef KTRACE
645 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
646 		if (in) ktrfdset(p, pibits[0], ni);
647 		if (ou) ktrfdset(p, pibits[1], ni);
648 		if (ex) ktrfdset(p, pibits[2], ni);
649 	}
650 #endif
651 
652 	if (tsp) {
653 		getnanouptime(&rts);
654 		timespecadd(tsp, &rts, &ats);
655 	} else {
656 		ats.tv_sec = 0;
657 		ats.tv_nsec = 0;
658 	}
659 	timo = 0;
660 
661 	if (sigmask)
662 		dosigsuspend(p, *sigmask &~ sigcantmask);
663 
664 retry:
665 	ncoll = nselcoll;
666 	atomic_setbits_int(&p->p_flag, P_SELECT);
667 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
668 	if (error || *retval)
669 		goto done;
670 	if (tsp) {
671 		getnanouptime(&rts);
672 		if (timespeccmp(&rts, &ats, >=))
673 			goto done;
674 		timespecsub(&ats, &rts, &tts);
675 		timo = tts.tv_sec > 24 * 60 * 60 ?
676 			24 * 60 * 60 * hz : tstohz(&tts);
677 	}
678 	s = splhigh();
679 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
680 		splx(s);
681 		goto retry;
682 	}
683 	atomic_clearbits_int(&p->p_flag, P_SELECT);
684 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
685 	splx(s);
686 	if (error == 0)
687 		goto retry;
688 done:
689 	atomic_clearbits_int(&p->p_flag, P_SELECT);
690 	/* select is not restarted after signals... */
691 	if (error == ERESTART)
692 		error = EINTR;
693 	if (error == EWOULDBLOCK)
694 		error = 0;
695 #define	putbits(name, x) \
696 	if (name && (error2 = copyout(pobits[x], name, ni))) \
697 		error = error2;
698 	if (error == 0) {
699 		int error2;
700 
701 		putbits(in, 0);
702 		putbits(ou, 1);
703 		putbits(ex, 2);
704 #undef putbits
705 #ifdef KTRACE
706 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
707 			if (in) ktrfdset(p, pobits[0], ni);
708 			if (ou) ktrfdset(p, pobits[1], ni);
709 			if (ex) ktrfdset(p, pobits[2], ni);
710 		}
711 #endif
712 	}
713 
714 	if (pibits[0] != (fd_set *)&bits[0])
715 		free(pibits[0], M_TEMP, 6 * ni);
716 	return (error);
717 }
718 
719 int
720 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
721     register_t *retval)
722 {
723 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
724 	struct filedesc *fdp = p->p_fd;
725 	int msk, i, j, fd;
726 	fd_mask bits;
727 	struct file *fp;
728 	int n = 0;
729 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
730 
731 	for (msk = 0; msk < 3; msk++) {
732 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
733 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
734 
735 		for (i = 0; i < nfd; i += NFDBITS) {
736 			bits = pibits->fds_bits[i/NFDBITS];
737 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
738 				bits &= ~(1 << j);
739 				if ((fp = fd_getfile(fdp, fd)) == NULL)
740 					return (EBADF);
741 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
742 					FD_SET(fd, pobits);
743 					n++;
744 				}
745 				FRELE(fp, p);
746 			}
747 		}
748 	}
749 	*retval = n;
750 	return (0);
751 }
752 
753 int
754 seltrue(dev_t dev, int events, struct proc *p)
755 {
756 
757 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
758 }
759 
760 int
761 selfalse(dev_t dev, int events, struct proc *p)
762 {
763 
764 	return (0);
765 }
766 
767 /*
768  * Record a select request.
769  */
770 void
771 selrecord(struct proc *selector, struct selinfo *sip)
772 {
773 	struct proc *p;
774 	pid_t mytid;
775 
776 	mytid = selector->p_tid;
777 	if (sip->si_seltid == mytid)
778 		return;
779 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
780 	    p->p_wchan == (caddr_t)&selwait)
781 		sip->si_flags |= SI_COLL;
782 	else
783 		sip->si_seltid = mytid;
784 }
785 
786 /*
787  * Do a wakeup when a selectable event occurs.
788  */
789 void
790 selwakeup(struct selinfo *sip)
791 {
792 	struct proc *p;
793 	int s;
794 
795 	KNOTE(&sip->si_note, NOTE_SUBMIT);
796 	if (sip->si_seltid == 0)
797 		return;
798 	if (sip->si_flags & SI_COLL) {
799 		nselcoll++;
800 		sip->si_flags &= ~SI_COLL;
801 		wakeup(&selwait);
802 	}
803 	p = tfind(sip->si_seltid);
804 	sip->si_seltid = 0;
805 	if (p != NULL) {
806 		SCHED_LOCK(s);
807 		if (p->p_wchan == (caddr_t)&selwait) {
808 			if (p->p_stat == SSLEEP)
809 				setrunnable(p);
810 			else
811 				unsleep(p);
812 		} else if (p->p_flag & P_SELECT)
813 			atomic_clearbits_int(&p->p_flag, P_SELECT);
814 		SCHED_UNLOCK(s);
815 	}
816 }
817 
818 void
819 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
820 {
821 	struct filedesc *fdp = p->p_fd;
822 	struct file *fp;
823 	u_int i;
824 	int n = 0;
825 
826 	for (i = 0; i < nfd; i++, pl++) {
827 		/* Check the file descriptor. */
828 		if (pl->fd < 0) {
829 			pl->revents = 0;
830 			continue;
831 		}
832 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
833 			pl->revents = POLLNVAL;
834 			n++;
835 			continue;
836 		}
837 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
838 		FRELE(fp, p);
839 		if (pl->revents != 0)
840 			n++;
841 	}
842 	*retval = n;
843 }
844 
845 /*
846  * Only copyout the revents field.
847  */
848 int
849 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
850 {
851 	int error = 0;
852 	u_int i = 0;
853 
854 	while (!error && i++ < nfds) {
855 		error = copyout(&pl->revents, &upl->revents,
856 		    sizeof(upl->revents));
857 		pl++;
858 		upl++;
859 	}
860 
861 	return (error);
862 }
863 
864 /*
865  * We are using the same mechanism as select only we encode/decode args
866  * differently.
867  */
868 int
869 sys_poll(struct proc *p, void *v, register_t *retval)
870 {
871 	struct sys_poll_args /* {
872 		syscallarg(struct pollfd *) fds;
873 		syscallarg(u_int) nfds;
874 		syscallarg(int) timeout;
875 	} */ *uap = v;
876 
877 	struct timespec ts, *tsp = NULL;
878 	int msec = SCARG(uap, timeout);
879 
880 	if (msec != INFTIM) {
881 		if (msec < 0)
882 			return (EINVAL);
883 		ts.tv_sec = msec / 1000;
884 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
885 		tsp = &ts;
886 	}
887 
888 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
889 	    retval));
890 }
891 
892 int
893 sys_ppoll(struct proc *p, void *v, register_t *retval)
894 {
895 	struct sys_ppoll_args /* {
896 		syscallarg(struct pollfd *) fds;
897 		syscallarg(u_int) nfds;
898 		syscallarg(const struct timespec *) ts;
899 		syscallarg(const sigset_t *) mask;
900 	} */ *uap = v;
901 
902 	int error;
903 	struct timespec ts, *tsp = NULL;
904 	sigset_t ss, *ssp = NULL;
905 
906 	if (SCARG(uap, ts) != NULL) {
907 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
908 			return (error);
909 		if ((error = timespecfix(&ts)) != 0)
910 			return (error);
911 #ifdef KTRACE
912 		if (KTRPOINT(p, KTR_STRUCT))
913 			ktrreltimespec(p, &ts);
914 #endif
915 		tsp = &ts;
916 	}
917 
918 	if (SCARG(uap, mask) != NULL) {
919 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
920 			return (error);
921 		ssp = &ss;
922 	}
923 
924 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
925 	    retval));
926 }
927 
928 int
929 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
930     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
931 {
932 	size_t sz;
933 	struct pollfd pfds[4], *pl = pfds;
934 	struct timespec ats, rts, tts;
935 	int timo, ncoll, i, s, error;
936 
937 	/* Standards say no more than MAX_OPEN; this is possibly better. */
938 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
939 		return (EINVAL);
940 
941 	/* optimize for the default case, of a small nfds value */
942 	if (nfds > nitems(pfds)) {
943 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
944 		    M_WAITOK | M_CANFAIL);
945 		if (pl == NULL)
946 			return (EINVAL);
947 	}
948 
949 	sz = nfds * sizeof(*pl);
950 
951 	if ((error = copyin(fds, pl, sz)) != 0)
952 		goto bad;
953 
954 	for (i = 0; i < nfds; i++) {
955 		pl[i].events &= ~POLL_NOHUP;
956 		pl[i].revents = 0;
957 	}
958 
959 	if (tsp != NULL) {
960 		getnanouptime(&rts);
961 		timespecadd(tsp, &rts, &ats);
962 	} else {
963 		ats.tv_sec = 0;
964 		ats.tv_nsec = 0;
965 	}
966 	timo = 0;
967 
968 	if (sigmask)
969 		dosigsuspend(p, *sigmask &~ sigcantmask);
970 
971 retry:
972 	ncoll = nselcoll;
973 	atomic_setbits_int(&p->p_flag, P_SELECT);
974 	pollscan(p, pl, nfds, retval);
975 	if (*retval)
976 		goto done;
977 	if (tsp != NULL) {
978 		getnanouptime(&rts);
979 		if (timespeccmp(&rts, &ats, >=))
980 			goto done;
981 		timespecsub(&ats, &rts, &tts);
982 		timo = tts.tv_sec > 24 * 60 * 60 ?
983 			24 * 60 * 60 * hz : tstohz(&tts);
984 	}
985 	s = splhigh();
986 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
987 		splx(s);
988 		goto retry;
989 	}
990 	atomic_clearbits_int(&p->p_flag, P_SELECT);
991 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
992 	splx(s);
993 	if (error == 0)
994 		goto retry;
995 
996 done:
997 	atomic_clearbits_int(&p->p_flag, P_SELECT);
998 	/*
999 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1000 	 *       ignored (since the whole point is to see what would block).
1001 	 */
1002 	switch (error) {
1003 	case ERESTART:
1004 		error = pollout(pl, fds, nfds);
1005 		if (error == 0)
1006 			error = EINTR;
1007 		break;
1008 	case EWOULDBLOCK:
1009 	case 0:
1010 		error = pollout(pl, fds, nfds);
1011 		break;
1012 	}
1013 #ifdef KTRACE
1014 	if (KTRPOINT(p, KTR_STRUCT))
1015 		ktrpollfd(p, pl, nfds);
1016 #endif /* KTRACE */
1017 bad:
1018 	if (pl != pfds)
1019 		free(pl, M_TEMP, sz);
1020 	return (error);
1021 }
1022 
1023 /*
1024  * utrace system call
1025  */
1026 int
1027 sys_utrace(struct proc *curp, void *v, register_t *retval)
1028 {
1029 #ifdef KTRACE
1030 	struct sys_utrace_args /* {
1031 		syscallarg(const char *) label;
1032 		syscallarg(const void *) addr;
1033 		syscallarg(size_t) len;
1034 	} */ *uap = v;
1035 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1036 	    SCARG(uap, len)));
1037 #else
1038 	return (0);
1039 #endif
1040 }
1041