xref: /openbsd-src/sys/kern/sys_generic.c (revision d4c5fc9dc00f5a9cadd8c2de4e52d85d3c1c6003)
1 /*	$OpenBSD: sys_generic.c,v 1.119 2018/05/08 08:53:41 mpi Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/resourcevar.h>
49 #include <sys/socketvar.h>
50 #include <sys/signalvar.h>
51 #include <sys/uio.h>
52 #include <sys/kernel.h>
53 #include <sys/stat.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #ifdef KTRACE
57 #include <sys/ktrace.h>
58 #endif
59 #include <sys/sched.h>
60 #include <sys/pledge.h>
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
68 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
69 int pollout(struct pollfd *, struct pollfd *, u_int);
70 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
71     const struct timespec *, const sigset_t *, register_t *);
72 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
73     const sigset_t *, register_t *);
74 
75 /*
76  * Read system call.
77  */
78 int
79 sys_read(struct proc *p, void *v, register_t *retval)
80 {
81 	struct sys_read_args /* {
82 		syscallarg(int) fd;
83 		syscallarg(void *) buf;
84 		syscallarg(size_t) nbyte;
85 	} */ *uap = v;
86 	struct iovec iov;
87 	int fd = SCARG(uap, fd);
88 	struct file *fp;
89 	struct filedesc *fdp = p->p_fd;
90 
91 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
92 		return (EBADF);
93 
94 	iov.iov_base = SCARG(uap, buf);
95 	iov.iov_len = SCARG(uap, nbyte);
96 
97 	/* dofilereadv() will FRELE the descriptor for us */
98 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
99 }
100 
101 /*
102  * Scatter read system call.
103  */
104 int
105 sys_readv(struct proc *p, void *v, register_t *retval)
106 {
107 	struct sys_readv_args /* {
108 		syscallarg(int) fd;
109 		syscallarg(const struct iovec *) iovp;
110 		syscallarg(int) iovcnt;
111 	} */ *uap = v;
112 	int fd = SCARG(uap, fd);
113 	struct file *fp;
114 	struct filedesc *fdp = p->p_fd;
115 
116 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
117 		return (EBADF);
118 
119 	/* dofilereadv() will FRELE the descriptor for us */
120 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
121 	    &fp->f_offset, retval));
122 }
123 
124 int
125 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
126     int iovcnt, int userspace, off_t *offset, register_t *retval)
127 {
128 	struct iovec aiov[UIO_SMALLIOV];
129 	struct uio auio;
130 	struct iovec *iov;
131 	struct iovec *needfree = NULL;
132 	long i, cnt, error = 0;
133 	u_int iovlen;
134 #ifdef KTRACE
135 	struct iovec *ktriov = NULL;
136 #endif
137 
138 	/* note: can't use iovlen until iovcnt is validated */
139 	iovlen = iovcnt * sizeof(struct iovec);
140 
141 	/*
142 	 * If the iovec array exists in userspace, it needs to be copied in;
143 	 * otherwise, it can be used directly.
144 	 */
145 	if (userspace) {
146 		if ((u_int)iovcnt > UIO_SMALLIOV) {
147 			if ((u_int)iovcnt > IOV_MAX) {
148 				error = EINVAL;
149 				goto out;
150 			}
151 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
152 		} else if ((u_int)iovcnt > 0) {
153 			iov = aiov;
154 			needfree = NULL;
155 		} else {
156 			error = EINVAL;
157 			goto out;
158 		}
159 		if ((error = copyin(iovp, iov, iovlen)))
160 			goto done;
161 #ifdef KTRACE
162 		if (KTRPOINT(p, KTR_STRUCT))
163 			ktriovec(p, iov, iovcnt);
164 #endif
165 	} else {
166 		iov = (struct iovec *)iovp;		/* de-constify */
167 	}
168 
169 	auio.uio_iov = iov;
170 	auio.uio_iovcnt = iovcnt;
171 	auio.uio_rw = UIO_READ;
172 	auio.uio_segflg = UIO_USERSPACE;
173 	auio.uio_procp = p;
174 	auio.uio_resid = 0;
175 	for (i = 0; i < iovcnt; i++) {
176 		auio.uio_resid += iov->iov_len;
177 		/*
178 		 * Reads return ssize_t because -1 is returned on error.
179 		 * Therefore we must restrict the length to SSIZE_MAX to
180 		 * avoid garbage return values.  Note that the addition is
181 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
182 		 */
183 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
184 			error = EINVAL;
185 			goto done;
186 		}
187 		iov++;
188 	}
189 #ifdef KTRACE
190 	/*
191 	 * if tracing, save a copy of iovec
192 	 */
193 	if (KTRPOINT(p, KTR_GENIO)) {
194 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
195 		memcpy(ktriov, auio.uio_iov, iovlen);
196 	}
197 #endif
198 	cnt = auio.uio_resid;
199 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
200 	if (error)
201 		if (auio.uio_resid != cnt && (error == ERESTART ||
202 		    error == EINTR || error == EWOULDBLOCK))
203 			error = 0;
204 	cnt -= auio.uio_resid;
205 
206 	mtx_enter(&fp->f_mtx);
207 	fp->f_rxfer++;
208 	fp->f_rbytes += cnt;
209 	mtx_leave(&fp->f_mtx);
210 #ifdef KTRACE
211 	if (ktriov != NULL) {
212 		if (error == 0)
213 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
214 		free(ktriov, M_TEMP, iovlen);
215 	}
216 #endif
217 	*retval = cnt;
218  done:
219 	if (needfree)
220 		free(needfree, M_IOV, iovlen);
221  out:
222 	FRELE(fp, p);
223 	return (error);
224 }
225 
226 /*
227  * Write system call
228  */
229 int
230 sys_write(struct proc *p, void *v, register_t *retval)
231 {
232 	struct sys_write_args /* {
233 		syscallarg(int) fd;
234 		syscallarg(const void *) buf;
235 		syscallarg(size_t) nbyte;
236 	} */ *uap = v;
237 	struct iovec iov;
238 	int fd = SCARG(uap, fd);
239 	struct file *fp;
240 	struct filedesc *fdp = p->p_fd;
241 
242 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
243 		return (EBADF);
244 
245 	iov.iov_base = (void *)SCARG(uap, buf);
246 	iov.iov_len = SCARG(uap, nbyte);
247 
248 	/* dofilewritev() will FRELE the descriptor for us */
249 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
250 }
251 
252 /*
253  * Gather write system call
254  */
255 int
256 sys_writev(struct proc *p, void *v, register_t *retval)
257 {
258 	struct sys_writev_args /* {
259 		syscallarg(int) fd;
260 		syscallarg(const struct iovec *) iovp;
261 		syscallarg(int) iovcnt;
262 	} */ *uap = v;
263 	int fd = SCARG(uap, fd);
264 	struct file *fp;
265 	struct filedesc *fdp = p->p_fd;
266 
267 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
268 		return (EBADF);
269 
270 	/* dofilewritev() will FRELE the descriptor for us */
271 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
272 	    &fp->f_offset, retval));
273 }
274 
275 int
276 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
277     int iovcnt, int userspace, off_t *offset, register_t *retval)
278 {
279 	struct iovec aiov[UIO_SMALLIOV];
280 	struct uio auio;
281 	struct iovec *iov;
282 	struct iovec *needfree = NULL;
283 	long i, cnt, error = 0;
284 	u_int iovlen;
285 #ifdef KTRACE
286 	struct iovec *ktriov = NULL;
287 #endif
288 
289 	/* note: can't use iovlen until iovcnt is validated */
290 	iovlen = iovcnt * sizeof(struct iovec);
291 
292 	/*
293 	 * If the iovec array exists in userspace, it needs to be copied in;
294 	 * otherwise, it can be used directly.
295 	 */
296 	if (userspace) {
297 		if ((u_int)iovcnt > UIO_SMALLIOV) {
298 			if ((u_int)iovcnt > IOV_MAX) {
299 				error = EINVAL;
300 				goto out;
301 			}
302 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
303 		} else if ((u_int)iovcnt > 0) {
304 			iov = aiov;
305 			needfree = NULL;
306 		} else {
307 			error = EINVAL;
308 			goto out;
309 		}
310 		if ((error = copyin(iovp, iov, iovlen)))
311 			goto done;
312 #ifdef KTRACE
313 		if (KTRPOINT(p, KTR_STRUCT))
314 			ktriovec(p, iov, iovcnt);
315 #endif
316 	} else {
317 		iov = (struct iovec *)iovp;		/* de-constify */
318 	}
319 
320 	auio.uio_iov = iov;
321 	auio.uio_iovcnt = iovcnt;
322 	auio.uio_rw = UIO_WRITE;
323 	auio.uio_segflg = UIO_USERSPACE;
324 	auio.uio_procp = p;
325 	auio.uio_resid = 0;
326 	for (i = 0; i < iovcnt; i++) {
327 		auio.uio_resid += iov->iov_len;
328 		/*
329 		 * Writes return ssize_t because -1 is returned on error.
330 		 * Therefore we must restrict the length to SSIZE_MAX to
331 		 * avoid garbage return values.  Note that the addition is
332 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
333 		 */
334 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
335 			error = EINVAL;
336 			goto done;
337 		}
338 		iov++;
339 	}
340 #ifdef KTRACE
341 	/*
342 	 * if tracing, save a copy of iovec
343 	 */
344 	if (KTRPOINT(p, KTR_GENIO)) {
345 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
346 		memcpy(ktriov, auio.uio_iov, iovlen);
347 	}
348 #endif
349 	cnt = auio.uio_resid;
350 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
351 	if (error) {
352 		if (auio.uio_resid != cnt && (error == ERESTART ||
353 		    error == EINTR || error == EWOULDBLOCK))
354 			error = 0;
355 		if (error == EPIPE)
356 			ptsignal(p, SIGPIPE, STHREAD);
357 	}
358 	cnt -= auio.uio_resid;
359 
360 	mtx_enter(&fp->f_mtx);
361 	fp->f_wxfer++;
362 	fp->f_wbytes += cnt;
363 	mtx_leave(&fp->f_mtx);
364 #ifdef KTRACE
365 	if (ktriov != NULL) {
366 		if (error == 0)
367 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
368 		free(ktriov, M_TEMP, iovlen);
369 	}
370 #endif
371 	*retval = cnt;
372  done:
373 	if (needfree)
374 		free(needfree, M_IOV, iovlen);
375  out:
376 	FRELE(fp, p);
377 	return (error);
378 }
379 
380 /*
381  * Ioctl system call
382  */
383 int
384 sys_ioctl(struct proc *p, void *v, register_t *retval)
385 {
386 	struct sys_ioctl_args /* {
387 		syscallarg(int) fd;
388 		syscallarg(u_long) com;
389 		syscallarg(void *) data;
390 	} */ *uap = v;
391 	struct file *fp;
392 	struct filedesc *fdp;
393 	u_long com = SCARG(uap, com);
394 	int error = 0;
395 	u_int size;
396 	caddr_t data, memp = NULL;
397 	int tmp;
398 #define STK_PARAMS	128
399 	long long stkbuf[STK_PARAMS / sizeof(long long)];
400 
401 	fdp = p->p_fd;
402 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
403 		return (EBADF);
404 
405 	if (fp->f_type == DTYPE_SOCKET) {
406 		struct socket *so = fp->f_data;
407 
408 		if (so->so_state & SS_DNS) {
409 			error = EINVAL;
410 			goto out;
411 		}
412 	}
413 
414 	error = pledge_ioctl(p, com, fp);
415 	if (error)
416 		goto out;
417 
418 	switch (com) {
419 	case FIONCLEX:
420 	case FIOCLEX:
421 		fdplock(fdp);
422 		if (com == FIONCLEX)
423 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
424 		else
425 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
426 		fdpunlock(fdp);
427 		goto out;
428 	}
429 
430 	/*
431 	 * Interpret high order word to find amount of data to be
432 	 * copied to/from the user's address space.
433 	 */
434 	size = IOCPARM_LEN(com);
435 	if (size > IOCPARM_MAX) {
436 		error = ENOTTY;
437 		goto out;
438 	}
439 	if (size > sizeof (stkbuf)) {
440 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
441 		data = memp;
442 	} else
443 		data = (caddr_t)stkbuf;
444 	if (com&IOC_IN) {
445 		if (size) {
446 			error = copyin(SCARG(uap, data), data, size);
447 			if (error) {
448 				goto out;
449 			}
450 		} else
451 			*(caddr_t *)data = SCARG(uap, data);
452 	} else if ((com&IOC_OUT) && size)
453 		/*
454 		 * Zero the buffer so the user always
455 		 * gets back something deterministic.
456 		 */
457 		memset(data, 0, size);
458 	else if (com&IOC_VOID)
459 		*(caddr_t *)data = SCARG(uap, data);
460 
461 	switch (com) {
462 
463 	case FIONBIO:
464 		if ((tmp = *(int *)data) != 0)
465 			fp->f_flag |= FNONBLOCK;
466 		else
467 			fp->f_flag &= ~FNONBLOCK;
468 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
469 		break;
470 
471 	case FIOASYNC:
472 		if ((tmp = *(int *)data) != 0)
473 			fp->f_flag |= FASYNC;
474 		else
475 			fp->f_flag &= ~FASYNC;
476 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
477 		break;
478 
479 	case FIOSETOWN:
480 		tmp = *(int *)data;
481 		if (fp->f_type == DTYPE_SOCKET) {
482 			struct socket *so = fp->f_data;
483 
484 			so->so_pgid = tmp;
485 			so->so_siguid = p->p_ucred->cr_ruid;
486 			so->so_sigeuid = p->p_ucred->cr_uid;
487 			error = 0;
488 			break;
489 		}
490 		if (tmp <= 0) {
491 			tmp = -tmp;
492 		} else {
493 			struct process *pr = prfind(tmp);
494 			if (pr == NULL) {
495 				error = ESRCH;
496 				break;
497 			}
498 			tmp = pr->ps_pgrp->pg_id;
499 		}
500 		error = (*fp->f_ops->fo_ioctl)
501 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
502 		break;
503 
504 	case FIOGETOWN:
505 		if (fp->f_type == DTYPE_SOCKET) {
506 			error = 0;
507 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
508 			break;
509 		}
510 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
511 		*(int *)data = -*(int *)data;
512 		break;
513 
514 	default:
515 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
516 		break;
517 	}
518 	/*
519 	 * Copy any data to user, size was
520 	 * already set and checked above.
521 	 */
522 	if (error == 0 && (com&IOC_OUT) && size)
523 		error = copyout(data, SCARG(uap, data), size);
524 out:
525 	FRELE(fp, p);
526 	free(memp, M_IOCTLOPS, size);
527 	return (error);
528 }
529 
/*
 * selwait:  its address is the sleep channel that dopselect() and
 *           doppoll() tsleep() on and that selwakeup() wakes.
 * nselcoll: incremented by selwakeup() whenever it clears SI_COLL; the
 *           scanners compare it against a snapshot to decide on a rescan.
 */
int	selwait;
int	nselcoll;
531 
532 /*
533  * Select system call.
534  */
535 int
536 sys_select(struct proc *p, void *v, register_t *retval)
537 {
538 	struct sys_select_args /* {
539 		syscallarg(int) nd;
540 		syscallarg(fd_set *) in;
541 		syscallarg(fd_set *) ou;
542 		syscallarg(fd_set *) ex;
543 		syscallarg(struct timeval *) tv;
544 	} */ *uap = v;
545 
546 	struct timespec ts, *tsp = NULL;
547 	int error;
548 
549 	if (SCARG(uap, tv) != NULL) {
550 		struct timeval tv;
551 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
552 			return (error);
553 		if ((error = itimerfix(&tv)) != 0)
554 			return (error);
555 #ifdef KTRACE
556 		if (KTRPOINT(p, KTR_STRUCT))
557 			ktrreltimeval(p, &tv);
558 #endif
559 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
560 		tsp = &ts;
561 	}
562 
563 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
564 	    SCARG(uap, ex), tsp, NULL, retval));
565 }
566 
567 int
568 sys_pselect(struct proc *p, void *v, register_t *retval)
569 {
570 	struct sys_pselect_args /* {
571 		syscallarg(int) nd;
572 		syscallarg(fd_set *) in;
573 		syscallarg(fd_set *) ou;
574 		syscallarg(fd_set *) ex;
575 		syscallarg(const struct timespec *) ts;
576 		syscallarg(const sigset_t *) mask;
577 	} */ *uap = v;
578 
579 	struct timespec ts, *tsp = NULL;
580 	sigset_t ss, *ssp = NULL;
581 	int error;
582 
583 	if (SCARG(uap, ts) != NULL) {
584 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
585 			return (error);
586 		if ((error = timespecfix(&ts)) != 0)
587 			return (error);
588 #ifdef KTRACE
589 		if (KTRPOINT(p, KTR_STRUCT))
590 			ktrreltimespec(p, &ts);
591 #endif
592 		tsp = &ts;
593 	}
594 	if (SCARG(uap, mask) != NULL) {
595 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
596 			return (error);
597 		ssp = &ss;
598 	}
599 
600 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
601 	    SCARG(uap, ex), tsp, ssp, retval));
602 }
603 
604 int
605 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
606     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
607 {
608 	fd_mask bits[6];
609 	fd_set *pibits[3], *pobits[3];
610 	struct timespec ats, rts, tts;
611 	int s, ncoll, error = 0, timo;
612 	u_int ni;
613 
614 	if (nd < 0)
615 		return (EINVAL);
616 	if (nd > p->p_fd->fd_nfiles) {
617 		/* forgiving; slightly wrong */
618 		nd = p->p_fd->fd_nfiles;
619 	}
620 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
621 	if (ni > sizeof(bits[0])) {
622 		caddr_t mbits;
623 
624 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
625 		pibits[0] = (fd_set *)&mbits[ni * 0];
626 		pibits[1] = (fd_set *)&mbits[ni * 1];
627 		pibits[2] = (fd_set *)&mbits[ni * 2];
628 		pobits[0] = (fd_set *)&mbits[ni * 3];
629 		pobits[1] = (fd_set *)&mbits[ni * 4];
630 		pobits[2] = (fd_set *)&mbits[ni * 5];
631 	} else {
632 		memset(bits, 0, sizeof(bits));
633 		pibits[0] = (fd_set *)&bits[0];
634 		pibits[1] = (fd_set *)&bits[1];
635 		pibits[2] = (fd_set *)&bits[2];
636 		pobits[0] = (fd_set *)&bits[3];
637 		pobits[1] = (fd_set *)&bits[4];
638 		pobits[2] = (fd_set *)&bits[5];
639 	}
640 
641 #define	getbits(name, x) \
642 	if (name && (error = copyin(name, pibits[x], ni))) \
643 		goto done;
644 	getbits(in, 0);
645 	getbits(ou, 1);
646 	getbits(ex, 2);
647 #undef	getbits
648 #ifdef KTRACE
649 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
650 		if (in) ktrfdset(p, pibits[0], ni);
651 		if (ou) ktrfdset(p, pibits[1], ni);
652 		if (ex) ktrfdset(p, pibits[2], ni);
653 	}
654 #endif
655 
656 	if (tsp) {
657 		getnanouptime(&rts);
658 		timespecadd(tsp, &rts, &ats);
659 	} else {
660 		ats.tv_sec = 0;
661 		ats.tv_nsec = 0;
662 	}
663 	timo = 0;
664 
665 	if (sigmask)
666 		dosigsuspend(p, *sigmask &~ sigcantmask);
667 
668 retry:
669 	ncoll = nselcoll;
670 	atomic_setbits_int(&p->p_flag, P_SELECT);
671 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
672 	if (error || *retval)
673 		goto done;
674 	if (tsp) {
675 		getnanouptime(&rts);
676 		if (timespeccmp(&rts, &ats, >=))
677 			goto done;
678 		timespecsub(&ats, &rts, &tts);
679 		timo = tts.tv_sec > 24 * 60 * 60 ?
680 			24 * 60 * 60 * hz : tstohz(&tts);
681 	}
682 	s = splhigh();
683 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
684 		splx(s);
685 		goto retry;
686 	}
687 	atomic_clearbits_int(&p->p_flag, P_SELECT);
688 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
689 	splx(s);
690 	if (error == 0)
691 		goto retry;
692 done:
693 	atomic_clearbits_int(&p->p_flag, P_SELECT);
694 	/* select is not restarted after signals... */
695 	if (error == ERESTART)
696 		error = EINTR;
697 	if (error == EWOULDBLOCK)
698 		error = 0;
699 #define	putbits(name, x) \
700 	if (name && (error2 = copyout(pobits[x], name, ni))) \
701 		error = error2;
702 	if (error == 0) {
703 		int error2;
704 
705 		putbits(in, 0);
706 		putbits(ou, 1);
707 		putbits(ex, 2);
708 #undef putbits
709 #ifdef KTRACE
710 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
711 			if (in) ktrfdset(p, pobits[0], ni);
712 			if (ou) ktrfdset(p, pobits[1], ni);
713 			if (ex) ktrfdset(p, pobits[2], ni);
714 		}
715 #endif
716 	}
717 
718 	if (pibits[0] != (fd_set *)&bits[0])
719 		free(pibits[0], M_TEMP, 6 * ni);
720 	return (error);
721 }
722 
723 int
724 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
725     register_t *retval)
726 {
727 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
728 	struct filedesc *fdp = p->p_fd;
729 	int msk, i, j, fd;
730 	fd_mask bits;
731 	struct file *fp;
732 	int n = 0;
733 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
734 
735 	for (msk = 0; msk < 3; msk++) {
736 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
737 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
738 
739 		for (i = 0; i < nfd; i += NFDBITS) {
740 			bits = pibits->fds_bits[i/NFDBITS];
741 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
742 				bits &= ~(1 << j);
743 				if ((fp = fd_getfile(fdp, fd)) == NULL)
744 					return (EBADF);
745 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
746 					FD_SET(fd, pobits);
747 					n++;
748 				}
749 				FRELE(fp, p);
750 			}
751 		}
752 	}
753 	*retval = n;
754 	return (0);
755 }
756 
/*
 * Poll routine that reports every requested normal read/write event as
 * ready.  dev and p are ignored.
 */
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & ready);
}
763 
/*
 * Poll routine that never reports any event as ready.  All arguments
 * are ignored.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	return (0);
}
770 
771 /*
772  * Record a select request.
773  */
774 void
775 selrecord(struct proc *selector, struct selinfo *sip)
776 {
777 	struct proc *p;
778 	pid_t mytid;
779 
780 	mytid = selector->p_tid;
781 	if (sip->si_seltid == mytid)
782 		return;
783 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
784 	    p->p_wchan == (caddr_t)&selwait)
785 		sip->si_flags |= SI_COLL;
786 	else
787 		sip->si_seltid = mytid;
788 }
789 
790 /*
791  * Do a wakeup when a selectable event occurs.
792  */
793 void
794 selwakeup(struct selinfo *sip)
795 {
796 	struct proc *p;
797 	int s;
798 
799 	KNOTE(&sip->si_note, NOTE_SUBMIT);
800 	if (sip->si_seltid == 0)
801 		return;
802 	if (sip->si_flags & SI_COLL) {
803 		nselcoll++;
804 		sip->si_flags &= ~SI_COLL;
805 		wakeup(&selwait);
806 	}
807 	p = tfind(sip->si_seltid);
808 	sip->si_seltid = 0;
809 	if (p != NULL) {
810 		SCHED_LOCK(s);
811 		if (p->p_wchan == (caddr_t)&selwait) {
812 			if (p->p_stat == SSLEEP)
813 				setrunnable(p);
814 			else
815 				unsleep(p);
816 		} else if (p->p_flag & P_SELECT)
817 			atomic_clearbits_int(&p->p_flag, P_SELECT);
818 		SCHED_UNLOCK(s);
819 	}
820 }
821 
822 void
823 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
824 {
825 	struct filedesc *fdp = p->p_fd;
826 	struct file *fp;
827 	u_int i;
828 	int n = 0;
829 
830 	for (i = 0; i < nfd; i++, pl++) {
831 		/* Check the file descriptor. */
832 		if (pl->fd < 0) {
833 			pl->revents = 0;
834 			continue;
835 		}
836 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
837 			pl->revents = POLLNVAL;
838 			n++;
839 			continue;
840 		}
841 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
842 		FRELE(fp, p);
843 		if (pl->revents != 0)
844 			n++;
845 	}
846 	*retval = n;
847 }
848 
849 /*
850  * Only copyout the revents field.
851  */
852 int
853 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
854 {
855 	int error = 0;
856 	u_int i = 0;
857 
858 	while (!error && i++ < nfds) {
859 		error = copyout(&pl->revents, &upl->revents,
860 		    sizeof(upl->revents));
861 		pl++;
862 		upl++;
863 	}
864 
865 	return (error);
866 }
867 
868 /*
869  * We are using the same mechanism as select only we encode/decode args
870  * differently.
871  */
872 int
873 sys_poll(struct proc *p, void *v, register_t *retval)
874 {
875 	struct sys_poll_args /* {
876 		syscallarg(struct pollfd *) fds;
877 		syscallarg(u_int) nfds;
878 		syscallarg(int) timeout;
879 	} */ *uap = v;
880 
881 	struct timespec ts, *tsp = NULL;
882 	int msec = SCARG(uap, timeout);
883 
884 	if (msec != INFTIM) {
885 		if (msec < 0)
886 			return (EINVAL);
887 		ts.tv_sec = msec / 1000;
888 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
889 		tsp = &ts;
890 	}
891 
892 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
893 	    retval));
894 }
895 
896 int
897 sys_ppoll(struct proc *p, void *v, register_t *retval)
898 {
899 	struct sys_ppoll_args /* {
900 		syscallarg(struct pollfd *) fds;
901 		syscallarg(u_int) nfds;
902 		syscallarg(const struct timespec *) ts;
903 		syscallarg(const sigset_t *) mask;
904 	} */ *uap = v;
905 
906 	int error;
907 	struct timespec ts, *tsp = NULL;
908 	sigset_t ss, *ssp = NULL;
909 
910 	if (SCARG(uap, ts) != NULL) {
911 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
912 			return (error);
913 		if ((error = timespecfix(&ts)) != 0)
914 			return (error);
915 #ifdef KTRACE
916 		if (KTRPOINT(p, KTR_STRUCT))
917 			ktrreltimespec(p, &ts);
918 #endif
919 		tsp = &ts;
920 	}
921 
922 	if (SCARG(uap, mask) != NULL) {
923 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
924 			return (error);
925 		ssp = &ss;
926 	}
927 
928 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
929 	    retval));
930 }
931 
932 int
933 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
934     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
935 {
936 	size_t sz;
937 	struct pollfd pfds[4], *pl = pfds;
938 	struct timespec ats, rts, tts;
939 	int timo, ncoll, i, s, error;
940 
941 	/* Standards say no more than MAX_OPEN; this is possibly better. */
942 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
943 		return (EINVAL);
944 
945 	/* optimize for the default case, of a small nfds value */
946 	if (nfds > nitems(pfds)) {
947 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
948 		    M_WAITOK | M_CANFAIL);
949 		if (pl == NULL)
950 			return (EINVAL);
951 	}
952 
953 	sz = nfds * sizeof(*pl);
954 
955 	if ((error = copyin(fds, pl, sz)) != 0)
956 		goto bad;
957 
958 	for (i = 0; i < nfds; i++) {
959 		pl[i].events &= ~POLL_NOHUP;
960 		pl[i].revents = 0;
961 	}
962 
963 	if (tsp != NULL) {
964 		getnanouptime(&rts);
965 		timespecadd(tsp, &rts, &ats);
966 	} else {
967 		ats.tv_sec = 0;
968 		ats.tv_nsec = 0;
969 	}
970 	timo = 0;
971 
972 	if (sigmask)
973 		dosigsuspend(p, *sigmask &~ sigcantmask);
974 
975 retry:
976 	ncoll = nselcoll;
977 	atomic_setbits_int(&p->p_flag, P_SELECT);
978 	pollscan(p, pl, nfds, retval);
979 	if (*retval)
980 		goto done;
981 	if (tsp != NULL) {
982 		getnanouptime(&rts);
983 		if (timespeccmp(&rts, &ats, >=))
984 			goto done;
985 		timespecsub(&ats, &rts, &tts);
986 		timo = tts.tv_sec > 24 * 60 * 60 ?
987 			24 * 60 * 60 * hz : tstohz(&tts);
988 	}
989 	s = splhigh();
990 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
991 		splx(s);
992 		goto retry;
993 	}
994 	atomic_clearbits_int(&p->p_flag, P_SELECT);
995 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
996 	splx(s);
997 	if (error == 0)
998 		goto retry;
999 
1000 done:
1001 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1002 	/*
1003 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1004 	 *       ignored (since the whole point is to see what would block).
1005 	 */
1006 	switch (error) {
1007 	case ERESTART:
1008 		error = pollout(pl, fds, nfds);
1009 		if (error == 0)
1010 			error = EINTR;
1011 		break;
1012 	case EWOULDBLOCK:
1013 	case 0:
1014 		error = pollout(pl, fds, nfds);
1015 		break;
1016 	}
1017 #ifdef KTRACE
1018 	if (KTRPOINT(p, KTR_STRUCT))
1019 		ktrpollfd(p, pl, nfds);
1020 #endif /* KTRACE */
1021 bad:
1022 	if (pl != pfds)
1023 		free(pl, M_TEMP, sz);
1024 	return (error);
1025 }
1026 
1027 /*
1028  * utrace system call
1029  */
1030 int
1031 sys_utrace(struct proc *curp, void *v, register_t *retval)
1032 {
1033 #ifdef KTRACE
1034 	struct sys_utrace_args /* {
1035 		syscallarg(const char *) label;
1036 		syscallarg(const void *) addr;
1037 		syscallarg(size_t) len;
1038 	} */ *uap = v;
1039 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1040 	    SCARG(uap, len)));
1041 #else
1042 	return (0);
1043 #endif
1044 }
1045