xref: /openbsd-src/sys/kern/sys_generic.c (revision fc68b3f3761b143a31dcbbb0931b9bcaca15d069)
1 /*	$OpenBSD: sys_generic.c,v 1.112 2016/07/05 00:35:09 tedu Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 #include <sys/pledge.h>
60 
61 #include <sys/mount.h>
62 #include <sys/syscallargs.h>
63 
64 #include <uvm/uvm_extern.h>
65 
66 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
67 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
68 int pollout(struct pollfd *, struct pollfd *, u_int);
69 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
70     const struct timespec *, const sigset_t *, register_t *);
71 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
72     const sigset_t *, register_t *);
73 
74 /*
75  * Read system call.
76  */
77 int
78 sys_read(struct proc *p, void *v, register_t *retval)
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	struct iovec iov;
86 	int fd = SCARG(uap, fd);
87 	struct file *fp;
88 	struct filedesc *fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
91 		return (EBADF);
92 
93 	iov.iov_base = SCARG(uap, buf);
94 	iov.iov_len = SCARG(uap, nbyte);
95 
96 	FREF(fp);
97 
98 	/* dofilereadv() will FRELE the descriptor for us */
99 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
100 }
101 
102 /*
103  * Scatter read system call.
104  */
105 int
106 sys_readv(struct proc *p, void *v, register_t *retval)
107 {
108 	struct sys_readv_args /* {
109 		syscallarg(int) fd;
110 		syscallarg(const struct iovec *) iovp;
111 		syscallarg(int) iovcnt;
112 	} */ *uap = v;
113 	int fd = SCARG(uap, fd);
114 	struct file *fp;
115 	struct filedesc *fdp = p->p_fd;
116 
117 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
118 		return (EBADF);
119 	FREF(fp);
120 
121 	/* dofilereadv() will FRELE the descriptor for us */
122 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
123 	    &fp->f_offset, retval));
124 }
125 
126 int
127 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
128     int iovcnt, int userspace, off_t *offset, register_t *retval)
129 {
130 	struct iovec aiov[UIO_SMALLIOV];
131 	struct uio auio;
132 	struct iovec *iov;
133 	struct iovec *needfree = NULL;
134 	long i, cnt, error = 0;
135 	u_int iovlen;
136 #ifdef KTRACE
137 	struct iovec *ktriov = NULL;
138 #endif
139 
140 	/* note: can't use iovlen until iovcnt is validated */
141 	iovlen = iovcnt * sizeof(struct iovec);
142 
143 	/*
144 	 * If the iovec array exists in userspace, it needs to be copied in;
145 	 * otherwise, it can be used directly.
146 	 */
147 	if (userspace) {
148 		if ((u_int)iovcnt > UIO_SMALLIOV) {
149 			if ((u_int)iovcnt > IOV_MAX) {
150 				error = EINVAL;
151 				goto out;
152 			}
153 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
154 		} else if ((u_int)iovcnt > 0) {
155 			iov = aiov;
156 			needfree = NULL;
157 		} else {
158 			error = EINVAL;
159 			goto out;
160 		}
161 		if ((error = copyin(iovp, iov, iovlen)))
162 			goto done;
163 #ifdef KTRACE
164 		if (KTRPOINT(p, KTR_STRUCT))
165 			ktriovec(p, iov, iovcnt);
166 #endif
167 	} else {
168 		iov = (struct iovec *)iovp;		/* de-constify */
169 	}
170 
171 	auio.uio_iov = iov;
172 	auio.uio_iovcnt = iovcnt;
173 	auio.uio_rw = UIO_READ;
174 	auio.uio_segflg = UIO_USERSPACE;
175 	auio.uio_procp = p;
176 	auio.uio_resid = 0;
177 	for (i = 0; i < iovcnt; i++) {
178 		auio.uio_resid += iov->iov_len;
179 		/*
180 		 * Reads return ssize_t because -1 is returned on error.
181 		 * Therefore we must restrict the length to SSIZE_MAX to
182 		 * avoid garbage return values.  Note that the addition is
183 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
184 		 */
185 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
186 			error = EINVAL;
187 			goto done;
188 		}
189 		iov++;
190 	}
191 #ifdef KTRACE
192 	/*
193 	 * if tracing, save a copy of iovec
194 	 */
195 	if (KTRPOINT(p, KTR_GENIO)) {
196 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
197 		memcpy(ktriov, auio.uio_iov, iovlen);
198 	}
199 #endif
200 	cnt = auio.uio_resid;
201 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
202 	if (error)
203 		if (auio.uio_resid != cnt && (error == ERESTART ||
204 		    error == EINTR || error == EWOULDBLOCK))
205 			error = 0;
206 	cnt -= auio.uio_resid;
207 
208 	fp->f_rxfer++;
209 	fp->f_rbytes += cnt;
210 #ifdef KTRACE
211 	if (ktriov != NULL) {
212 		if (error == 0)
213 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
214 		free(ktriov, M_TEMP, iovlen);
215 	}
216 #endif
217 	*retval = cnt;
218  done:
219 	if (needfree)
220 		free(needfree, M_IOV, iovlen);
221  out:
222 	FRELE(fp, p);
223 	return (error);
224 }
225 
226 /*
227  * Write system call
228  */
229 int
230 sys_write(struct proc *p, void *v, register_t *retval)
231 {
232 	struct sys_write_args /* {
233 		syscallarg(int) fd;
234 		syscallarg(const void *) buf;
235 		syscallarg(size_t) nbyte;
236 	} */ *uap = v;
237 	struct iovec iov;
238 	int fd = SCARG(uap, fd);
239 	struct file *fp;
240 	struct filedesc *fdp = p->p_fd;
241 
242 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
243 		return (EBADF);
244 
245 	iov.iov_base = (void *)SCARG(uap, buf);
246 	iov.iov_len = SCARG(uap, nbyte);
247 
248 	FREF(fp);
249 
250 	/* dofilewritev() will FRELE the descriptor for us */
251 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
252 }
253 
254 /*
255  * Gather write system call
256  */
257 int
258 sys_writev(struct proc *p, void *v, register_t *retval)
259 {
260 	struct sys_writev_args /* {
261 		syscallarg(int) fd;
262 		syscallarg(const struct iovec *) iovp;
263 		syscallarg(int) iovcnt;
264 	} */ *uap = v;
265 	int fd = SCARG(uap, fd);
266 	struct file *fp;
267 	struct filedesc *fdp = p->p_fd;
268 
269 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
270 		return (EBADF);
271 	FREF(fp);
272 
273 	/* dofilewritev() will FRELE the descriptor for us */
274 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
275 	    &fp->f_offset, retval));
276 }
277 
278 int
279 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
280     int iovcnt, int userspace, off_t *offset, register_t *retval)
281 {
282 	struct iovec aiov[UIO_SMALLIOV];
283 	struct uio auio;
284 	struct iovec *iov;
285 	struct iovec *needfree = NULL;
286 	long i, cnt, error = 0;
287 	u_int iovlen;
288 #ifdef KTRACE
289 	struct iovec *ktriov = NULL;
290 #endif
291 
292 	/* note: can't use iovlen until iovcnt is validated */
293 	iovlen = iovcnt * sizeof(struct iovec);
294 
295 	/*
296 	 * If the iovec array exists in userspace, it needs to be copied in;
297 	 * otherwise, it can be used directly.
298 	 */
299 	if (userspace) {
300 		if ((u_int)iovcnt > UIO_SMALLIOV) {
301 			if ((u_int)iovcnt > IOV_MAX) {
302 				error = EINVAL;
303 				goto out;
304 			}
305 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
306 		} else if ((u_int)iovcnt > 0) {
307 			iov = aiov;
308 			needfree = NULL;
309 		} else {
310 			error = EINVAL;
311 			goto out;
312 		}
313 		if ((error = copyin(iovp, iov, iovlen)))
314 			goto done;
315 #ifdef KTRACE
316 		if (KTRPOINT(p, KTR_STRUCT))
317 			ktriovec(p, iov, iovcnt);
318 #endif
319 	} else {
320 		iov = (struct iovec *)iovp;		/* de-constify */
321 	}
322 
323 	auio.uio_iov = iov;
324 	auio.uio_iovcnt = iovcnt;
325 	auio.uio_rw = UIO_WRITE;
326 	auio.uio_segflg = UIO_USERSPACE;
327 	auio.uio_procp = p;
328 	auio.uio_resid = 0;
329 	for (i = 0; i < iovcnt; i++) {
330 		auio.uio_resid += iov->iov_len;
331 		/*
332 		 * Writes return ssize_t because -1 is returned on error.
333 		 * Therefore we must restrict the length to SSIZE_MAX to
334 		 * avoid garbage return values.  Note that the addition is
335 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
336 		 */
337 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
338 			error = EINVAL;
339 			goto done;
340 		}
341 		iov++;
342 	}
343 #ifdef KTRACE
344 	/*
345 	 * if tracing, save a copy of iovec
346 	 */
347 	if (KTRPOINT(p, KTR_GENIO)) {
348 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
349 		memcpy(ktriov, auio.uio_iov, iovlen);
350 	}
351 #endif
352 	cnt = auio.uio_resid;
353 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
354 	if (error) {
355 		if (auio.uio_resid != cnt && (error == ERESTART ||
356 		    error == EINTR || error == EWOULDBLOCK))
357 			error = 0;
358 		if (error == EPIPE)
359 			ptsignal(p, SIGPIPE, STHREAD);
360 	}
361 	cnt -= auio.uio_resid;
362 
363 	fp->f_wxfer++;
364 	fp->f_wbytes += cnt;
365 #ifdef KTRACE
366 	if (ktriov != NULL) {
367 		if (error == 0)
368 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
369 		free(ktriov, M_TEMP, iovlen);
370 	}
371 #endif
372 	*retval = cnt;
373  done:
374 	if (needfree)
375 		free(needfree, M_IOV, iovlen);
376  out:
377 	FRELE(fp, p);
378 	return (error);
379 }
380 
381 /*
382  * Ioctl system call
383  */
384 int
385 sys_ioctl(struct proc *p, void *v, register_t *retval)
386 {
387 	struct sys_ioctl_args /* {
388 		syscallarg(int) fd;
389 		syscallarg(u_long) com;
390 		syscallarg(void *) data;
391 	} */ *uap = v;
392 	struct file *fp;
393 	struct filedesc *fdp;
394 	u_long com = SCARG(uap, com);
395 	int error;
396 	u_int size;
397 	caddr_t data, memp;
398 	int tmp;
399 #define STK_PARAMS	128
400 	long long stkbuf[STK_PARAMS / sizeof(long long)];
401 
402 	fdp = p->p_fd;
403 	fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE);
404 
405 	if (fp == NULL)
406 		return (EBADF);
407 
408 	if (fp->f_type == DTYPE_SOCKET) {
409 		struct socket *so = fp->f_data;
410 
411 		if (so->so_state & SS_DNS)
412 			return (EINVAL);
413 	}
414 
415 	error = pledge_ioctl(p, com, fp);
416 	if (error)
417 		return (error);
418 
419 	switch (com) {
420 	case FIONCLEX:
421 	case FIOCLEX:
422 		fdplock(fdp);
423 		if (com == FIONCLEX)
424 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
425 		else
426 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
427 		fdpunlock(fdp);
428 		return (0);
429 	}
430 
431 	/*
432 	 * Interpret high order word to find amount of data to be
433 	 * copied to/from the user's address space.
434 	 */
435 	size = IOCPARM_LEN(com);
436 	if (size > IOCPARM_MAX)
437 		return (ENOTTY);
438 	FREF(fp);
439 	memp = NULL;
440 	if (size > sizeof (stkbuf)) {
441 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
442 		data = memp;
443 	} else
444 		data = (caddr_t)stkbuf;
445 	if (com&IOC_IN) {
446 		if (size) {
447 			error = copyin(SCARG(uap, data), data, size);
448 			if (error) {
449 				goto out;
450 			}
451 		} else
452 			*(caddr_t *)data = SCARG(uap, data);
453 	} else if ((com&IOC_OUT) && size)
454 		/*
455 		 * Zero the buffer so the user always
456 		 * gets back something deterministic.
457 		 */
458 		memset(data, 0, size);
459 	else if (com&IOC_VOID)
460 		*(caddr_t *)data = SCARG(uap, data);
461 
462 	switch (com) {
463 
464 	case FIONBIO:
465 		if ((tmp = *(int *)data) != 0)
466 			fp->f_flag |= FNONBLOCK;
467 		else
468 			fp->f_flag &= ~FNONBLOCK;
469 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
470 		break;
471 
472 	case FIOASYNC:
473 		if ((tmp = *(int *)data) != 0)
474 			fp->f_flag |= FASYNC;
475 		else
476 			fp->f_flag &= ~FASYNC;
477 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
478 		break;
479 
480 	case FIOSETOWN:
481 		tmp = *(int *)data;
482 		if (fp->f_type == DTYPE_SOCKET) {
483 			struct socket *so = fp->f_data;
484 
485 			so->so_pgid = tmp;
486 			so->so_siguid = p->p_ucred->cr_ruid;
487 			so->so_sigeuid = p->p_ucred->cr_uid;
488 			error = 0;
489 			break;
490 		}
491 		if (tmp <= 0) {
492 			tmp = -tmp;
493 		} else {
494 			struct process *pr = prfind(tmp);
495 			if (pr == NULL) {
496 				error = ESRCH;
497 				break;
498 			}
499 			tmp = pr->ps_pgrp->pg_id;
500 		}
501 		error = (*fp->f_ops->fo_ioctl)
502 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
503 		break;
504 
505 	case FIOGETOWN:
506 		if (fp->f_type == DTYPE_SOCKET) {
507 			error = 0;
508 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
509 			break;
510 		}
511 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
512 		*(int *)data = -*(int *)data;
513 		break;
514 
515 	default:
516 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
517 		break;
518 	}
519 	/*
520 	 * Copy any data to user, size was
521 	 * already set and checked above.
522 	 */
523 	if (error == 0 && (com&IOC_OUT) && size)
524 		error = copyout(data, SCARG(uap, data), size);
525 out:
526 	FRELE(fp, p);
527 	if (memp)
528 		free(memp, M_IOCTLOPS, size);
529 	return (error);
530 }
531 
/* Sleep/wakeup channel used by dopselect(), doppoll() and selwakeup(). */
int	selwait;
/* Bumped by selwakeup() each time it finds SI_COLL set on a selinfo. */
int	nselcoll;
533 
534 /*
535  * Select system call.
536  */
537 int
538 sys_select(struct proc *p, void *v, register_t *retval)
539 {
540 	struct sys_select_args /* {
541 		syscallarg(int) nd;
542 		syscallarg(fd_set *) in;
543 		syscallarg(fd_set *) ou;
544 		syscallarg(fd_set *) ex;
545 		syscallarg(struct timeval *) tv;
546 	} */ *uap = v;
547 
548 	struct timespec ts, *tsp = NULL;
549 	int error;
550 
551 	if (SCARG(uap, tv) != NULL) {
552 		struct timeval tv;
553 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
554 			return (error);
555 		if ((error = itimerfix(&tv)) != 0)
556 			return (error);
557 #ifdef KTRACE
558 		if (KTRPOINT(p, KTR_STRUCT))
559 			ktrreltimeval(p, &tv);
560 #endif
561 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
562 		tsp = &ts;
563 	}
564 
565 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
566 	    SCARG(uap, ex), tsp, NULL, retval));
567 }
568 
569 int
570 sys_pselect(struct proc *p, void *v, register_t *retval)
571 {
572 	struct sys_pselect_args /* {
573 		syscallarg(int) nd;
574 		syscallarg(fd_set *) in;
575 		syscallarg(fd_set *) ou;
576 		syscallarg(fd_set *) ex;
577 		syscallarg(const struct timespec *) ts;
578 		syscallarg(const sigset_t *) mask;
579 	} */ *uap = v;
580 
581 	struct timespec ts, *tsp = NULL;
582 	sigset_t ss, *ssp = NULL;
583 	int error;
584 
585 	if (SCARG(uap, ts) != NULL) {
586 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
587 			return (error);
588 		if ((error = timespecfix(&ts)) != 0)
589 			return (error);
590 #ifdef KTRACE
591 		if (KTRPOINT(p, KTR_STRUCT))
592 			ktrreltimespec(p, &ts);
593 #endif
594 		tsp = &ts;
595 	}
596 	if (SCARG(uap, mask) != NULL) {
597 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
598 			return (error);
599 		ssp = &ss;
600 	}
601 
602 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
603 	    SCARG(uap, ex), tsp, ssp, retval));
604 }
605 
606 int
607 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
608     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
609 {
610 	fd_mask bits[6];
611 	fd_set *pibits[3], *pobits[3];
612 	struct timespec ats, rts, tts;
613 	int s, ncoll, error = 0, timo;
614 	u_int ni;
615 
616 	if (nd < 0)
617 		return (EINVAL);
618 	if (nd > p->p_fd->fd_nfiles) {
619 		/* forgiving; slightly wrong */
620 		nd = p->p_fd->fd_nfiles;
621 	}
622 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
623 	if (ni > sizeof(bits[0])) {
624 		caddr_t mbits;
625 
626 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
627 		pibits[0] = (fd_set *)&mbits[ni * 0];
628 		pibits[1] = (fd_set *)&mbits[ni * 1];
629 		pibits[2] = (fd_set *)&mbits[ni * 2];
630 		pobits[0] = (fd_set *)&mbits[ni * 3];
631 		pobits[1] = (fd_set *)&mbits[ni * 4];
632 		pobits[2] = (fd_set *)&mbits[ni * 5];
633 	} else {
634 		memset(bits, 0, sizeof(bits));
635 		pibits[0] = (fd_set *)&bits[0];
636 		pibits[1] = (fd_set *)&bits[1];
637 		pibits[2] = (fd_set *)&bits[2];
638 		pobits[0] = (fd_set *)&bits[3];
639 		pobits[1] = (fd_set *)&bits[4];
640 		pobits[2] = (fd_set *)&bits[5];
641 	}
642 
643 #define	getbits(name, x) \
644 	if (name && (error = copyin(name, pibits[x], ni))) \
645 		goto done;
646 	getbits(in, 0);
647 	getbits(ou, 1);
648 	getbits(ex, 2);
649 #undef	getbits
650 #ifdef KTRACE
651 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
652 		if (in) ktrfdset(p, pibits[0], ni);
653 		if (ou) ktrfdset(p, pibits[1], ni);
654 		if (ex) ktrfdset(p, pibits[2], ni);
655 	}
656 #endif
657 
658 	if (tsp) {
659 		getnanouptime(&rts);
660 		timespecadd(tsp, &rts, &ats);
661 	} else {
662 		ats.tv_sec = 0;
663 		ats.tv_nsec = 0;
664 	}
665 	timo = 0;
666 
667 	if (sigmask)
668 		dosigsuspend(p, *sigmask &~ sigcantmask);
669 
670 retry:
671 	ncoll = nselcoll;
672 	atomic_setbits_int(&p->p_flag, P_SELECT);
673 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
674 	if (error || *retval)
675 		goto done;
676 	if (tsp) {
677 		getnanouptime(&rts);
678 		if (timespeccmp(&rts, &ats, >=))
679 			goto done;
680 		timespecsub(&ats, &rts, &tts);
681 		timo = tts.tv_sec > 24 * 60 * 60 ?
682 			24 * 60 * 60 * hz : tstohz(&tts);
683 	}
684 	s = splhigh();
685 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
686 		splx(s);
687 		goto retry;
688 	}
689 	atomic_clearbits_int(&p->p_flag, P_SELECT);
690 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
691 	splx(s);
692 	if (error == 0)
693 		goto retry;
694 done:
695 	atomic_clearbits_int(&p->p_flag, P_SELECT);
696 	/* select is not restarted after signals... */
697 	if (error == ERESTART)
698 		error = EINTR;
699 	if (error == EWOULDBLOCK)
700 		error = 0;
701 #define	putbits(name, x) \
702 	if (name && (error2 = copyout(pobits[x], name, ni))) \
703 		error = error2;
704 	if (error == 0) {
705 		int error2;
706 
707 		putbits(in, 0);
708 		putbits(ou, 1);
709 		putbits(ex, 2);
710 #undef putbits
711 #ifdef KTRACE
712 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
713 			if (in) ktrfdset(p, pobits[0], ni);
714 			if (ou) ktrfdset(p, pobits[1], ni);
715 			if (ex) ktrfdset(p, pobits[2], ni);
716 		}
717 #endif
718 	}
719 
720 	if (pibits[0] != (fd_set *)&bits[0])
721 		free(pibits[0], M_TEMP, 6 * ni);
722 	return (error);
723 }
724 
725 int
726 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
727     register_t *retval)
728 {
729 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
730 	struct filedesc *fdp = p->p_fd;
731 	int msk, i, j, fd;
732 	fd_mask bits;
733 	struct file *fp;
734 	int n = 0;
735 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
736 
737 	for (msk = 0; msk < 3; msk++) {
738 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
739 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
740 
741 		for (i = 0; i < nfd; i += NFDBITS) {
742 			bits = pibits->fds_bits[i/NFDBITS];
743 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
744 				bits &= ~(1 << j);
745 				if ((fp = fd_getfile(fdp, fd)) == NULL)
746 					return (EBADF);
747 				FREF(fp);
748 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
749 					FD_SET(fd, pobits);
750 					n++;
751 				}
752 				FRELE(fp, p);
753 			}
754 		}
755 	}
756 	*retval = n;
757 	return (0);
758 }
759 
/*
 * Poll routine that reports "ready" for the ordinary read and write
 * events: returns the subset of events that is one of POLLIN, POLLOUT,
 * POLLRDNORM or POLLWRNORM.  dev and p are not looked at.
 */
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & ready);
}
766 
/*
 * Poll routine that never reports anything as ready: always returns 0,
 * whatever dev, events and p are.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	return (0);
}
773 
774 /*
775  * Record a select request.
776  */
777 void
778 selrecord(struct proc *selector, struct selinfo *sip)
779 {
780 	struct proc *p;
781 	pid_t mypid;
782 
783 	mypid = selector->p_pid;
784 	if (sip->si_selpid == mypid)
785 		return;
786 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
787 	    p->p_wchan == (caddr_t)&selwait)
788 		sip->si_flags |= SI_COLL;
789 	else
790 		sip->si_selpid = mypid;
791 }
792 
793 /*
794  * Do a wakeup when a selectable event occurs.
795  */
796 void
797 selwakeup(struct selinfo *sip)
798 {
799 	struct proc *p;
800 	int s;
801 
802 	KNOTE(&sip->si_note, 0);
803 	if (sip->si_selpid == 0)
804 		return;
805 	if (sip->si_flags & SI_COLL) {
806 		nselcoll++;
807 		sip->si_flags &= ~SI_COLL;
808 		wakeup(&selwait);
809 	}
810 	p = pfind(sip->si_selpid);
811 	sip->si_selpid = 0;
812 	if (p != NULL) {
813 		SCHED_LOCK(s);
814 		if (p->p_wchan == (caddr_t)&selwait) {
815 			if (p->p_stat == SSLEEP)
816 				setrunnable(p);
817 			else
818 				unsleep(p);
819 		} else if (p->p_flag & P_SELECT)
820 			atomic_clearbits_int(&p->p_flag, P_SELECT);
821 		SCHED_UNLOCK(s);
822 	}
823 }
824 
825 void
826 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
827 {
828 	struct filedesc *fdp = p->p_fd;
829 	struct file *fp;
830 	u_int i;
831 	int n = 0;
832 
833 	for (i = 0; i < nfd; i++, pl++) {
834 		/* Check the file descriptor. */
835 		if (pl->fd < 0) {
836 			pl->revents = 0;
837 			continue;
838 		}
839 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
840 			pl->revents = POLLNVAL;
841 			n++;
842 			continue;
843 		}
844 		FREF(fp);
845 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
846 		FRELE(fp, p);
847 		if (pl->revents != 0)
848 			n++;
849 	}
850 	*retval = n;
851 }
852 
853 /*
854  * Only copyout the revents field.
855  */
856 int
857 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
858 {
859 	int error = 0;
860 	u_int i = 0;
861 
862 	while (!error && i++ < nfds) {
863 		error = copyout(&pl->revents, &upl->revents,
864 		    sizeof(upl->revents));
865 		pl++;
866 		upl++;
867 	}
868 
869 	return (error);
870 }
871 
872 /*
873  * We are using the same mechanism as select only we encode/decode args
874  * differently.
875  */
876 int
877 sys_poll(struct proc *p, void *v, register_t *retval)
878 {
879 	struct sys_poll_args /* {
880 		syscallarg(struct pollfd *) fds;
881 		syscallarg(u_int) nfds;
882 		syscallarg(int) timeout;
883 	} */ *uap = v;
884 
885 	struct timespec ts, *tsp = NULL;
886 	int msec = SCARG(uap, timeout);
887 
888 	if (msec != INFTIM) {
889 		if (msec < 0)
890 			return (EINVAL);
891 		ts.tv_sec = msec / 1000;
892 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
893 		tsp = &ts;
894 	}
895 
896 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
897 	    retval));
898 }
899 
900 int
901 sys_ppoll(struct proc *p, void *v, register_t *retval)
902 {
903 	struct sys_ppoll_args /* {
904 		syscallarg(struct pollfd *) fds;
905 		syscallarg(u_int) nfds;
906 		syscallarg(const struct timespec *) ts;
907 		syscallarg(const sigset_t *) mask;
908 	} */ *uap = v;
909 
910 	int error;
911 	struct timespec ts, *tsp = NULL;
912 	sigset_t ss, *ssp = NULL;
913 
914 	if (SCARG(uap, ts) != NULL) {
915 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
916 			return (error);
917 		if ((error = timespecfix(&ts)) != 0)
918 			return (error);
919 #ifdef KTRACE
920 		if (KTRPOINT(p, KTR_STRUCT))
921 			ktrreltimespec(p, &ts);
922 #endif
923 		tsp = &ts;
924 	}
925 
926 	if (SCARG(uap, mask) != NULL) {
927 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
928 			return (error);
929 		ssp = &ss;
930 	}
931 
932 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
933 	    retval));
934 }
935 
936 int
937 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
938     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
939 {
940 	size_t sz;
941 	struct pollfd pfds[4], *pl = pfds;
942 	struct timespec ats, rts, tts;
943 	int timo, ncoll, i, s, error;
944 
945 	/* Standards say no more than MAX_OPEN; this is possibly better. */
946 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
947 		return (EINVAL);
948 
949 	/* optimize for the default case, of a small nfds value */
950 	if (nfds > nitems(pfds)) {
951 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
952 		    M_WAITOK | M_CANFAIL);
953 		if (pl == NULL)
954 			return (EINVAL);
955 	}
956 
957 	sz = nfds * sizeof(*pl);
958 
959 	if ((error = copyin(fds, pl, sz)) != 0)
960 		goto bad;
961 
962 	for (i = 0; i < nfds; i++) {
963 		pl[i].events &= ~POLL_NOHUP;
964 		pl[i].revents = 0;
965 	}
966 
967 	if (tsp != NULL) {
968 		getnanouptime(&rts);
969 		timespecadd(tsp, &rts, &ats);
970 	} else {
971 		ats.tv_sec = 0;
972 		ats.tv_nsec = 0;
973 	}
974 	timo = 0;
975 
976 	if (sigmask)
977 		dosigsuspend(p, *sigmask &~ sigcantmask);
978 
979 retry:
980 	ncoll = nselcoll;
981 	atomic_setbits_int(&p->p_flag, P_SELECT);
982 	pollscan(p, pl, nfds, retval);
983 	if (*retval)
984 		goto done;
985 	if (tsp != NULL) {
986 		getnanouptime(&rts);
987 		if (timespeccmp(&rts, &ats, >=))
988 			goto done;
989 		timespecsub(&ats, &rts, &tts);
990 		timo = tts.tv_sec > 24 * 60 * 60 ?
991 			24 * 60 * 60 * hz : tstohz(&tts);
992 	}
993 	s = splhigh();
994 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
995 		splx(s);
996 		goto retry;
997 	}
998 	atomic_clearbits_int(&p->p_flag, P_SELECT);
999 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
1000 	splx(s);
1001 	if (error == 0)
1002 		goto retry;
1003 
1004 done:
1005 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1006 	/*
1007 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1008 	 *       ignored (since the whole point is to see what would block).
1009 	 */
1010 	switch (error) {
1011 	case ERESTART:
1012 		error = pollout(pl, fds, nfds);
1013 		if (error == 0)
1014 			error = EINTR;
1015 		break;
1016 	case EWOULDBLOCK:
1017 	case 0:
1018 		error = pollout(pl, fds, nfds);
1019 		break;
1020 	}
1021 #ifdef KTRACE
1022 	if (KTRPOINT(p, KTR_STRUCT))
1023 		ktrpollfd(p, pl, nfds);
1024 #endif /* KTRACE */
1025 bad:
1026 	if (pl != pfds)
1027 		free(pl, M_TEMP, sz);
1028 	return (error);
1029 }
1030 
/*
 * utrace system call.
 *
 * With KTRACE compiled in, forwards the caller's label, address and
 * length to ktruser() and returns its result.  Without KTRACE the call
 * does nothing and returns 0.
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	const char *label = SCARG(uap, label);
	const void *addr = SCARG(uap, addr);
	size_t len = SCARG(uap, len);

	return (ktruser(curp, label, addr, len));
#else
	return (0);
#endif
}
1049