xref: /openbsd-src/sys/kern/sys_generic.c (revision 7350f337b9e3eb4461d99580e625c7ef148d107c)
1 /*	$OpenBSD: sys_generic.c,v 1.125 2019/06/22 06:48:25 semarie Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/time.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #ifdef KTRACE
59 #include <sys/ktrace.h>
60 #endif
61 #include <sys/sched.h>
62 #include <sys/pledge.h>
63 
64 #include <sys/mount.h>
65 #include <sys/syscallargs.h>
66 
67 #include <uvm/uvm_extern.h>
68 
69 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
70 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
71 int pollout(struct pollfd *, struct pollfd *, u_int);
72 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
73     struct timespec *, const sigset_t *, register_t *);
74 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
75     const sigset_t *, register_t *);
76 
77 int
78 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
79     unsigned int iovcnt, size_t *residp)
80 {
81 #ifdef KTRACE
82 	struct proc *p = curproc;
83 #endif
84 	struct iovec *iov;
85 	int error, i;
86 	size_t resid = 0;
87 
88 	if (iovcnt > UIO_SMALLIOV) {
89 		if (iovcnt > IOV_MAX)
90 			return (EINVAL);
91 		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
92 	} else if (iovcnt > 0) {
93 		iov = aiov;
94 	} else {
95 		return (EINVAL);
96 	}
97 	*iovp = iov;
98 
99 	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
100 		return (error);
101 
102 #ifdef KTRACE
103 	if (KTRPOINT(p, KTR_STRUCT))
104 		ktriovec(p, iov, iovcnt);
105 #endif
106 
107 	for (i = 0; i < iovcnt; i++) {
108 		resid += iov->iov_len;
109 		/*
110 		 * Writes return ssize_t because -1 is returned on error.
111 		 * Therefore we must restrict the length to SSIZE_MAX to
112 		 * avoid garbage return values.  Note that the addition is
113 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
114 		 */
115 		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
116 			return (EINVAL);
117 		iov++;
118 	}
119 
120 	if (residp != NULL)
121 		*residp = resid;
122 
123 	return (0);
124 }
125 
126 void
127 iovec_free(struct iovec *iov, unsigned int iovcnt)
128 {
129 	if (iovcnt > UIO_SMALLIOV)
130 		free(iov, M_IOV, iovcnt * sizeof(*iov));
131 }
132 
133 /*
134  * Read system call.
135  */
136 int
137 sys_read(struct proc *p, void *v, register_t *retval)
138 {
139 	struct sys_read_args /* {
140 		syscallarg(int) fd;
141 		syscallarg(void *) buf;
142 		syscallarg(size_t) nbyte;
143 	} */ *uap = v;
144 	struct iovec iov;
145 	struct uio auio;
146 
147 	iov.iov_base = SCARG(uap, buf);
148 	iov.iov_len = SCARG(uap, nbyte);
149 	if (iov.iov_len > SSIZE_MAX)
150 		return (EINVAL);
151 
152 	auio.uio_iov = &iov;
153 	auio.uio_iovcnt = 1;
154 	auio.uio_resid = iov.iov_len;
155 
156 	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
157 }
158 
159 /*
160  * Scatter read system call.
161  */
162 int
163 sys_readv(struct proc *p, void *v, register_t *retval)
164 {
165 	struct sys_readv_args /* {
166 		syscallarg(int) fd;
167 		syscallarg(const struct iovec *) iovp;
168 		syscallarg(int) iovcnt;
169 	} */ *uap = v;
170 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
171 	int error, iovcnt = SCARG(uap, iovcnt);
172 	struct uio auio;
173 	size_t resid;
174 
175 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
176 	if (error)
177 		goto done;
178 
179 	auio.uio_iov = iov;
180 	auio.uio_iovcnt = iovcnt;
181 	auio.uio_resid = resid;
182 
183 	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
184  done:
185 	iovec_free(iov, iovcnt);
186 	return (error);
187 }
188 
189 int
190 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
191     register_t *retval)
192 {
193 	struct filedesc *fdp = p->p_fd;
194 	struct file *fp;
195 	long cnt, error = 0;
196 	u_int iovlen;
197 #ifdef KTRACE
198 	struct iovec *ktriov = NULL;
199 #endif
200 
201 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
202 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
203 
204 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
205 		return (EBADF);
206 
207 	/* Checks for positioned read. */
208 	if (flags & FO_POSITION) {
209 		struct vnode *vp = fp->f_data;
210 
211 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
212 		    (vp->v_flag & VISTTY)) {
213 			error = ESPIPE;
214 			goto done;
215 		}
216 
217 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
218 			error = EINVAL;
219 			goto done;
220 		}
221 	}
222 
223 	uio->uio_rw = UIO_READ;
224 	uio->uio_segflg = UIO_USERSPACE;
225 	uio->uio_procp = p;
226 #ifdef KTRACE
227 	/*
228 	 * if tracing, save a copy of iovec
229 	 */
230 	if (KTRPOINT(p, KTR_GENIO)) {
231 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
232 		memcpy(ktriov, uio->uio_iov, iovlen);
233 	}
234 #endif
235 	cnt = uio->uio_resid;
236 	error = (*fp->f_ops->fo_read)(fp, uio, flags);
237 	if (error) {
238 		if (uio->uio_resid != cnt && (error == ERESTART ||
239 		    error == EINTR || error == EWOULDBLOCK))
240 			error = 0;
241 	}
242 	cnt -= uio->uio_resid;
243 
244 	mtx_enter(&fp->f_mtx);
245 	fp->f_rxfer++;
246 	fp->f_rbytes += cnt;
247 	mtx_leave(&fp->f_mtx);
248 #ifdef KTRACE
249 	if (ktriov != NULL) {
250 		if (error == 0)
251 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
252 		free(ktriov, M_TEMP, iovlen);
253 	}
254 #endif
255 	*retval = cnt;
256  done:
257 	FRELE(fp, p);
258 	return (error);
259 }
260 
261 /*
262  * Write system call
263  */
264 int
265 sys_write(struct proc *p, void *v, register_t *retval)
266 {
267 	struct sys_write_args /* {
268 		syscallarg(int) fd;
269 		syscallarg(const void *) buf;
270 		syscallarg(size_t) nbyte;
271 	} */ *uap = v;
272 	struct iovec iov;
273 	struct uio auio;
274 
275 	iov.iov_base = (void *)SCARG(uap, buf);
276 	iov.iov_len = SCARG(uap, nbyte);
277 	if (iov.iov_len > SSIZE_MAX)
278 		return (EINVAL);
279 
280 	auio.uio_iov = &iov;
281 	auio.uio_iovcnt = 1;
282 	auio.uio_resid = iov.iov_len;
283 
284 	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
285 }
286 
287 /*
288  * Gather write system call
289  */
290 int
291 sys_writev(struct proc *p, void *v, register_t *retval)
292 {
293 	struct sys_writev_args /* {
294 		syscallarg(int) fd;
295 		syscallarg(const struct iovec *) iovp;
296 		syscallarg(int) iovcnt;
297 	} */ *uap = v;
298 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
299 	int error, iovcnt = SCARG(uap, iovcnt);
300 	struct uio auio;
301 	size_t resid;
302 
303 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
304 	if (error)
305 		goto done;
306 
307 	auio.uio_iov = iov;
308 	auio.uio_iovcnt = iovcnt;
309 	auio.uio_resid = resid;
310 
311 	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
312  done:
313 	iovec_free(iov, iovcnt);
314  	return (error);
315 }
316 
317 int
318 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
319     register_t *retval)
320 {
321 	struct filedesc *fdp = p->p_fd;
322 	struct file *fp;
323 	long cnt, error = 0;
324 	u_int iovlen;
325 #ifdef KTRACE
326 	struct iovec *ktriov = NULL;
327 #endif
328 
329 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
330 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
331 
332 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
333 		return (EBADF);
334 
335 	/* Checks for positioned write. */
336 	if (flags & FO_POSITION) {
337 		struct vnode *vp = fp->f_data;
338 
339 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
340 		    (vp->v_flag & VISTTY)) {
341 			error = ESPIPE;
342 			goto done;
343 		}
344 
345 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
346 			error = EINVAL;
347 			goto done;
348 		}
349 	}
350 
351 	uio->uio_rw = UIO_WRITE;
352 	uio->uio_segflg = UIO_USERSPACE;
353 	uio->uio_procp = p;
354 #ifdef KTRACE
355 	/*
356 	 * if tracing, save a copy of iovec
357 	 */
358 	if (KTRPOINT(p, KTR_GENIO)) {
359 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
360 		memcpy(ktriov, uio->uio_iov, iovlen);
361 	}
362 #endif
363 	cnt = uio->uio_resid;
364 	error = (*fp->f_ops->fo_write)(fp, uio, flags);
365 	if (error) {
366 		if (uio->uio_resid != cnt && (error == ERESTART ||
367 		    error == EINTR || error == EWOULDBLOCK))
368 			error = 0;
369 		if (error == EPIPE) {
370 			KERNEL_LOCK();
371 			ptsignal(p, SIGPIPE, STHREAD);
372 			KERNEL_UNLOCK();
373 		}
374 	}
375 	cnt -= uio->uio_resid;
376 
377 	mtx_enter(&fp->f_mtx);
378 	fp->f_wxfer++;
379 	fp->f_wbytes += cnt;
380 	mtx_leave(&fp->f_mtx);
381 #ifdef KTRACE
382 	if (ktriov != NULL) {
383 		if (error == 0)
384 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
385 		free(ktriov, M_TEMP, iovlen);
386 	}
387 #endif
388 	*retval = cnt;
389  done:
390 	FRELE(fp, p);
391 	return (error);
392 }
393 
394 /*
395  * Ioctl system call
396  */
397 int
398 sys_ioctl(struct proc *p, void *v, register_t *retval)
399 {
400 	struct sys_ioctl_args /* {
401 		syscallarg(int) fd;
402 		syscallarg(u_long) com;
403 		syscallarg(void *) data;
404 	} */ *uap = v;
405 	struct file *fp;
406 	struct filedesc *fdp = p->p_fd;
407 	u_long com = SCARG(uap, com);
408 	int error = 0;
409 	u_int size = 0;
410 	caddr_t data, memp = NULL;
411 	int tmp;
412 #define STK_PARAMS	128
413 	long long stkbuf[STK_PARAMS / sizeof(long long)];
414 
415 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
416 		return (EBADF);
417 
418 	if (fp->f_type == DTYPE_SOCKET) {
419 		struct socket *so = fp->f_data;
420 
421 		if (so->so_state & SS_DNS) {
422 			error = EINVAL;
423 			goto out;
424 		}
425 	}
426 
427 	error = pledge_ioctl(p, com, fp);
428 	if (error)
429 		goto out;
430 
431 	switch (com) {
432 	case FIONCLEX:
433 	case FIOCLEX:
434 		fdplock(fdp);
435 		if (com == FIONCLEX)
436 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
437 		else
438 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
439 		fdpunlock(fdp);
440 		goto out;
441 	}
442 
443 	/*
444 	 * Interpret high order word to find amount of data to be
445 	 * copied to/from the user's address space.
446 	 */
447 	size = IOCPARM_LEN(com);
448 	if (size > IOCPARM_MAX) {
449 		error = ENOTTY;
450 		goto out;
451 	}
452 	if (size > sizeof (stkbuf)) {
453 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
454 		data = memp;
455 	} else
456 		data = (caddr_t)stkbuf;
457 	if (com&IOC_IN) {
458 		if (size) {
459 			error = copyin(SCARG(uap, data), data, size);
460 			if (error) {
461 				goto out;
462 			}
463 		} else
464 			*(caddr_t *)data = SCARG(uap, data);
465 	} else if ((com&IOC_OUT) && size)
466 		/*
467 		 * Zero the buffer so the user always
468 		 * gets back something deterministic.
469 		 */
470 		memset(data, 0, size);
471 	else if (com&IOC_VOID)
472 		*(caddr_t *)data = SCARG(uap, data);
473 
474 	switch (com) {
475 
476 	case FIONBIO:
477 		if ((tmp = *(int *)data) != 0)
478 			fp->f_flag |= FNONBLOCK;
479 		else
480 			fp->f_flag &= ~FNONBLOCK;
481 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
482 		break;
483 
484 	case FIOASYNC:
485 		if ((tmp = *(int *)data) != 0)
486 			fp->f_flag |= FASYNC;
487 		else
488 			fp->f_flag &= ~FASYNC;
489 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
490 		break;
491 
492 	case FIOSETOWN:
493 		tmp = *(int *)data;
494 
495 		if (fp->f_type == DTYPE_SOCKET || fp->f_type == DTYPE_PIPE) {
496 			/* nothing */
497 		} else if (tmp <= 0) {
498 			tmp = -tmp;
499 		} else {
500 			struct process *pr = prfind(tmp);
501 			if (pr == NULL) {
502 				error = ESRCH;
503 				break;
504 			}
505 			tmp = pr->ps_pgrp->pg_id;
506 		}
507 		error = (*fp->f_ops->fo_ioctl)
508 		    (fp, TIOCSPGRP, (caddr_t)&tmp, p);
509 		break;
510 
511 	case FIOGETOWN:
512 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
513 		*(int *)data = -*(int *)data;
514 		break;
515 
516 	default:
517 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
518 		break;
519 	}
520 	/*
521 	 * Copy any data to user, size was
522 	 * already set and checked above.
523 	 */
524 	if (error == 0 && (com&IOC_OUT) && size)
525 		error = copyout(data, SCARG(uap, data), size);
526 out:
527 	FRELE(fp, p);
528 	free(memp, M_IOCTLOPS, size);
529 	return (error);
530 }
531 
int	selwait;	/* sleep channel used by dopselect() and doppoll() */
int	nselcoll;	/* bumped by selwakeup() each time it sees SI_COLL */
533 
534 /*
535  * Select system call.
536  */
537 int
538 sys_select(struct proc *p, void *v, register_t *retval)
539 {
540 	struct sys_select_args /* {
541 		syscallarg(int) nd;
542 		syscallarg(fd_set *) in;
543 		syscallarg(fd_set *) ou;
544 		syscallarg(fd_set *) ex;
545 		syscallarg(struct timeval *) tv;
546 	} */ *uap = v;
547 
548 	struct timespec ts, *tsp = NULL;
549 	int error;
550 
551 	if (SCARG(uap, tv) != NULL) {
552 		struct timeval tv;
553 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
554 			return (error);
555 		if (tv.tv_sec < 0 || !timerisvalid(&tv))
556 			return (EINVAL);
557 #ifdef KTRACE
558 		if (KTRPOINT(p, KTR_STRUCT))
559 			ktrreltimeval(p, &tv);
560 #endif
561 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
562 		tsp = &ts;
563 	}
564 
565 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
566 	    SCARG(uap, ex), tsp, NULL, retval));
567 }
568 
569 int
570 sys_pselect(struct proc *p, void *v, register_t *retval)
571 {
572 	struct sys_pselect_args /* {
573 		syscallarg(int) nd;
574 		syscallarg(fd_set *) in;
575 		syscallarg(fd_set *) ou;
576 		syscallarg(fd_set *) ex;
577 		syscallarg(const struct timespec *) ts;
578 		syscallarg(const sigset_t *) mask;
579 	} */ *uap = v;
580 
581 	struct timespec ts, *tsp = NULL;
582 	sigset_t ss, *ssp = NULL;
583 	int error;
584 
585 	if (SCARG(uap, ts) != NULL) {
586 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
587 			return (error);
588 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
589 			return (EINVAL);
590 #ifdef KTRACE
591 		if (KTRPOINT(p, KTR_STRUCT))
592 			ktrreltimespec(p, &ts);
593 #endif
594 		tsp = &ts;
595 	}
596 	if (SCARG(uap, mask) != NULL) {
597 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
598 			return (error);
599 		ssp = &ss;
600 	}
601 
602 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
603 	    SCARG(uap, ex), tsp, ssp, retval));
604 }
605 
606 int
607 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
608     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
609 {
610 	fd_mask bits[6];
611 	fd_set *pibits[3], *pobits[3];
612 	struct timespec elapsed, start, stop;
613 	int s, ncoll, error = 0, timo;
614 	u_int ni;
615 
616 	if (nd < 0)
617 		return (EINVAL);
618 	if (nd > p->p_fd->fd_nfiles) {
619 		/* forgiving; slightly wrong */
620 		nd = p->p_fd->fd_nfiles;
621 	}
622 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
623 	if (ni > sizeof(bits[0])) {
624 		caddr_t mbits;
625 
626 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
627 		pibits[0] = (fd_set *)&mbits[ni * 0];
628 		pibits[1] = (fd_set *)&mbits[ni * 1];
629 		pibits[2] = (fd_set *)&mbits[ni * 2];
630 		pobits[0] = (fd_set *)&mbits[ni * 3];
631 		pobits[1] = (fd_set *)&mbits[ni * 4];
632 		pobits[2] = (fd_set *)&mbits[ni * 5];
633 	} else {
634 		memset(bits, 0, sizeof(bits));
635 		pibits[0] = (fd_set *)&bits[0];
636 		pibits[1] = (fd_set *)&bits[1];
637 		pibits[2] = (fd_set *)&bits[2];
638 		pobits[0] = (fd_set *)&bits[3];
639 		pobits[1] = (fd_set *)&bits[4];
640 		pobits[2] = (fd_set *)&bits[5];
641 	}
642 
643 #define	getbits(name, x) \
644 	if (name && (error = copyin(name, pibits[x], ni))) \
645 		goto done;
646 	getbits(in, 0);
647 	getbits(ou, 1);
648 	getbits(ex, 2);
649 #undef	getbits
650 #ifdef KTRACE
651 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
652 		if (in) ktrfdset(p, pibits[0], ni);
653 		if (ou) ktrfdset(p, pibits[1], ni);
654 		if (ex) ktrfdset(p, pibits[2], ni);
655 	}
656 #endif
657 
658 	if (sigmask)
659 		dosigsuspend(p, *sigmask &~ sigcantmask);
660 
661 retry:
662 	ncoll = nselcoll;
663 	atomic_setbits_int(&p->p_flag, P_SELECT);
664 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
665 	if (error || *retval)
666 		goto done;
667 	while (timeout == NULL || timespecisset(timeout)) {
668 		timo = (timeout == NULL) ? 0 : tstohz(timeout);
669 		if (timeout != NULL)
670 			getnanouptime(&start);
671 		s = splhigh();
672 		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
673 			splx(s);
674 			goto retry;
675 		}
676 		atomic_clearbits_int(&p->p_flag, P_SELECT);
677 		error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
678 		splx(s);
679 		if (timeout != NULL) {
680 			getnanouptime(&stop);
681 			timespecsub(&stop, &start, &elapsed);
682 			timespecsub(timeout, &elapsed, timeout);
683 			if (timeout->tv_sec < 0)
684 				timespecclear(timeout);
685 		}
686 		if (error == 0)
687 			goto retry;
688 		if (error != EWOULDBLOCK)
689 			break;
690 	}
691 done:
692 	atomic_clearbits_int(&p->p_flag, P_SELECT);
693 	/* select is not restarted after signals... */
694 	if (error == ERESTART)
695 		error = EINTR;
696 	if (error == EWOULDBLOCK)
697 		error = 0;
698 #define	putbits(name, x) \
699 	if (name && (error2 = copyout(pobits[x], name, ni))) \
700 		error = error2;
701 	if (error == 0) {
702 		int error2;
703 
704 		putbits(in, 0);
705 		putbits(ou, 1);
706 		putbits(ex, 2);
707 #undef putbits
708 #ifdef KTRACE
709 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
710 			if (in) ktrfdset(p, pobits[0], ni);
711 			if (ou) ktrfdset(p, pobits[1], ni);
712 			if (ex) ktrfdset(p, pobits[2], ni);
713 		}
714 #endif
715 	}
716 
717 	if (pibits[0] != (fd_set *)&bits[0])
718 		free(pibits[0], M_TEMP, 6 * ni);
719 	return (error);
720 }
721 
722 int
723 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
724     register_t *retval)
725 {
726 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
727 	struct filedesc *fdp = p->p_fd;
728 	int msk, i, j, fd;
729 	fd_mask bits;
730 	struct file *fp;
731 	int n = 0;
732 	static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
733 
734 	for (msk = 0; msk < 3; msk++) {
735 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
736 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
737 
738 		for (i = 0; i < nfd; i += NFDBITS) {
739 			bits = pibits->fds_bits[i/NFDBITS];
740 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
741 				bits &= ~(1 << j);
742 				if ((fp = fd_getfile(fdp, fd)) == NULL)
743 					return (EBADF);
744 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
745 					FD_SET(fd, pobits);
746 					n++;
747 				}
748 				FRELE(fp, p);
749 			}
750 		}
751 	}
752 	*retval = n;
753 	return (0);
754 }
755 
756 int
757 seltrue(dev_t dev, int events, struct proc *p)
758 {
759 
760 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
761 }
762 
/*
 * Poll routine that never reports an event: it returns 0 whatever the
 * arguments are.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	(void)dev;
	(void)events;
	(void)p;

	return (0);
}
769 
770 /*
771  * Record a select request.
772  */
773 void
774 selrecord(struct proc *selector, struct selinfo *sip)
775 {
776 	struct proc *p;
777 	pid_t mytid;
778 
779 	mytid = selector->p_tid;
780 	if (sip->si_seltid == mytid)
781 		return;
782 	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
783 	    p->p_wchan == (caddr_t)&selwait)
784 		sip->si_flags |= SI_COLL;
785 	else
786 		sip->si_seltid = mytid;
787 }
788 
789 /*
790  * Do a wakeup when a selectable event occurs.
791  */
792 void
793 selwakeup(struct selinfo *sip)
794 {
795 	struct proc *p;
796 	int s;
797 
798 	KNOTE(&sip->si_note, NOTE_SUBMIT);
799 	if (sip->si_seltid == 0)
800 		return;
801 	if (sip->si_flags & SI_COLL) {
802 		nselcoll++;
803 		sip->si_flags &= ~SI_COLL;
804 		wakeup(&selwait);
805 	}
806 	p = tfind(sip->si_seltid);
807 	sip->si_seltid = 0;
808 	if (p != NULL) {
809 		SCHED_LOCK(s);
810 		if (p->p_wchan == (caddr_t)&selwait) {
811 			if (p->p_stat == SSLEEP)
812 				setrunnable(p);
813 			else
814 				unsleep(p);
815 		} else if (p->p_flag & P_SELECT)
816 			atomic_clearbits_int(&p->p_flag, P_SELECT);
817 		SCHED_UNLOCK(s);
818 	}
819 }
820 
821 void
822 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
823 {
824 	struct filedesc *fdp = p->p_fd;
825 	struct file *fp;
826 	u_int i;
827 	int n = 0;
828 
829 	for (i = 0; i < nfd; i++, pl++) {
830 		/* Check the file descriptor. */
831 		if (pl->fd < 0) {
832 			pl->revents = 0;
833 			continue;
834 		}
835 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
836 			pl->revents = POLLNVAL;
837 			n++;
838 			continue;
839 		}
840 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
841 		FRELE(fp, p);
842 		if (pl->revents != 0)
843 			n++;
844 	}
845 	*retval = n;
846 }
847 
848 /*
849  * Only copyout the revents field.
850  */
851 int
852 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
853 {
854 	int error = 0;
855 	u_int i = 0;
856 
857 	while (!error && i++ < nfds) {
858 		error = copyout(&pl->revents, &upl->revents,
859 		    sizeof(upl->revents));
860 		pl++;
861 		upl++;
862 	}
863 
864 	return (error);
865 }
866 
867 /*
868  * We are using the same mechanism as select only we encode/decode args
869  * differently.
870  */
871 int
872 sys_poll(struct proc *p, void *v, register_t *retval)
873 {
874 	struct sys_poll_args /* {
875 		syscallarg(struct pollfd *) fds;
876 		syscallarg(u_int) nfds;
877 		syscallarg(int) timeout;
878 	} */ *uap = v;
879 
880 	struct timespec ts, *tsp = NULL;
881 	int msec = SCARG(uap, timeout);
882 
883 	if (msec != INFTIM) {
884 		if (msec < 0)
885 			return (EINVAL);
886 		ts.tv_sec = msec / 1000;
887 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
888 		tsp = &ts;
889 	}
890 
891 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
892 	    retval));
893 }
894 
895 int
896 sys_ppoll(struct proc *p, void *v, register_t *retval)
897 {
898 	struct sys_ppoll_args /* {
899 		syscallarg(struct pollfd *) fds;
900 		syscallarg(u_int) nfds;
901 		syscallarg(const struct timespec *) ts;
902 		syscallarg(const sigset_t *) mask;
903 	} */ *uap = v;
904 
905 	int error;
906 	struct timespec ts, *tsp = NULL;
907 	sigset_t ss, *ssp = NULL;
908 
909 	if (SCARG(uap, ts) != NULL) {
910 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
911 			return (error);
912 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
913 			return (EINVAL);
914 #ifdef KTRACE
915 		if (KTRPOINT(p, KTR_STRUCT))
916 			ktrreltimespec(p, &ts);
917 #endif
918 		tsp = &ts;
919 	}
920 
921 	if (SCARG(uap, mask) != NULL) {
922 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
923 			return (error);
924 		ssp = &ss;
925 	}
926 
927 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
928 	    retval));
929 }
930 
931 int
932 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
933     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
934 {
935 	size_t sz;
936 	struct pollfd pfds[4], *pl = pfds;
937 	struct timespec elapsed, start, stop;
938 	int timo, ncoll, i, s, error;
939 
940 	/* Standards say no more than MAX_OPEN; this is possibly better. */
941 	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
942 		return (EINVAL);
943 
944 	/* optimize for the default case, of a small nfds value */
945 	if (nfds > nitems(pfds)) {
946 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
947 		    M_WAITOK | M_CANFAIL);
948 		if (pl == NULL)
949 			return (EINVAL);
950 	}
951 
952 	sz = nfds * sizeof(*pl);
953 
954 	if ((error = copyin(fds, pl, sz)) != 0)
955 		goto bad;
956 
957 	for (i = 0; i < nfds; i++) {
958 		pl[i].events &= ~POLL_NOHUP;
959 		pl[i].revents = 0;
960 	}
961 
962 	if (sigmask)
963 		dosigsuspend(p, *sigmask &~ sigcantmask);
964 
965 retry:
966 	ncoll = nselcoll;
967 	atomic_setbits_int(&p->p_flag, P_SELECT);
968 	pollscan(p, pl, nfds, retval);
969 	if (*retval)
970 		goto done;
971 	while (timeout == NULL || timespecisset(timeout)) {
972 		timo = (timeout == NULL) ? 0 : tstohz(timeout);
973 		if (timeout != NULL)
974 			getnanouptime(&start);
975 		s = splhigh();
976 		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
977 			splx(s);
978 			goto retry;
979 		}
980 		atomic_clearbits_int(&p->p_flag, P_SELECT);
981 		error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
982 		splx(s);
983 		if (timeout != NULL) {
984 			getnanouptime(&stop);
985 			timespecsub(&stop, &start, &elapsed);
986 			timespecsub(timeout, &elapsed, timeout);
987 			if (timeout->tv_sec < 0)
988 				timespecclear(timeout);
989 		}
990 		if (error == 0)
991 			goto retry;
992 		if (error != EWOULDBLOCK)
993 			break;
994 	}
995 
996 done:
997 	atomic_clearbits_int(&p->p_flag, P_SELECT);
998 	/*
999 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1000 	 *       ignored (since the whole point is to see what would block).
1001 	 */
1002 	switch (error) {
1003 	case ERESTART:
1004 		error = pollout(pl, fds, nfds);
1005 		if (error == 0)
1006 			error = EINTR;
1007 		break;
1008 	case EWOULDBLOCK:
1009 	case 0:
1010 		error = pollout(pl, fds, nfds);
1011 		break;
1012 	}
1013 #ifdef KTRACE
1014 	if (KTRPOINT(p, KTR_STRUCT))
1015 		ktrpollfd(p, pl, nfds);
1016 #endif /* KTRACE */
1017 bad:
1018 	if (pl != pfds)
1019 		free(pl, M_TEMP, sz);
1020 	return (error);
1021 }
1022 
/*
 * utrace system call.
 *
 * In a KTRACE kernel the label, address and length are passed to
 * ktruser() and its result is returned; without KTRACE the call does
 * nothing and returns 0.
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
	int error = 0;
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	error = ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len));
#endif
	return (error);
}
1041