xref: /openbsd-src/sys/kern/sys_generic.c (revision 5a38ef86d0b61900239c7913d24a05e7b88a58f0)
1 /*	$OpenBSD: sys_generic.c,v 1.146 2021/12/11 09:28:26 visa Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/time.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/eventvar.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 #include <sys/sched.h>
63 #include <sys/pledge.h>
64 
65 #include <sys/mount.h>
66 #include <sys/syscallargs.h>
67 
68 #include <uvm/uvm_extern.h>
69 
70 /*
71  * Debug values:
72  *  1 - print implementation errors, things that should not happen.
73  *  2 - print ppoll(2) information, somewhat verbose
74  *  3 - print pselect(2) and ppoll(2) information, very verbose
75  */
/* Tunable debug level for the kqueue-based poll/select code (see above). */
int kqpoll_debug = 0;
/* Debug printf gated on kqpoll_debug, prefixed with process name/tid. */
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

/* Local prototypes. */
int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);

void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);
92 
/*
 * Copy an iovec array in from user space for readv/writev-style calls.
 *
 * Up to UIO_SMALLIOV entries are stored in the caller-supplied `aiov'
 * buffer; larger arrays (up to IOV_MAX) are allocated from M_IOV.  The
 * chosen array is published through *iovp BEFORE the copyin so that the
 * caller can always release it with iovec_free(), even when this
 * function returns an error.  On success the summed length (bounded by
 * SSIZE_MAX) is returned via *residp when residp is not NULL.
 */
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		/* Zero-length iovec arrays are rejected outright. */
		return (EINVAL);
	}
	/* Publish before copyin: caller frees via iovec_free() on error. */
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}
141 
142 void
143 iovec_free(struct iovec *iov, unsigned int iovcnt)
144 {
145 	if (iovcnt > UIO_SMALLIOV)
146 		free(iov, M_IOV, iovcnt * sizeof(*iov));
147 }
148 
149 /*
150  * Read system call.
151  */
152 int
153 sys_read(struct proc *p, void *v, register_t *retval)
154 {
155 	struct sys_read_args /* {
156 		syscallarg(int) fd;
157 		syscallarg(void *) buf;
158 		syscallarg(size_t) nbyte;
159 	} */ *uap = v;
160 	struct iovec iov;
161 	struct uio auio;
162 
163 	iov.iov_base = SCARG(uap, buf);
164 	iov.iov_len = SCARG(uap, nbyte);
165 	if (iov.iov_len > SSIZE_MAX)
166 		return (EINVAL);
167 
168 	auio.uio_iov = &iov;
169 	auio.uio_iovcnt = 1;
170 	auio.uio_resid = iov.iov_len;
171 
172 	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
173 }
174 
175 /*
176  * Scatter read system call.
177  */
178 int
179 sys_readv(struct proc *p, void *v, register_t *retval)
180 {
181 	struct sys_readv_args /* {
182 		syscallarg(int) fd;
183 		syscallarg(const struct iovec *) iovp;
184 		syscallarg(int) iovcnt;
185 	} */ *uap = v;
186 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
187 	int error, iovcnt = SCARG(uap, iovcnt);
188 	struct uio auio;
189 	size_t resid;
190 
191 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
192 	if (error)
193 		goto done;
194 
195 	auio.uio_iov = iov;
196 	auio.uio_iovcnt = iovcnt;
197 	auio.uio_resid = resid;
198 
199 	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
200  done:
201 	iovec_free(iov, iovcnt);
202 	return (error);
203 }
204 
/*
 * Common backend for the read family of syscalls.
 *
 * `uio' describes the destination buffers.  FO_POSITION in `flags'
 * marks a positioned read, refused on non-vnodes, FIFOs and ttys.
 * The byte count transferred is returned through *retval.
 */
int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		/* Negative offsets are only meaningful on raw devices. */
		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		/*
		 * A partially completed read interrupted by a signal or
		 * blocking condition still returns the byte count, not
		 * the error.
		 */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;	/* bytes actually transferred */

	/* Update per-file transfer statistics. */
	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	FRELE(fp, p);
	return (error);
}
276 
277 /*
278  * Write system call
279  */
280 int
281 sys_write(struct proc *p, void *v, register_t *retval)
282 {
283 	struct sys_write_args /* {
284 		syscallarg(int) fd;
285 		syscallarg(const void *) buf;
286 		syscallarg(size_t) nbyte;
287 	} */ *uap = v;
288 	struct iovec iov;
289 	struct uio auio;
290 
291 	iov.iov_base = (void *)SCARG(uap, buf);
292 	iov.iov_len = SCARG(uap, nbyte);
293 	if (iov.iov_len > SSIZE_MAX)
294 		return (EINVAL);
295 
296 	auio.uio_iov = &iov;
297 	auio.uio_iovcnt = 1;
298 	auio.uio_resid = iov.iov_len;
299 
300 	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
301 }
302 
303 /*
304  * Gather write system call
305  */
306 int
307 sys_writev(struct proc *p, void *v, register_t *retval)
308 {
309 	struct sys_writev_args /* {
310 		syscallarg(int) fd;
311 		syscallarg(const struct iovec *) iovp;
312 		syscallarg(int) iovcnt;
313 	} */ *uap = v;
314 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
315 	int error, iovcnt = SCARG(uap, iovcnt);
316 	struct uio auio;
317 	size_t resid;
318 
319 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
320 	if (error)
321 		goto done;
322 
323 	auio.uio_iov = iov;
324 	auio.uio_iovcnt = iovcnt;
325 	auio.uio_resid = resid;
326 
327 	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
328  done:
329 	iovec_free(iov, iovcnt);
330  	return (error);
331 }
332 
/*
 * Common backend for the write family of syscalls.
 *
 * `uio' describes the source buffers.  FO_POSITION in `flags' marks a
 * positioned write, refused on non-vnodes, FIFOs and ttys.  The byte
 * count transferred is returned through *retval; EPIPE additionally
 * raises SIGPIPE on the calling thread.
 */
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		/* Negative offsets are only meaningful on raw devices. */
		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		/*
		 * A partially completed write interrupted by a signal or
		 * blocking condition still returns the byte count, not
		 * the error.
		 */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writing on a broken pipe also signals the thread. */
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;	/* bytes actually transferred */

	/* Update per-file transfer statistics. */
	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	FRELE(fp, p);
	return (error);
}
409 
/*
 * Ioctl system call: dispatch a device/file control operation,
 * marshalling the argument to/from user space as encoded in the
 * high-order bits of the command word (IOC_IN/IOC_OUT/IOC_VOID).
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
/* Arguments up to this size are passed in a stack buffer, no malloc. */
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	/* ioctls are refused on sockets restricted to DNS lookups. */
	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	/*
	 * Close-on-exec flags live in the fd table, not in the file,
	 * so handle them here rather than in the file's ioctl routine.
	 */
	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			/* No data: pass the user pointer itself. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep the cached f_flag in sync with the file's mode. */
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Likewise for the async-I/O flag. */
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	/* free(NULL, ...) is a no-op, so this is safe on early exits. */
	free(memp, M_IOCTLOPS, size);
	return (error);
}
523 
/* Wait channel for old-style select sleepers and the collision counter. */
int	selwait, nselcoll;
525 
526 /*
527  * Select system call.
528  */
529 int
530 sys_select(struct proc *p, void *v, register_t *retval)
531 {
532 	struct sys_select_args /* {
533 		syscallarg(int) nd;
534 		syscallarg(fd_set *) in;
535 		syscallarg(fd_set *) ou;
536 		syscallarg(fd_set *) ex;
537 		syscallarg(struct timeval *) tv;
538 	} */ *uap = v;
539 
540 	struct timespec ts, *tsp = NULL;
541 	int error;
542 
543 	if (SCARG(uap, tv) != NULL) {
544 		struct timeval tv;
545 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
546 			return (error);
547 #ifdef KTRACE
548 		if (KTRPOINT(p, KTR_STRUCT))
549 			ktrreltimeval(p, &tv);
550 #endif
551 		if (tv.tv_sec < 0 || !timerisvalid(&tv))
552 			return (EINVAL);
553 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
554 		tsp = &ts;
555 	}
556 
557 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
558 	    SCARG(uap, ex), tsp, NULL, retval));
559 }
560 
561 int
562 sys_pselect(struct proc *p, void *v, register_t *retval)
563 {
564 	struct sys_pselect_args /* {
565 		syscallarg(int) nd;
566 		syscallarg(fd_set *) in;
567 		syscallarg(fd_set *) ou;
568 		syscallarg(fd_set *) ex;
569 		syscallarg(const struct timespec *) ts;
570 		syscallarg(const sigset_t *) mask;
571 	} */ *uap = v;
572 
573 	struct timespec ts, *tsp = NULL;
574 	sigset_t ss, *ssp = NULL;
575 	int error;
576 
577 	if (SCARG(uap, ts) != NULL) {
578 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
579 			return (error);
580 #ifdef KTRACE
581 		if (KTRPOINT(p, KTR_STRUCT))
582 			ktrreltimespec(p, &ts);
583 #endif
584 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
585 			return (EINVAL);
586 		tsp = &ts;
587 	}
588 	if (SCARG(uap, mask) != NULL) {
589 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
590 			return (error);
591 		ssp = &ss;
592 	}
593 
594 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
595 	    SCARG(uap, ex), tsp, ssp, retval));
596 }
597 
/*
 * Common code for select(2) and pselect(2).
 *
 * Copies the three fd_sets in, registers a kqueue event for every set
 * bit on the per-thread kqueue (p->p_kq), then scans that kqueue until
 * events arrive or the timeout expires, converting results back into
 * the output fd_sets.  `timeout' and `sigmask' may be NULL.
 */
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		/* One allocation holds all six (3 in + 3 out) bit vectors. */
		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	/* Only the large (malloc'd) case needs freeing. */
	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}
741 
742 /*
743  * Convert fd_set into kqueue events and register them on the
744  * per-thread queue.
745  */
746 int
747 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
748     int *nregistered, int *ncollected)
749 {
750 	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
751 	static const int evff[] = { 0, 0, NOTE_OOB };
752 	int msk, i, j, fd, nevents = 0, error = 0;
753 	struct kevent kev;
754 	fd_mask bits;
755 
756 	for (msk = 0; msk < 3; msk++) {
757 		for (i = 0; i < nfd; i += NFDBITS) {
758 			bits = pibits[msk]->fds_bits[i / NFDBITS];
759 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
760 				bits &= ~(1 << j);
761 
762 				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
763 				    fd, msk, p->p_kq_serial);
764 				EV_SET(&kev, fd, evf[msk],
765 				    EV_ADD|EV_ENABLE|__EV_SELECT,
766 				    evff[msk], 0, (void *)(p->p_kq_serial));
767 #ifdef KTRACE
768 				if (KTRPOINT(p, KTR_STRUCT))
769 					ktrevent(p, &kev, 1);
770 #endif
771 				error = kqueue_register(p->p_kq, &kev, p);
772 				switch (error) {
773 				case 0:
774 					nevents++;
775 				/* FALLTHROUGH */
776 				case EOPNOTSUPP:/* No underlying kqfilter */
777 				case EINVAL:	/* Unimplemented filter */
778 				case EPERM:	/* Specific to FIFO and
779 						 * __EV_SELECT */
780 					error = 0;
781 					break;
782 				case EPIPE:	/* Specific to pipes */
783 					KASSERT(kev.filter == EVFILT_WRITE);
784 					FD_SET(kev.ident, pobits[1]);
785 					(*ncollected)++;
786 					error = 0;
787 					break;
788 				case ENXIO:	/* Device has been detached */
789 				default:
790 					goto bad;
791 				}
792 			}
793 		}
794 	}
795 
796 	*nregistered = nevents;
797 	return (0);
798 bad:
799 	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
800 	    kev.filter, error);
801 	return (error);
802 }
803 
/*
 * Convert given kqueue event into corresponding select(2) bit.
 *
 * Returns 0 on success or the event's error code when the kevent
 * carries EV_ERROR.  Panics on an event that does not belong to the
 * current pselect invocation (stale serial number).
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	/* Events from an earlier select/poll call must never leak through. */
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	/* Map the filter back onto the read/write/except result set. */
	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}
841 
842 int
843 seltrue(dev_t dev, int events, struct proc *p)
844 {
845 
846 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
847 }
848 
849 int
850 selfalse(dev_t dev, int events, struct proc *p)
851 {
852 
853 	return (0);
854 }
855 
/*
 * Record a select request.
 *
 * Remembers the selecting thread in `sip' so doselwakeup() can find
 * it later.  If a different thread is already recorded and is still
 * sleeping on selwait, mark the selinfo as collided so that every
 * select sleeper gets woken and rescans.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mytid;

	KERNEL_ASSERT_LOCKED();

	mytid = selector->p_tid;
	/* Already recorded for this thread; nothing to do. */
	if (sip->si_seltid == mytid)
		return;
	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_seltid = mytid;
}
876 
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	/* Notify kqueue-based waiters first, then old-style sleepers. */
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}
888 
/*
 * Wake the thread (if any) recorded in `sip' by selrecord().  On a
 * collision every select sleeper is woken so they all rescan.
 */
void
doselwakeup(struct selinfo *sip)
{
	struct proc *p;

	KERNEL_ASSERT_LOCKED();

	if (sip->si_seltid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* Bumping nselcoll makes concurrent selects retry. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = tfind(sip->si_seltid);
	sip->si_seltid = 0;
	if (p != NULL) {
		if (wakeup_proc(p, &selwait)) {
			/* nothing else to do */
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
	}
}
912 
913 void
914 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
915 {
916 	struct filedesc *fdp = p->p_fd;
917 	struct file *fp;
918 	u_int i;
919 	int n = 0;
920 
921 	for (i = 0; i < nfd; i++, pl++) {
922 		/* Check the file descriptor. */
923 		if (pl->fd < 0) {
924 			pl->revents = 0;
925 			continue;
926 		}
927 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
928 			pl->revents = POLLNVAL;
929 			n++;
930 			continue;
931 		}
932 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
933 		FRELE(fp, p);
934 		if (pl->revents != 0)
935 			n++;
936 	}
937 	*retval = n;
938 }
939 
940 /*
941  * Only copyout the revents field.
942  */
943 int
944 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
945 {
946 	int error = 0;
947 	u_int i = 0;
948 
949 	while (!error && i++ < nfds) {
950 		error = copyout(&pl->revents, &upl->revents,
951 		    sizeof(upl->revents));
952 		pl++;
953 		upl++;
954 	}
955 
956 	return (error);
957 }
958 
959 /*
960  * We are using the same mechanism as select only we encode/decode args
961  * differently.
962  */
963 int
964 sys_poll(struct proc *p, void *v, register_t *retval)
965 {
966 	struct sys_poll_args /* {
967 		syscallarg(struct pollfd *) fds;
968 		syscallarg(u_int) nfds;
969 		syscallarg(int) timeout;
970 	} */ *uap = v;
971 
972 	struct timespec ts, *tsp = NULL;
973 	int msec = SCARG(uap, timeout);
974 
975 	if (msec != INFTIM) {
976 		if (msec < 0)
977 			return (EINVAL);
978 		ts.tv_sec = msec / 1000;
979 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
980 		tsp = &ts;
981 	}
982 
983 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
984 	    retval));
985 }
986 
987 int
988 sys_ppoll(struct proc *p, void *v, register_t *retval)
989 {
990 	struct sys_ppoll_args /* {
991 		syscallarg(struct pollfd *) fds;
992 		syscallarg(u_int) nfds;
993 		syscallarg(const struct timespec *) ts;
994 		syscallarg(const sigset_t *) mask;
995 	} */ *uap = v;
996 
997 	int error;
998 	struct timespec ts, *tsp = NULL;
999 	sigset_t ss, *ssp = NULL;
1000 
1001 	if (SCARG(uap, ts) != NULL) {
1002 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
1003 			return (error);
1004 #ifdef KTRACE
1005 		if (KTRPOINT(p, KTR_STRUCT))
1006 			ktrreltimespec(p, &ts);
1007 #endif
1008 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
1009 			return (EINVAL);
1010 		tsp = &ts;
1011 	}
1012 
1013 	if (SCARG(uap, mask) != NULL) {
1014 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
1015 			return (error);
1016 		ssp = &ss;
1017 	}
1018 
1019 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
1020 	    retval));
1021 }
1022 
/*
 * Common code for poll(2) and ppoll(2): copy in the pollfd array and
 * scan it repeatedly, using the old selwait/nselcoll retry protocol,
 * until an event, a caught signal or timeout expiry; then copy the
 * revents fields back out.  `timeout' and `sigmask' may be NULL.
 */
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	struct timespec elapsed, start, stop;
	uint64_t nsecs;
	int ncoll, i, s, error;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	sz = nfds * sizeof(*pl);

	/* On success this also leaves `error' at 0 for the paths below. */
	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;
	}

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

retry:
	/* Snapshot the collision counter before scanning. */
	ncoll = nselcoll;
	atomic_setbits_int(&p->p_flag, P_SELECT);
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (timeout == NULL || timespecisset(timeout)) {
		if (timeout != NULL) {
			getnanouptime(&start);
			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
		} else
			nsecs = INFSLP;
		s = splhigh();
		/* A wakeup raced with the scan above; rescan immediately. */
		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
			splx(s);
			goto retry;
		}
		atomic_clearbits_int(&p->p_flag, P_SELECT);
		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
		splx(s);
		/* Charge the time slept against the remaining timeout. */
		if (timeout != NULL) {
			getnanouptime(&stop);
			timespecsub(&stop, &start, &elapsed);
			timespecsub(timeout, &elapsed, timeout);
			if (timeout->tv_sec < 0)
				timespecclear(timeout);
		}
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
	}

done:
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 *       ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);
	return (error);
}
1115 
/*
 * utrace system call: insert a user-supplied record into the ktrace
 * log of the calling process.
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	/* Without KTRACE compiled in, utrace is a silent no-op. */
	return (0);
#endif
}
1135