xref: /openbsd-src/sys/kern/sys_generic.c (revision 0d280c5f69f6ef21f5dca2b37e738de752505c51)
1 /*	$OpenBSD: sys_generic.c,v 1.149 2022/08/14 01:58:28 jsg Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/time.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #include <sys/eventvar.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 #include <sys/pledge.h>
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
/*
 * Debug values for kqpoll_debug:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 *
 * DPRINTFN(v, fmt, ...) prints only when kqpoll_debug is strictly greater
 * than `v', so DPRINTFN(0, ...) belongs to level 1, DPRINTFN(1, ...) to
 * level 2 and DPRINTFN(2, ...) to level 3.  Every message is prefixed with
 * the name of the current process and the id of the current thread.
 *
 * The expansion is wrapped in do { } while (0) so that it is exactly one
 * statement and may be used as the unbraced body of an if/else.
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) do {						\
	if (kqpoll_debug > (v)) {					\
		printf("%s(%d): ", curproc->p_p->ps_comm,		\
		    curproc->p_tid);					\
		printf(x);						\
	}								\
} while (0)
76 
77 int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
78 int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
79 void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
80 int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
81 
82 int pollout(struct pollfd *, struct pollfd *, u_int);
83 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
84     struct timespec *, const sigset_t *, register_t *);
85 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
86     const sigset_t *, register_t *);
87 
88 int
89 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
90     unsigned int iovcnt, size_t *residp)
91 {
92 #ifdef KTRACE
93 	struct proc *p = curproc;
94 #endif
95 	struct iovec *iov;
96 	int error, i;
97 	size_t resid = 0;
98 
99 	if (iovcnt > UIO_SMALLIOV) {
100 		if (iovcnt > IOV_MAX)
101 			return (EINVAL);
102 		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
103 	} else if (iovcnt > 0) {
104 		iov = aiov;
105 	} else {
106 		return (EINVAL);
107 	}
108 	*iovp = iov;
109 
110 	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
111 		return (error);
112 
113 #ifdef KTRACE
114 	if (KTRPOINT(p, KTR_STRUCT))
115 		ktriovec(p, iov, iovcnt);
116 #endif
117 
118 	for (i = 0; i < iovcnt; i++) {
119 		resid += iov->iov_len;
120 		/*
121 		 * Writes return ssize_t because -1 is returned on error.
122 		 * Therefore we must restrict the length to SSIZE_MAX to
123 		 * avoid garbage return values.  Note that the addition is
124 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
125 		 */
126 		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
127 			return (EINVAL);
128 		iov++;
129 	}
130 
131 	if (residp != NULL)
132 		*residp = resid;
133 
134 	return (0);
135 }
136 
137 void
138 iovec_free(struct iovec *iov, unsigned int iovcnt)
139 {
140 	if (iovcnt > UIO_SMALLIOV)
141 		free(iov, M_IOV, iovcnt * sizeof(*iov));
142 }
143 
144 /*
145  * Read system call.
146  */
147 int
148 sys_read(struct proc *p, void *v, register_t *retval)
149 {
150 	struct sys_read_args /* {
151 		syscallarg(int) fd;
152 		syscallarg(void *) buf;
153 		syscallarg(size_t) nbyte;
154 	} */ *uap = v;
155 	struct iovec iov;
156 	struct uio auio;
157 
158 	iov.iov_base = SCARG(uap, buf);
159 	iov.iov_len = SCARG(uap, nbyte);
160 	if (iov.iov_len > SSIZE_MAX)
161 		return (EINVAL);
162 
163 	auio.uio_iov = &iov;
164 	auio.uio_iovcnt = 1;
165 	auio.uio_resid = iov.iov_len;
166 
167 	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
168 }
169 
170 /*
171  * Scatter read system call.
172  */
173 int
174 sys_readv(struct proc *p, void *v, register_t *retval)
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int) fd;
178 		syscallarg(const struct iovec *) iovp;
179 		syscallarg(int) iovcnt;
180 	} */ *uap = v;
181 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
182 	int error, iovcnt = SCARG(uap, iovcnt);
183 	struct uio auio;
184 	size_t resid;
185 
186 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
187 	if (error)
188 		goto done;
189 
190 	auio.uio_iov = iov;
191 	auio.uio_iovcnt = iovcnt;
192 	auio.uio_resid = resid;
193 
194 	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
195  done:
196 	iovec_free(iov, iovcnt);
197 	return (error);
198 }
199 
200 int
201 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
202     register_t *retval)
203 {
204 	struct filedesc *fdp = p->p_fd;
205 	struct file *fp;
206 	long cnt, error = 0;
207 	u_int iovlen;
208 #ifdef KTRACE
209 	struct iovec *ktriov = NULL;
210 #endif
211 
212 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
213 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
214 
215 	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
216 		return (EBADF);
217 
218 	/* Checks for positioned read. */
219 	if (flags & FO_POSITION) {
220 		struct vnode *vp = fp->f_data;
221 
222 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
223 		    (vp->v_flag & VISTTY)) {
224 			error = ESPIPE;
225 			goto done;
226 		}
227 
228 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
229 			error = EINVAL;
230 			goto done;
231 		}
232 	}
233 
234 	uio->uio_rw = UIO_READ;
235 	uio->uio_segflg = UIO_USERSPACE;
236 	uio->uio_procp = p;
237 #ifdef KTRACE
238 	/*
239 	 * if tracing, save a copy of iovec
240 	 */
241 	if (KTRPOINT(p, KTR_GENIO)) {
242 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
243 		memcpy(ktriov, uio->uio_iov, iovlen);
244 	}
245 #endif
246 	cnt = uio->uio_resid;
247 	error = (*fp->f_ops->fo_read)(fp, uio, flags);
248 	if (error) {
249 		if (uio->uio_resid != cnt && (error == ERESTART ||
250 		    error == EINTR || error == EWOULDBLOCK))
251 			error = 0;
252 	}
253 	cnt -= uio->uio_resid;
254 
255 	mtx_enter(&fp->f_mtx);
256 	fp->f_rxfer++;
257 	fp->f_rbytes += cnt;
258 	mtx_leave(&fp->f_mtx);
259 #ifdef KTRACE
260 	if (ktriov != NULL) {
261 		if (error == 0)
262 			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
263 		free(ktriov, M_TEMP, iovlen);
264 	}
265 #endif
266 	*retval = cnt;
267  done:
268 	FRELE(fp, p);
269 	return (error);
270 }
271 
272 /*
273  * Write system call
274  */
275 int
276 sys_write(struct proc *p, void *v, register_t *retval)
277 {
278 	struct sys_write_args /* {
279 		syscallarg(int) fd;
280 		syscallarg(const void *) buf;
281 		syscallarg(size_t) nbyte;
282 	} */ *uap = v;
283 	struct iovec iov;
284 	struct uio auio;
285 
286 	iov.iov_base = (void *)SCARG(uap, buf);
287 	iov.iov_len = SCARG(uap, nbyte);
288 	if (iov.iov_len > SSIZE_MAX)
289 		return (EINVAL);
290 
291 	auio.uio_iov = &iov;
292 	auio.uio_iovcnt = 1;
293 	auio.uio_resid = iov.iov_len;
294 
295 	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
296 }
297 
298 /*
299  * Gather write system call
300  */
301 int
302 sys_writev(struct proc *p, void *v, register_t *retval)
303 {
304 	struct sys_writev_args /* {
305 		syscallarg(int) fd;
306 		syscallarg(const struct iovec *) iovp;
307 		syscallarg(int) iovcnt;
308 	} */ *uap = v;
309 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
310 	int error, iovcnt = SCARG(uap, iovcnt);
311 	struct uio auio;
312 	size_t resid;
313 
314 	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
315 	if (error)
316 		goto done;
317 
318 	auio.uio_iov = iov;
319 	auio.uio_iovcnt = iovcnt;
320 	auio.uio_resid = resid;
321 
322 	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
323  done:
324 	iovec_free(iov, iovcnt);
325  	return (error);
326 }
327 
328 int
329 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
330     register_t *retval)
331 {
332 	struct filedesc *fdp = p->p_fd;
333 	struct file *fp;
334 	long cnt, error = 0;
335 	u_int iovlen;
336 #ifdef KTRACE
337 	struct iovec *ktriov = NULL;
338 #endif
339 
340 	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
341 	iovlen = uio->uio_iovcnt * sizeof(struct iovec);
342 
343 	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
344 		return (EBADF);
345 
346 	/* Checks for positioned write. */
347 	if (flags & FO_POSITION) {
348 		struct vnode *vp = fp->f_data;
349 
350 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
351 		    (vp->v_flag & VISTTY)) {
352 			error = ESPIPE;
353 			goto done;
354 		}
355 
356 		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
357 			error = EINVAL;
358 			goto done;
359 		}
360 	}
361 
362 	uio->uio_rw = UIO_WRITE;
363 	uio->uio_segflg = UIO_USERSPACE;
364 	uio->uio_procp = p;
365 #ifdef KTRACE
366 	/*
367 	 * if tracing, save a copy of iovec
368 	 */
369 	if (KTRPOINT(p, KTR_GENIO)) {
370 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
371 		memcpy(ktriov, uio->uio_iov, iovlen);
372 	}
373 #endif
374 	cnt = uio->uio_resid;
375 	error = (*fp->f_ops->fo_write)(fp, uio, flags);
376 	if (error) {
377 		if (uio->uio_resid != cnt && (error == ERESTART ||
378 		    error == EINTR || error == EWOULDBLOCK))
379 			error = 0;
380 		if (error == EPIPE) {
381 			KERNEL_LOCK();
382 			ptsignal(p, SIGPIPE, STHREAD);
383 			KERNEL_UNLOCK();
384 		}
385 	}
386 	cnt -= uio->uio_resid;
387 
388 	mtx_enter(&fp->f_mtx);
389 	fp->f_wxfer++;
390 	fp->f_wbytes += cnt;
391 	mtx_leave(&fp->f_mtx);
392 #ifdef KTRACE
393 	if (ktriov != NULL) {
394 		if (error == 0)
395 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
396 		free(ktriov, M_TEMP, iovlen);
397 	}
398 #endif
399 	*retval = cnt;
400  done:
401 	FRELE(fp, p);
402 	return (error);
403 }
404 
405 /*
406  * Ioctl system call
407  */
408 int
409 sys_ioctl(struct proc *p, void *v, register_t *retval)
410 {
411 	struct sys_ioctl_args /* {
412 		syscallarg(int) fd;
413 		syscallarg(u_long) com;
414 		syscallarg(void *) data;
415 	} */ *uap = v;
416 	struct file *fp;
417 	struct filedesc *fdp = p->p_fd;
418 	u_long com = SCARG(uap, com);
419 	int error = 0;
420 	u_int size = 0;
421 	caddr_t data, memp = NULL;
422 	int tmp;
423 #define STK_PARAMS	128
424 	long long stkbuf[STK_PARAMS / sizeof(long long)];
425 
426 	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
427 		return (EBADF);
428 
429 	if (fp->f_type == DTYPE_SOCKET) {
430 		struct socket *so = fp->f_data;
431 
432 		if (so->so_state & SS_DNS) {
433 			error = EINVAL;
434 			goto out;
435 		}
436 	}
437 
438 	error = pledge_ioctl(p, com, fp);
439 	if (error)
440 		goto out;
441 
442 	switch (com) {
443 	case FIONCLEX:
444 	case FIOCLEX:
445 		fdplock(fdp);
446 		if (com == FIONCLEX)
447 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
448 		else
449 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
450 		fdpunlock(fdp);
451 		goto out;
452 	}
453 
454 	/*
455 	 * Interpret high order word to find amount of data to be
456 	 * copied to/from the user's address space.
457 	 */
458 	size = IOCPARM_LEN(com);
459 	if (size > IOCPARM_MAX) {
460 		error = ENOTTY;
461 		goto out;
462 	}
463 	if (size > sizeof (stkbuf)) {
464 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
465 		data = memp;
466 	} else
467 		data = (caddr_t)stkbuf;
468 	if (com&IOC_IN) {
469 		if (size) {
470 			error = copyin(SCARG(uap, data), data, size);
471 			if (error) {
472 				goto out;
473 			}
474 		} else
475 			*(caddr_t *)data = SCARG(uap, data);
476 	} else if ((com&IOC_OUT) && size)
477 		/*
478 		 * Zero the buffer so the user always
479 		 * gets back something deterministic.
480 		 */
481 		memset(data, 0, size);
482 	else if (com&IOC_VOID)
483 		*(caddr_t *)data = SCARG(uap, data);
484 
485 	switch (com) {
486 
487 	case FIONBIO:
488 		if ((tmp = *(int *)data) != 0)
489 			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
490 		else
491 			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
492 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
493 		break;
494 
495 	case FIOASYNC:
496 		if ((tmp = *(int *)data) != 0)
497 			atomic_setbits_int(&fp->f_flag, FASYNC);
498 		else
499 			atomic_clearbits_int(&fp->f_flag, FASYNC);
500 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
501 		break;
502 
503 	default:
504 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
505 		break;
506 	}
507 	/*
508 	 * Copy any data to user, size was
509 	 * already set and checked above.
510 	 */
511 	if (error == 0 && (com&IOC_OUT) && size)
512 		error = copyout(data, SCARG(uap, data), size);
513 out:
514 	FRELE(fp, p);
515 	free(memp, M_IOCTLOPS, size);
516 	return (error);
517 }
518 
519 /*
520  * Select system call.
521  */
522 int
523 sys_select(struct proc *p, void *v, register_t *retval)
524 {
525 	struct sys_select_args /* {
526 		syscallarg(int) nd;
527 		syscallarg(fd_set *) in;
528 		syscallarg(fd_set *) ou;
529 		syscallarg(fd_set *) ex;
530 		syscallarg(struct timeval *) tv;
531 	} */ *uap = v;
532 
533 	struct timespec ts, *tsp = NULL;
534 	int error;
535 
536 	if (SCARG(uap, tv) != NULL) {
537 		struct timeval tv;
538 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
539 			return (error);
540 #ifdef KTRACE
541 		if (KTRPOINT(p, KTR_STRUCT))
542 			ktrreltimeval(p, &tv);
543 #endif
544 		if (tv.tv_sec < 0 || !timerisvalid(&tv))
545 			return (EINVAL);
546 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
547 		tsp = &ts;
548 	}
549 
550 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
551 	    SCARG(uap, ex), tsp, NULL, retval));
552 }
553 
554 int
555 sys_pselect(struct proc *p, void *v, register_t *retval)
556 {
557 	struct sys_pselect_args /* {
558 		syscallarg(int) nd;
559 		syscallarg(fd_set *) in;
560 		syscallarg(fd_set *) ou;
561 		syscallarg(fd_set *) ex;
562 		syscallarg(const struct timespec *) ts;
563 		syscallarg(const sigset_t *) mask;
564 	} */ *uap = v;
565 
566 	struct timespec ts, *tsp = NULL;
567 	sigset_t ss, *ssp = NULL;
568 	int error;
569 
570 	if (SCARG(uap, ts) != NULL) {
571 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
572 			return (error);
573 #ifdef KTRACE
574 		if (KTRPOINT(p, KTR_STRUCT))
575 			ktrreltimespec(p, &ts);
576 #endif
577 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
578 			return (EINVAL);
579 		tsp = &ts;
580 	}
581 	if (SCARG(uap, mask) != NULL) {
582 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
583 			return (error);
584 		ssp = &ss;
585 	}
586 
587 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
588 	    SCARG(uap, ex), tsp, ssp, retval));
589 }
590 
591 int
592 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
593     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
594 {
595 	struct kqueue_scan_state scan;
596 	struct timespec zerots = {};
597 	fd_mask bits[6];
598 	fd_set *pibits[3], *pobits[3];
599 	int error, ncollected = 0, nevents = 0;
600 	u_int ni;
601 
602 	if (nd < 0)
603 		return (EINVAL);
604 	if (nd > p->p_fd->fd_nfiles) {
605 		/* forgiving; slightly wrong */
606 		nd = p->p_fd->fd_nfiles;
607 	}
608 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
609 	if (ni > sizeof(bits[0])) {
610 		caddr_t mbits;
611 
612 		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
613 		pibits[0] = (fd_set *)&mbits[ni * 0];
614 		pibits[1] = (fd_set *)&mbits[ni * 1];
615 		pibits[2] = (fd_set *)&mbits[ni * 2];
616 		pobits[0] = (fd_set *)&mbits[ni * 3];
617 		pobits[1] = (fd_set *)&mbits[ni * 4];
618 		pobits[2] = (fd_set *)&mbits[ni * 5];
619 	} else {
620 		memset(bits, 0, sizeof(bits));
621 		pibits[0] = (fd_set *)&bits[0];
622 		pibits[1] = (fd_set *)&bits[1];
623 		pibits[2] = (fd_set *)&bits[2];
624 		pobits[0] = (fd_set *)&bits[3];
625 		pobits[1] = (fd_set *)&bits[4];
626 		pobits[2] = (fd_set *)&bits[5];
627 	}
628 
629 	kqpoll_init(nd);
630 
631 #define	getbits(name, x) \
632 	if (name && (error = copyin(name, pibits[x], ni))) \
633 		goto done;
634 	getbits(in, 0);
635 	getbits(ou, 1);
636 	getbits(ex, 2);
637 #undef	getbits
638 #ifdef KTRACE
639 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
640 		if (in) ktrfdset(p, pibits[0], ni);
641 		if (ou) ktrfdset(p, pibits[1], ni);
642 		if (ex) ktrfdset(p, pibits[2], ni);
643 	}
644 #endif
645 
646 	if (sigmask)
647 		dosigsuspend(p, *sigmask &~ sigcantmask);
648 
649 	/* Register kqueue events */
650 	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
651 	if (error != 0)
652 		goto done;
653 
654 	/*
655 	 * The poll/select family of syscalls has been designed to
656 	 * block when file descriptors are not available, even if
657 	 * there's nothing to wait for.
658 	 */
659 	if (nevents == 0 && ncollected == 0) {
660 		uint64_t nsecs = INFSLP;
661 
662 		if (timeout != NULL) {
663 			if (!timespecisset(timeout))
664 				goto done;
665 			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
666 		}
667 		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
668 		/* select is not restarted after signals... */
669 		if (error == ERESTART)
670 			error = EINTR;
671 		if (error == EWOULDBLOCK)
672 			error = 0;
673 		goto done;
674 	}
675 
676 	/* Do not block if registering found pending events. */
677 	if (ncollected > 0)
678 		timeout = &zerots;
679 
680 	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
681 	kqueue_scan_setup(&scan, p->p_kq);
682 	while (nevents > 0) {
683 		struct kevent kev[KQ_NEVENTS];
684 		int i, ready, count;
685 
686 		/* Maximum number of events per iteration */
687 		count = MIN(nitems(kev), nevents);
688 		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
689 #ifdef KTRACE
690 		if (KTRPOINT(p, KTR_STRUCT))
691 			ktrevent(p, kev, ready);
692 #endif
693 		/* Convert back events that are ready. */
694 		for (i = 0; i < ready && error == 0; i++)
695 			error = pselcollect(p, &kev[i], pobits, &ncollected);
696 		/*
697 		 * Stop if there was an error or if we had enough
698 		 * space to collect all events that were ready.
699 		 */
700 		if (error || ready < count)
701 			break;
702 
703 		nevents -= ready;
704 	}
705 	kqueue_scan_finish(&scan);
706 	*retval = ncollected;
707 done:
708 #define	putbits(name, x) \
709 	if (name && (error2 = copyout(pobits[x], name, ni))) \
710 		error = error2;
711 	if (error == 0) {
712 		int error2;
713 
714 		putbits(in, 0);
715 		putbits(ou, 1);
716 		putbits(ex, 2);
717 #undef putbits
718 #ifdef KTRACE
719 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
720 			if (in) ktrfdset(p, pobits[0], ni);
721 			if (ou) ktrfdset(p, pobits[1], ni);
722 			if (ex) ktrfdset(p, pobits[2], ni);
723 		}
724 #endif
725 	}
726 
727 	if (pibits[0] != (fd_set *)&bits[0])
728 		free(pibits[0], M_TEMP, 6 * ni);
729 
730 	kqpoll_done(nd);
731 
732 	return (error);
733 }
734 
735 /*
736  * Convert fd_set into kqueue events and register them on the
737  * per-thread queue.
738  */
739 int
740 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
741     int *nregistered, int *ncollected)
742 {
743 	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
744 	static const int evff[] = { 0, 0, NOTE_OOB };
745 	int msk, i, j, fd, nevents = 0, error = 0;
746 	struct kevent kev;
747 	fd_mask bits;
748 
749 	for (msk = 0; msk < 3; msk++) {
750 		for (i = 0; i < nfd; i += NFDBITS) {
751 			bits = pibits[msk]->fds_bits[i / NFDBITS];
752 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
753 				bits &= ~(1 << j);
754 
755 				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
756 				    fd, msk, p->p_kq_serial);
757 				EV_SET(&kev, fd, evf[msk],
758 				    EV_ADD|EV_ENABLE|__EV_SELECT,
759 				    evff[msk], 0, (void *)(p->p_kq_serial));
760 #ifdef KTRACE
761 				if (KTRPOINT(p, KTR_STRUCT))
762 					ktrevent(p, &kev, 1);
763 #endif
764 				error = kqueue_register(p->p_kq, &kev, 0, p);
765 				switch (error) {
766 				case 0:
767 					nevents++;
768 				/* FALLTHROUGH */
769 				case EOPNOTSUPP:/* No underlying kqfilter */
770 				case EINVAL:	/* Unimplemented filter */
771 				case EPERM:	/* Specific to FIFO and
772 						 * __EV_SELECT */
773 					error = 0;
774 					break;
775 				case EPIPE:	/* Specific to pipes */
776 					KASSERT(kev.filter == EVFILT_WRITE);
777 					FD_SET(kev.ident, pobits[1]);
778 					(*ncollected)++;
779 					error = 0;
780 					break;
781 				case ENXIO:	/* Device has been detached */
782 				default:
783 					goto bad;
784 				}
785 			}
786 		}
787 	}
788 
789 	*nregistered = nevents;
790 	return (0);
791 bad:
792 	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
793 	    kev.filter, error);
794 	return (error);
795 }
796 
797 /*
798  * Convert given kqueue event into corresponding select(2) bit.
799  */
800 int
801 pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
802     int *ncollected)
803 {
804 	if ((unsigned long)kevp->udata != p->p_kq_serial) {
805 		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
806 		    __func__, kevp, (int)kevp->ident,
807 		    (unsigned long)kevp->udata, p->p_kq_serial);
808 	}
809 
810 	if (kevp->flags & EV_ERROR) {
811 		DPRINTFN(2, "select fd %d filt %d error %d\n",
812 		    (int)kevp->ident, kevp->filter, (int)kevp->data);
813 		return (kevp->data);
814 	}
815 
816 	switch (kevp->filter) {
817 	case EVFILT_READ:
818 		FD_SET(kevp->ident, pobits[0]);
819 		break;
820 	case EVFILT_WRITE:
821 		FD_SET(kevp->ident, pobits[1]);
822 		break;
823 	case EVFILT_EXCEPT:
824 		FD_SET(kevp->ident, pobits[2]);
825 		break;
826 	default:
827 		KASSERT(0);
828 	}
829 	(*ncollected)++;
830 
831 	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
832 	return (0);
833 }
834 
835 /*
836  * Do a wakeup when a selectable event occurs.
837  */
838 void
839 selwakeup(struct selinfo *sip)
840 {
841 	KERNEL_LOCK();
842 	KNOTE(&sip->si_note, NOTE_SUBMIT);
843 	KERNEL_UNLOCK();
844 }
845 
846 /*
847  * Only copyout the revents field.
848  */
849 int
850 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
851 {
852 	int error = 0;
853 	u_int i = 0;
854 
855 	while (!error && i++ < nfds) {
856 		error = copyout(&pl->revents, &upl->revents,
857 		    sizeof(upl->revents));
858 		pl++;
859 		upl++;
860 	}
861 
862 	return (error);
863 }
864 
865 /*
866  * We are using the same mechanism as select only we encode/decode args
867  * differently.
868  */
869 int
870 sys_poll(struct proc *p, void *v, register_t *retval)
871 {
872 	struct sys_poll_args /* {
873 		syscallarg(struct pollfd *) fds;
874 		syscallarg(u_int) nfds;
875 		syscallarg(int) timeout;
876 	} */ *uap = v;
877 
878 	struct timespec ts, *tsp = NULL;
879 	int msec = SCARG(uap, timeout);
880 
881 	if (msec != INFTIM) {
882 		if (msec < 0)
883 			return (EINVAL);
884 		ts.tv_sec = msec / 1000;
885 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
886 		tsp = &ts;
887 	}
888 
889 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
890 	    retval));
891 }
892 
893 int
894 sys_ppoll(struct proc *p, void *v, register_t *retval)
895 {
896 	struct sys_ppoll_args /* {
897 		syscallarg(struct pollfd *) fds;
898 		syscallarg(u_int) nfds;
899 		syscallarg(const struct timespec *) ts;
900 		syscallarg(const sigset_t *) mask;
901 	} */ *uap = v;
902 
903 	int error;
904 	struct timespec ts, *tsp = NULL;
905 	sigset_t ss, *ssp = NULL;
906 
907 	if (SCARG(uap, ts) != NULL) {
908 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
909 			return (error);
910 #ifdef KTRACE
911 		if (KTRPOINT(p, KTR_STRUCT))
912 			ktrreltimespec(p, &ts);
913 #endif
914 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
915 			return (EINVAL);
916 		tsp = &ts;
917 	}
918 
919 	if (SCARG(uap, mask) != NULL) {
920 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
921 			return (error);
922 		ssp = &ss;
923 	}
924 
925 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
926 	    retval));
927 }
928 
929 int
930 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
931     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
932 {
933 	struct kqueue_scan_state scan;
934 	struct timespec zerots = {};
935 	struct pollfd pfds[4], *pl = pfds;
936 	int error, ncollected = 0, nevents = 0;
937 	size_t sz;
938 
939 	/* Standards say no more than MAX_OPEN; this is possibly better. */
940 	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
941 		return (EINVAL);
942 
943 	/* optimize for the default case, of a small nfds value */
944 	if (nfds > nitems(pfds)) {
945 		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
946 		    M_WAITOK | M_CANFAIL);
947 		if (pl == NULL)
948 			return (EINVAL);
949 	}
950 
951 	kqpoll_init(nfds);
952 
953 	sz = nfds * sizeof(*pl);
954 
955 	if ((error = copyin(fds, pl, sz)) != 0)
956 		goto bad;
957 
958 	if (sigmask)
959 		dosigsuspend(p, *sigmask &~ sigcantmask);
960 
961 	/* Register kqueue events */
962 	ppollregister(p, pl, nfds, &nevents, &ncollected);
963 
964 	/*
965 	 * The poll/select family of syscalls has been designed to
966 	 * block when file descriptors are not available, even if
967 	 * there's nothing to wait for.
968 	 */
969 	if (nevents == 0 && ncollected == 0) {
970 		uint64_t nsecs = INFSLP;
971 
972 		if (timeout != NULL) {
973 			if (!timespecisset(timeout))
974 				goto done;
975 			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
976 		}
977 
978 		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
979 		if (error == ERESTART)
980 			error = EINTR;
981 		if (error == EWOULDBLOCK)
982 			error = 0;
983 		goto done;
984 	}
985 
986 	/* Do not block if registering found pending events. */
987 	if (ncollected > 0)
988 		timeout = &zerots;
989 
990 	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
991 	kqueue_scan_setup(&scan, p->p_kq);
992 	while (nevents > 0) {
993 		struct kevent kev[KQ_NEVENTS];
994 		int i, ready, count;
995 
996 		/* Maximum number of events per iteration */
997 		count = MIN(nitems(kev), nevents);
998 		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
999 #ifdef KTRACE
1000 		if (KTRPOINT(p, KTR_STRUCT))
1001 			ktrevent(p, kev, ready);
1002 #endif
1003 		/* Convert back events that are ready. */
1004 		for (i = 0; i < ready; i++)
1005 			ncollected += ppollcollect(p, &kev[i], pl, nfds);
1006 
1007 		/*
1008 		 * Stop if there was an error or if we had enough
1009 		 * place to collect all events that were ready.
1010 		 */
1011 		if (error || ready < count)
1012 			break;
1013 
1014 		nevents -= ready;
1015 	}
1016 	kqueue_scan_finish(&scan);
1017 	*retval = ncollected;
1018 done:
1019 	/*
1020 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1021 	 *       ignored (since the whole point is to see what would block).
1022 	 */
1023 	switch (error) {
1024 	case EINTR:
1025 		error = pollout(pl, fds, nfds);
1026 		if (error == 0)
1027 			error = EINTR;
1028 		break;
1029 	case EWOULDBLOCK:
1030 	case 0:
1031 		error = pollout(pl, fds, nfds);
1032 		break;
1033 	}
1034 #ifdef KTRACE
1035 	if (KTRPOINT(p, KTR_STRUCT))
1036 		ktrpollfd(p, pl, nfds);
1037 #endif /* KTRACE */
1038 bad:
1039 	if (pl != pfds)
1040 		free(pl, M_TEMP, sz);
1041 
1042 	kqpoll_done(nfds);
1043 
1044 	return (error);
1045 }
1046 
1047 int
1048 ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
1049     struct pollfd *pl, unsigned int pollid)
1050 {
1051 	int i, error, nevents = 0;
1052 
1053 	KASSERT(pl->revents == 0);
1054 
1055 #ifdef KTRACE
1056 	if (KTRPOINT(p, KTR_STRUCT))
1057 		ktrevent(p, kevp, nkev);
1058 #endif
1059 	for (i = 0; i < nkev; i++, kevp++) {
1060 again:
1061 		error = kqueue_register(p->p_kq, kevp, pollid, p);
1062 		switch (error) {
1063 		case 0:
1064 			nevents++;
1065 			break;
1066 		case EOPNOTSUPP:/* No underlying kqfilter */
1067 		case EINVAL:	/* Unimplemented filter */
1068 			break;
1069 		case EBADF:	/* Bad file descriptor */
1070 			pl->revents |= POLLNVAL;
1071 			break;
1072 		case EPERM:	/* Specific to FIFO */
1073 			KASSERT(kevp->filter == EVFILT_WRITE);
1074 			if (nkev == 1) {
1075 				/*
1076 				 * If this is the only filter make sure
1077 				 * POLLHUP is passed to userland.
1078 				 */
1079 				kevp->filter = EVFILT_EXCEPT;
1080 				goto again;
1081 			}
1082 			break;
1083 		case EPIPE:	/* Specific to pipes */
1084 			KASSERT(kevp->filter == EVFILT_WRITE);
1085 			pl->revents |= POLLHUP;
1086 			break;
1087 		default:
1088 			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
1089 			    " %lu filt %d ERROR=%d\n",
1090 			    ((unsigned long)kevp->udata - p->p_kq_serial),
1091 			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
1092 			    error);
1093 			/* FALLTHROUGH */
1094 		case ENXIO:	/* Device has been detached */
1095 			pl->revents |= POLLERR;
1096 			break;
1097 		}
1098 	}
1099 
1100 	return (nevents);
1101 }
1102 
1103 /*
1104  * Convert pollfd into kqueue events and register them on the
1105  * per-thread queue.
1106  *
1107  * At most 3 events can correspond to a single pollfd.
1108  */
1109 void
1110 ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
1111     int *ncollected)
1112 {
1113 	int i, nkev, nevt, forcehup;
1114 	struct kevent kev[3], *kevp;
1115 
1116 	for (i = 0; i < nfds; i++) {
1117 		pl[i].events &= ~POLL_NOHUP;
1118 		pl[i].revents = 0;
1119 
1120 		if (pl[i].fd < 0)
1121 			continue;
1122 
1123 		/*
1124 		 * POLLHUP checking is implicit in the event filters.
1125 		 * However, the checking must be even if no events are
1126 		 * requested.
1127 		 */
1128 		forcehup = ((pl[i].events & ~POLLHUP) == 0);
1129 
1130 		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
1131 		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);
1132 
1133 		nevt = 0;
1134 		nkev = 0;
1135 		kevp = kev;
1136 		if (pl[i].events & (POLLIN | POLLRDNORM)) {
1137 			EV_SET(kevp, pl[i].fd, EVFILT_READ,
1138 			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1139 			    (void *)(p->p_kq_serial + i));
1140 			nkev++;
1141 			kevp++;
1142 		}
1143 		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1144 			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
1145 			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1146 			    (void *)(p->p_kq_serial + i));
1147 			nkev++;
1148 			kevp++;
1149 		}
1150 		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
1151 			int evff = forcehup ? 0 : NOTE_OOB;
1152 
1153 			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
1154 			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
1155 			    (void *)(p->p_kq_serial + i));
1156 			nkev++;
1157 			kevp++;
1158 		}
1159 
1160 		if (nkev == 0)
1161 			continue;
1162 
1163 		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);
1164 
1165 		if (pl[i].revents != 0)
1166 			(*ncollected)++;
1167 	}
1168 
1169 	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
1170 	    *ncollected);
1171 }
1172 
1173 /*
1174  * Convert given kqueue event into corresponding poll(2) revents bit.
1175  */
1176 int
1177 ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
1178 {
1179 	static struct timeval poll_errintvl = { 5, 0 };
1180 	static struct timeval poll_lasterr;
1181 	int already_seen;
1182 	unsigned long i;
1183 
1184 	/*  Extract poll array index */
1185 	i = (unsigned long)kevp->udata - p->p_kq_serial;
1186 
1187 	if (i >= nfds) {
1188 		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
1189 		    __func__, kevp, nfds,
1190 		    (unsigned long)kevp->udata, p->p_kq_serial);
1191 	}
1192 	if ((int)kevp->ident != pl[i].fd) {
1193 		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
1194 		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
1195 		    p->p_kq_serial);
1196 	}
1197 
1198 	/*
1199 	 * A given descriptor may already have generated an error
1200 	 * against another filter during kqueue_register().
1201 	 *
1202 	 * Make sure to set the appropriate flags but do not
1203 	 * increment `*retval' more than once.
1204 	 */
1205 	already_seen = (pl[i].revents != 0);
1206 
1207 	/* POLLNVAL preempts other events. */
1208 	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
1209 		pl[i].revents = POLLNVAL;
1210 		goto done;
1211 	} else if (pl[i].revents & POLLNVAL) {
1212 		goto done;
1213 	}
1214 
1215 	switch (kevp->filter) {
1216 	case EVFILT_READ:
1217 		if (kevp->flags & __EV_HUP)
1218 			pl[i].revents |= POLLHUP;
1219 		if (pl[i].events & (POLLIN | POLLRDNORM))
1220 			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
1221 		break;
1222 	case EVFILT_WRITE:
1223 		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
1224 		if (kevp->flags & __EV_HUP) {
1225 			pl[i].revents |= POLLHUP;
1226 		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1227 			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
1228 		}
1229 		break;
1230 	case EVFILT_EXCEPT:
1231 		if (kevp->flags & __EV_HUP) {
1232 			if (pl[i].events != 0 && pl[i].events != POLLOUT)
1233 				DPRINTFN(0, "weird events %x\n", pl[i].events);
1234 			pl[i].revents |= POLLHUP;
1235 			break;
1236 		}
1237 		if (pl[i].events & (POLLPRI | POLLRDBAND))
1238 			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
1239 		break;
1240 	default:
1241 		KASSERT(0);
1242 	}
1243 
1244 done:
1245 	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
1246 	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
1247 	    kevp->filter);
1248 
1249 	/*
1250 	 * Make noise about unclaimed events as they might indicate a bug
1251 	 * and can result in spurious-looking wakeups of poll(2).
1252 	 *
1253 	 * Live-locking within the system call should not happen because
1254 	 * the scan loop in doppoll() has an upper limit for the number
1255 	 * of events to process.
1256 	 */
1257 	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
1258 		printf("%s[%d]: poll index %lu fd %d events 0x%x "
1259 		    "filter %d/0x%x unclaimed\n",
1260 		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
1261 		    pl[i].events, kevp->filter, kevp->flags);
1262 	}
1263 
1264 	if (!already_seen && (pl[i].revents != 0))
1265 		return (1);
1266 
1267 	return (0);
1268 }
1269 
1270 /*
1271  * utrace system call
1272  */
1273 int
1274 sys_utrace(struct proc *curp, void *v, register_t *retval)
1275 {
1276 #ifdef KTRACE
1277 	struct sys_utrace_args /* {
1278 		syscallarg(const char *) label;
1279 		syscallarg(const void *) addr;
1280 		syscallarg(size_t) len;
1281 	} */ *uap = v;
1282 
1283 	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1284 	    SCARG(uap, len)));
1285 #else
1286 	return (0);
1287 #endif
1288 }
1289