/*	$OpenBSD: sys_generic.c,v 1.134 2020/12/26 14:26:48 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) do {						\
	if (kqpoll_debug > (v)) {					\
		printf("%s(%d): ", curproc->p_p->ps_comm,		\
		    curproc->p_tid);					\
		printf(x);						\
	}								\
} while (0)
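
/*
 * Note the strict comparison above: a DPRINTFN(2, ...) call only
 * prints once kqpoll_debug has been raised to at least 3.
 */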

int pselregister(struct proc *, fd_set *[], int, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[]);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
 done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
 done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
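	/*
	 * For example, FIONBIO is defined as _IOW('f', 126, int), so
	 * its IOC_IN bit is set and IOCPARM_LEN() yields sizeof(int):
	 * four bytes are copied in from userland below.
	 */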
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}
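
/*
 * Sleep channel and collision counter for the selrecord()/
 * doselwakeup() machinery that poll(2) still uses below.
 */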
int	selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init();

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, nd, &nevents);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
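	/*
	 * For example, select(0, NULL, NULL, NULL, &tv) is a
	 * long-standing userland idiom for a sub-second sleep.
	 */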
	if (nevents == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			*retval += pselcollect(p, &kev[i], pobits);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
 done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqueue_purge(p, p->p_kq);
	p->p_kq_serial += nd;

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], int nfd, int *nregistered)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL,
				    evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
				if (KTRPOINT(p, KTR_STRUCT))
					ktrevent(p, &kev, 1);
#endif
				error = kqueue_register(p->p_kq, &kev, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}
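
/*
 * The serial number that pselregister() stashes in each event's udata,
 * and that dopselect() bumps by nd when a call finishes, lets
 * pselcollect() recognize, and lazily delete, stale events that an
 * earlier call left on the per-thread kqueue.
 */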

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
{
#ifdef DIAGNOSTIC
	/* Filter out and lazily delete spurious events */
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		DPRINTFN(0, "select fd %u mismatched serial %lu\n",
		    (int)kevp->ident, p->p_kq_serial);
		kevp->flags = EV_DISABLE|EV_DELETE;
		kqueue_register(p->p_kq, kevp, p);
		return (0);
	}
#endif

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (1);
}
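
/*
 * Stock poll backends: seltrue() for devices that are always ready,
 * selfalse() for devices that never are.
 */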
int
seltrue(dev_t dev, int events, struct proc *p)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
selfalse(dev_t dev, int events, struct proc *p)
{

	return (0);
}

/*
 * Record a select request.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mytid;

	KERNEL_ASSERT_LOCKED();

	mytid = selector->p_tid;
	if (sip->si_seltid == mytid)
		return;
	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_seltid = mytid;
}
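
/*
 * A device's poll routine typically calls selrecord() when no event
 * is pending.  A minimal sketch (hypothetical driver, not from this
 * file):
 *
 *	int
 *	foopoll(dev_t dev, int events, struct proc *p)
 *	{
 *		struct foo_softc *sc = foo_lookup(dev);
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_count > 0)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(p, &sc->sc_rsel);
 *		}
 *		return (revents);
 *	}
 */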

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}

void
doselwakeup(struct selinfo *sip)
{
	struct proc *p;

	KERNEL_ASSERT_LOCKED();

	if (sip->si_seltid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = tfind(sip->si_seltid);
	sip->si_seltid = 0;
	if (p != NULL) {
		if (wakeup_proc(p, &selwait)) {
			/* nothing else to do */
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
	}
}

void
pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, pl++) {
		/* Check the file descriptor. */
		if (pl->fd < 0) {
			pl->revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
			pl->revents = POLLNVAL;
			n++;
			continue;
		}
		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
		FRELE(fp, p);
		if (pl->revents != 0)
			n++;
	}
	*retval = n;
}

/*
 * Only copyout the revents field; poll(2) must leave the rest of
 * the user's pollfd array untouched.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We use the same mechanism as select; only the encoding and
 * decoding of the arguments differs.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}
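	/* e.g. msec = 1500 yields ts = { .tv_sec = 1, .tv_nsec = 500000000 } */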

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	struct timespec elapsed, start, stop;
	uint64_t nsecs;
	int ncoll, i, s, error;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;
	}

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);
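
	/*
	 * Classic select-collision handling: nselcoll and P_SELECT
	 * detect a doselwakeup() that slipped in between pollscan()
	 * and the tsleep below; if either changed, rescan instead of
	 * sleeping through a wakeup that has already happened.
	 */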
retry:
	ncoll = nselcoll;
	atomic_setbits_int(&p->p_flag, P_SELECT);
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (timeout == NULL || timespecisset(timeout)) {
		if (timeout != NULL) {
			getnanouptime(&start);
			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
		} else
			nsecs = INFSLP;
		s = splhigh();
		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
			splx(s);
			goto retry;
		}
		atomic_clearbits_int(&p->p_flag, P_SELECT);
		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
		splx(s);
		if (timeout != NULL) {
			getnanouptime(&stop);
			timespecsub(&stop, &start, &elapsed);
			timespecsub(timeout, &elapsed, timeout);
			if (timeout->tv_sec < 0)
				timespecclear(timeout);
		}
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
	}

done:
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 *       ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);
	return (error);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}