xref: /netbsd-src/sys/kern/sys_generic.c (revision e6d6e05cb173f30287ab619b21120b27baa66ad6)
1 /*	$NetBSD: sys_generic.c,v 1.113 2008/03/05 18:09:58 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * Copyright (c) 1982, 1986, 1989, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  * (c) UNIX System Laboratories, Inc.
43  * All or some portions of this file are derived from material licensed
44  * to the University of California by American Telephone and Telegraph
45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46  * the permission of UNIX System Laboratories, Inc.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  * 3. Neither the name of the University nor the names of its contributors
57  *    may be used to endorse or promote products derived from this software
58  *    without specific prior written permission.
59  *
60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70  * SUCH DAMAGE.
71  *
72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
73  */
74 
75 /*
76  * System calls relating to files.
77  */
78 
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.113 2008/03/05 18:09:58 ad Exp $");
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99 
100 #include <uvm/uvm_extern.h>
101 
/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);

/* Global state for select()/poll(). */
kmutex_t	select_lock;	/* serializes l_selflag and selinfo state */
kcondvar_t	select_cv;	/* waiters block here in SEL_BLOCKING state */
int		nselcoll;	/* collision generation count, bumped by selnotify() */
114 
/*
 * Read system call: read(2).
 *
 * Look up the descriptor, verify it is open for reading, then hand off
 * to dofileread(), which performs the transfer and drops the file
 * reference taken here.
 */
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(void *)	buf;
		syscallarg(size_t)	nbyte;
	} */
	int		fd;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	/* fd_getfile() returns the file locked on success. */
	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FREAD) == 0) {
		/* Not open for reading: drop the lock taken above. */
		FILE_UNLOCK(fp);
		return (EBADF);
	}

	/* Mark the file in use; dofileread() balances with FILE_UNUSE. */
	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
150 
151 int
152 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
153 	off_t *offset, int flags, register_t *retval)
154 {
155 	struct iovec aiov;
156 	struct uio auio;
157 	size_t cnt;
158 	int error;
159 	lwp_t *l;
160 
161 	l = curlwp;
162 
163 	aiov.iov_base = (void *)buf;
164 	aiov.iov_len = nbyte;
165 	auio.uio_iov = &aiov;
166 	auio.uio_iovcnt = 1;
167 	auio.uio_resid = nbyte;
168 	auio.uio_rw = UIO_READ;
169 	auio.uio_vmspace = l->l_proc->p_vmspace;
170 
171 	/*
172 	 * Reads return ssize_t because -1 is returned on error.  Therefore
173 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
174 	 * values.
175 	 */
176 	if (auio.uio_resid > SSIZE_MAX) {
177 		error = EINVAL;
178 		goto out;
179 	}
180 
181 	cnt = auio.uio_resid;
182 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
183 	if (error)
184 		if (auio.uio_resid != cnt && (error == ERESTART ||
185 		    error == EINTR || error == EWOULDBLOCK))
186 			error = 0;
187 	cnt -= auio.uio_resid;
188 	ktrgenio(fd, UIO_READ, buf, cnt, error);
189 	*retval = cnt;
190  out:
191 	FILE_UNUSE(fp, l);
192 	return (error);
193 }
194 
195 /*
196  * Scatter read system call.
197  */
198 int
199 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
200 {
201 	/* {
202 		syscallarg(int)				fd;
203 		syscallarg(const struct iovec *)	iovp;
204 		syscallarg(int)				iovcnt;
205 	} */
206 
207 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
208 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
209 }
210 
211 int
212 do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
213     off_t *offset, int flags, register_t *retval)
214 {
215 	struct uio	auio;
216 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
217 	int		i, error;
218 	size_t		cnt;
219 	u_int		iovlen;
220 	struct file	*fp;
221 	struct iovec	*ktriov = NULL;
222 	lwp_t		*l;
223 
224 	if (iovcnt == 0)
225 		return EINVAL;
226 
227 	l = curlwp;
228 
229 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
230 		return EBADF;
231 
232 	if ((fp->f_flag & FREAD) == 0) {
233 		FILE_UNLOCK(fp);
234 		return EBADF;
235 	}
236 
237 	FILE_USE(fp);
238 
239 	if (offset == NULL)
240 		offset = &fp->f_offset;
241 	else {
242 		struct vnode *vp = fp->f_data;
243 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
244 			error = ESPIPE;
245 			goto out;
246 		}
247 		/*
248 		 * Test that the device is seekable ?
249 		 * XXX This works because no file systems actually
250 		 * XXX take any action on the seek operation.
251 		 */
252 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
253 		if (error != 0)
254 			goto out;
255 	}
256 
257 	iovlen = iovcnt * sizeof(struct iovec);
258 	if (flags & FOF_IOV_SYSSPACE)
259 		iov = __UNCONST(iovp);
260 	else {
261 		iov = aiov;
262 		if ((u_int)iovcnt > UIO_SMALLIOV) {
263 			if ((u_int)iovcnt > IOV_MAX) {
264 				error = EINVAL;
265 				goto out;
266 			}
267 			iov = kmem_alloc(iovlen, KM_SLEEP);
268 			if (iov == NULL) {
269 				error = ENOMEM;
270 				goto out;
271 			}
272 			needfree = iov;
273 		}
274 		error = copyin(iovp, iov, iovlen);
275 		if (error)
276 			goto done;
277 	}
278 
279 	auio.uio_iov = iov;
280 	auio.uio_iovcnt = iovcnt;
281 	auio.uio_rw = UIO_READ;
282 	auio.uio_vmspace = l->l_proc->p_vmspace;
283 
284 	auio.uio_resid = 0;
285 	for (i = 0; i < iovcnt; i++, iov++) {
286 		auio.uio_resid += iov->iov_len;
287 		/*
288 		 * Reads return ssize_t because -1 is returned on error.
289 		 * Therefore we must restrict the length to SSIZE_MAX to
290 		 * avoid garbage return values.
291 		 */
292 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
293 			error = EINVAL;
294 			goto done;
295 		}
296 	}
297 
298 	/*
299 	 * if tracing, save a copy of iovec
300 	 */
301 	if (ktrpoint(KTR_GENIO))  {
302 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
303 		if (ktriov != NULL)
304 			memcpy(ktriov, auio.uio_iov, iovlen);
305 	}
306 
307 	cnt = auio.uio_resid;
308 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
309 	if (error)
310 		if (auio.uio_resid != cnt && (error == ERESTART ||
311 		    error == EINTR || error == EWOULDBLOCK))
312 			error = 0;
313 	cnt -= auio.uio_resid;
314 	*retval = cnt;
315 
316 	if (ktriov != NULL) {
317 		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
318 		kmem_free(ktriov, iovlen);
319 	}
320 
321  done:
322 	if (needfree)
323 		kmem_free(needfree, iovlen);
324  out:
325 	FILE_UNUSE(fp, l);
326 	return (error);
327 }
328 
/*
 * Write system call: write(2).
 *
 * Look up the descriptor, verify it is open for writing, then hand off
 * to dofilewrite(), which performs the transfer and drops the file
 * reference taken here.
 */
int
sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)			fd;
		syscallarg(const void *)	buf;
		syscallarg(size_t)		nbyte;
	} */
	int		fd;
	struct file	*fp;

	fd = SCARG(uap, fd);

	/* fd_getfile() returns the file locked on success. */
	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FWRITE) == 0) {
		/* Not open for writing: drop the lock taken above. */
		FILE_UNLOCK(fp);
		return (EBADF);
	}

	/* Mark the file in use; dofilewrite() balances with FILE_UNUSE. */
	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
359 
360 int
361 dofilewrite(int fd, struct file *fp, const void *buf,
362 	size_t nbyte, off_t *offset, int flags, register_t *retval)
363 {
364 	struct iovec aiov;
365 	struct uio auio;
366 	size_t cnt;
367 	int error;
368 	lwp_t *l;
369 
370 	l = curlwp;
371 
372 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
373 	aiov.iov_len = nbyte;
374 	auio.uio_iov = &aiov;
375 	auio.uio_iovcnt = 1;
376 	auio.uio_resid = nbyte;
377 	auio.uio_rw = UIO_WRITE;
378 	auio.uio_vmspace = l->l_proc->p_vmspace;
379 
380 	/*
381 	 * Writes return ssize_t because -1 is returned on error.  Therefore
382 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
383 	 * values.
384 	 */
385 	if (auio.uio_resid > SSIZE_MAX) {
386 		error = EINVAL;
387 		goto out;
388 	}
389 
390 	cnt = auio.uio_resid;
391 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
392 	if (error) {
393 		if (auio.uio_resid != cnt && (error == ERESTART ||
394 		    error == EINTR || error == EWOULDBLOCK))
395 			error = 0;
396 		if (error == EPIPE) {
397 			mutex_enter(&proclist_mutex);
398 			psignal(l->l_proc, SIGPIPE);
399 			mutex_exit(&proclist_mutex);
400 		}
401 	}
402 	cnt -= auio.uio_resid;
403 	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
404 	*retval = cnt;
405  out:
406 	FILE_UNUSE(fp, l);
407 	return (error);
408 }
409 
410 /*
411  * Gather write system call
412  */
413 int
414 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
415 {
416 	/* {
417 		syscallarg(int)				fd;
418 		syscallarg(const struct iovec *)	iovp;
419 		syscallarg(int)				iovcnt;
420 	} */
421 
422 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
423 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
424 }
425 
426 int
427 do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
428     off_t *offset, int flags, register_t *retval)
429 {
430 	struct uio	auio;
431 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
432 	int		i, error;
433 	size_t		cnt;
434 	u_int		iovlen;
435 	struct file	*fp;
436 	struct iovec	*ktriov = NULL;
437 	lwp_t		*l;
438 
439 	l = curlwp;
440 
441 	if (iovcnt == 0)
442 		return EINVAL;
443 
444 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
445 		return EBADF;
446 
447 	if ((fp->f_flag & FWRITE) == 0) {
448 		FILE_UNLOCK(fp);
449 		return EBADF;
450 	}
451 
452 	FILE_USE(fp);
453 
454 	if (offset == NULL)
455 		offset = &fp->f_offset;
456 	else {
457 		struct vnode *vp = fp->f_data;
458 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
459 			error = ESPIPE;
460 			goto out;
461 		}
462 		/*
463 		 * Test that the device is seekable ?
464 		 * XXX This works because no file systems actually
465 		 * XXX take any action on the seek operation.
466 		 */
467 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
468 		if (error != 0)
469 			goto out;
470 	}
471 
472 	iovlen = iovcnt * sizeof(struct iovec);
473 	if (flags & FOF_IOV_SYSSPACE)
474 		iov = __UNCONST(iovp);
475 	else {
476 		iov = aiov;
477 		if ((u_int)iovcnt > UIO_SMALLIOV) {
478 			if ((u_int)iovcnt > IOV_MAX) {
479 				error = EINVAL;
480 				goto out;
481 			}
482 			iov = kmem_alloc(iovlen, KM_SLEEP);
483 			if (iov == NULL) {
484 				error = ENOMEM;
485 				goto out;
486 			}
487 			needfree = iov;
488 		}
489 		error = copyin(iovp, iov, iovlen);
490 		if (error)
491 			goto done;
492 	}
493 
494 	auio.uio_iov = iov;
495 	auio.uio_iovcnt = iovcnt;
496 	auio.uio_rw = UIO_WRITE;
497 	auio.uio_vmspace = curproc->p_vmspace;
498 
499 	auio.uio_resid = 0;
500 	for (i = 0; i < iovcnt; i++, iov++) {
501 		auio.uio_resid += iov->iov_len;
502 		/*
503 		 * Writes return ssize_t because -1 is returned on error.
504 		 * Therefore we must restrict the length to SSIZE_MAX to
505 		 * avoid garbage return values.
506 		 */
507 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
508 			error = EINVAL;
509 			goto done;
510 		}
511 	}
512 
513 	/*
514 	 * if tracing, save a copy of iovec
515 	 */
516 	if (ktrpoint(KTR_GENIO))  {
517 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
518 		if (ktriov != NULL)
519 			memcpy(ktriov, auio.uio_iov, iovlen);
520 	}
521 
522 	cnt = auio.uio_resid;
523 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
524 	if (error) {
525 		if (auio.uio_resid != cnt && (error == ERESTART ||
526 		    error == EINTR || error == EWOULDBLOCK))
527 			error = 0;
528 		if (error == EPIPE) {
529 			mutex_enter(&proclist_mutex);
530 			psignal(l->l_proc, SIGPIPE);
531 			mutex_exit(&proclist_mutex);
532 		}
533 	}
534 	cnt -= auio.uio_resid;
535 	*retval = cnt;
536 
537 	if (ktriov != NULL) {
538 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
539 		kmem_free(ktriov, iovlen);
540 	}
541 
542  done:
543 	if (needfree)
544 		kmem_free(needfree, iovlen);
545  out:
546 	FILE_UNUSE(fp, l);
547 	return (error);
548 }
549 
/*
 * Ioctl system call: ioctl(2).
 *
 * Decode the command word: the high bits of "com" encode the transfer
 * direction (IOC_IN/IOC_OUT/IOC_VOID) and the argument size, which
 * drives the copyin before and copyout after the fo_ioctl call.
 * Arguments up to STK_PARAMS bytes are staged on the stack, larger
 * ones in an allocated buffer.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* initialized: the out path reads com */
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX manipulate the close-on-exec flag in the
	 * descriptor table itself, so they are handled here without
	 * involving the file's fo_ioctl routine.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Argument too large for the stack buffer: allocate. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before telling the file object. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Keep f_flag in sync before telling the file object. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/*
	 * Normalize in-kernel sentinel errors: -1 (diagnosed below) and
	 * EPASSTHROUGH both become ENOTTY for the caller.
	 */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
691 
692 /*
693  * Select system call.
694  */
695 int
696 sys_pselect(struct lwp *l, const struct sys_pselect_args *uap, register_t *retval)
697 {
698 	/* {
699 		syscallarg(int)				nd;
700 		syscallarg(fd_set *)			in;
701 		syscallarg(fd_set *)			ou;
702 		syscallarg(fd_set *)			ex;
703 		syscallarg(const struct timespec *)	ts;
704 		syscallarg(sigset_t *)			mask;
705 	} */
706 	struct timespec	ats;
707 	struct timeval	atv, *tv = NULL;
708 	sigset_t	amask, *mask = NULL;
709 	int		error;
710 
711 	if (SCARG(uap, ts)) {
712 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
713 		if (error)
714 			return error;
715 		atv.tv_sec = ats.tv_sec;
716 		atv.tv_usec = ats.tv_nsec / 1000;
717 		tv = &atv;
718 	}
719 	if (SCARG(uap, mask) != NULL) {
720 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
721 		if (error)
722 			return error;
723 		mask = &amask;
724 	}
725 
726 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
727 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
728 }
729 
/*
 * Prime a select/poll timeout: validate *tv and record the current
 * monotonic time in *sleeptv for later use by gettimeleft().
 * Returns -1 if the timeval is invalid, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
738 
739 int
740 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
741 {
742 	/*
743 	 * We have to recalculate the timeout on every retry.
744 	 */
745 	struct timeval slepttv;
746 	/*
747 	 * reduce tv by elapsed time
748 	 * based on monotonic time scale
749 	 */
750 	getmicrouptime(&slepttv);
751 	timeradd(tv, sleeptv, tv);
752 	timersub(tv, &slepttv, tv);
753 	*sleeptv = slepttv;
754 	return tvtohz(tv);
755 }
756 
757 int
758 sys_select(struct lwp *l, const struct sys_select_args *uap, register_t *retval)
759 {
760 	/* {
761 		syscallarg(int)			nd;
762 		syscallarg(fd_set *)		in;
763 		syscallarg(fd_set *)		ou;
764 		syscallarg(fd_set *)		ex;
765 		syscallarg(struct timeval *)	tv;
766 	} */
767 	struct timeval atv, *tv = NULL;
768 	int error;
769 
770 	if (SCARG(uap, tv)) {
771 		error = copyin(SCARG(uap, tv), (void *)&atv,
772 			sizeof(atv));
773 		if (error)
774 			return error;
775 		tv = &atv;
776 	}
777 
778 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
779 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
780 }
781 
/*
 * Common code for select(2) and pselect(2): copy in the request sets,
 * scan the descriptors, block until ready or timeout, and copy out the
 * result sets.  "mask", if non-NULL, is a temporary signal mask
 * installed for the duration of the wait (pselect semantics).
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/*
	 * Room for six bitmaps of ni bytes each: three input sets
	 * (in/ou/ex) followed by three output sets.  Small requests use
	 * the stack; larger ones are allocated.
	 */
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in the three request sets; a NULL set counts as empty. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the temporary signal mask for pselect(), if given. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/block loop.  l_selflag detects wakeups that race with the
	 * unlocked scan: selnotify() sets it back to SEL_RESET, in which
	 * case we rescan instead of blocking.  ncoll vs. nselcoll
	 * catches collisions among multiple waiters the same way.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the caller's signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
882 
883 int
884 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
885 	register_t *retval)
886 {
887 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
888 			       POLLWRNORM | POLLHUP | POLLERR,
889 			       POLLRDBAND };
890 	proc_t *p = l->l_proc;
891 	struct filedesc	*fdp;
892 	int msk, i, j, fd, n;
893 	fd_mask ibits, obits;
894 	struct file *fp;
895 
896 	fdp = p->p_fd;
897 	n = 0;
898 	for (msk = 0; msk < 3; msk++) {
899 		for (i = 0; i < nfd; i += NFDBITS) {
900 			ibits = *ibitp++;
901 			obits = 0;
902 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
903 				ibits &= ~(1 << j);
904 				if ((fp = fd_getfile(fdp, fd)) == NULL)
905 					return (EBADF);
906 				FILE_USE(fp);
907 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
908 					obits |= (1 << j);
909 					n++;
910 				}
911 				FILE_UNUSE(fp, l);
912 			}
913 			*obitp++ = obits;
914 		}
915 	}
916 	*retval = n;
917 	return (0);
918 }
919 
920 /*
921  * Poll system call.
922  */
923 int
924 sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
925 {
926 	/* {
927 		syscallarg(struct pollfd *)	fds;
928 		syscallarg(u_int)		nfds;
929 		syscallarg(int)			timeout;
930 	} */
931 	struct timeval	atv, *tv = NULL;
932 
933 	if (SCARG(uap, timeout) != INFTIM) {
934 		atv.tv_sec = SCARG(uap, timeout) / 1000;
935 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
936 		tv = &atv;
937 	}
938 
939 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
940 		tv, NULL);
941 }
942 
943 /*
944  * Poll system call.
945  */
946 int
947 sys_pollts(struct lwp *l, const struct sys_pollts_args *uap, register_t *retval)
948 {
949 	/* {
950 		syscallarg(struct pollfd *)		fds;
951 		syscallarg(u_int)			nfds;
952 		syscallarg(const struct timespec *)	ts;
953 		syscallarg(const sigset_t *)		mask;
954 	} */
955 	struct timespec	ats;
956 	struct timeval	atv, *tv = NULL;
957 	sigset_t	amask, *mask = NULL;
958 	int		error;
959 
960 	if (SCARG(uap, ts)) {
961 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
962 		if (error)
963 			return error;
964 		atv.tv_sec = ats.tv_sec;
965 		atv.tv_usec = ats.tv_nsec / 1000;
966 		tv = &atv;
967 	}
968 	if (SCARG(uap, mask)) {
969 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
970 		if (error)
971 			return error;
972 		mask = &amask;
973 	}
974 
975 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
976 		tv, mask);
977 }
978 
/*
 * Common code for poll(2) and pollts(2): copy in the pollfd array,
 * scan the descriptors, block until ready or timeout, and copy the
 * array (with revents filled in) back out.  "mask", if non-NULL, is a
 * temporary signal mask installed for the duration of the wait.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	/* Small arrays are staged on the stack, larger ones allocated. */
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the temporary signal mask for pollts(), if given. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/block loop, identical in structure to selcommon():
	 * l_selflag and the nselcoll snapshot detect wakeups that race
	 * with the unlocked scan, forcing a rescan instead of a block.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the caller's signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1062 
1063 int
1064 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1065 {
1066 	proc_t		*p = l->l_proc;
1067 	struct filedesc	*fdp;
1068 	int		i, n;
1069 	struct file	*fp;
1070 
1071 	fdp = p->p_fd;
1072 	n = 0;
1073 	for (i = 0; i < nfd; i++, fds++) {
1074 		if (fds->fd >= fdp->fd_nfiles) {
1075 			fds->revents = POLLNVAL;
1076 			n++;
1077 		} else if (fds->fd < 0) {
1078 			fds->revents = 0;
1079 		} else {
1080 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1081 				fds->revents = POLLNVAL;
1082 				n++;
1083 			} else {
1084 				FILE_USE(fp);
1085 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1086 				    fds->events | POLLERR | POLLHUP, l);
1087 				if (fds->revents != 0)
1088 					n++;
1089 				FILE_UNUSE(fp, l);
1090 			}
1091 		}
1092 	}
1093 	*retval = n;
1094 	return (0);
1095 }
1096 
1097 /*ARGSUSED*/
1098 int
1099 seltrue(dev_t dev, int events, lwp_t *l)
1100 {
1101 
1102 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1103 }
1104 
/*
 * Record a select request: remember that "selector" is waiting on the
 * object described by sip, so a later selnotify() on sip can wake it.
 * Only the first waiter is recorded by name; any further distinct
 * waiters just set sel_collision, which makes selnotify() broadcast.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
1123 
/*
 * Do a wakeup when a selectable event occurs.  The "events" argument
 * is currently unused here; "knhint" is passed through to KNOTE for
 * kqueue listeners.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		/* SEL_RESET forces the waiter to rescan, not block. */
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);

	/* Notify any kqueue listeners as well. */
	KNOTE(&sip->sel_klist, knhint);
}
1166 
/*
 * Remove an LWP from all objects that it is waiting for.  Called with
 * select_lock held.  The chain entries are not unlinked individually:
 * clearing sel_lwp suffices because the list head is reinitialized
 * (SLIST_INIT) at the start of the next selcommon()/pollcommon() scan.
 */
void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}
1183 
/*
 * Initialize the select/poll system calls: set up the global lock and
 * condition variable.  Called once during system startup.
 */
void
selsysinit(void)
{

	/* NOTE(review): IPL_VM presumably allows selnotify() callers
	 * running above IPL_NONE to take the lock -- confirm. */
	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}
1194 
/*
 * Initialize a selector.  All-zeroes is the idle state: no recorded
 * waiter (sel_lwp == NULL) and no collision pending.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}
1204 
/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/* Unlocked fast path: usually there is no registered waiter. */
	if (sip->sel_lwp == NULL)
		return;

	mutex_enter(&select_lock);
	/* Re-check under the lock before unlinking. */
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}
1226