xref: /netbsd-src/sys/kern/sys_generic.c (revision 8b0f9554ff8762542c4defc4f70e1eb76fb508fa)
1 /*	$NetBSD: sys_generic.c,v 1.109 2007/12/05 07:06:55 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * Copyright (c) 1982, 1986, 1989, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  * (c) UNIX System Laboratories, Inc.
43  * All or some portions of this file are derived from material licensed
44  * to the University of California by American Telephone and Telegraph
45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46  * the permission of UNIX System Laboratories, Inc.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  * 3. Neither the name of the University nor the names of its contributors
57  *    may be used to endorse or promote products derived from this software
58  *    without specific prior written permission.
59  *
60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70  * SUCH DAMAGE.
71  *
72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
73  */
74 
75 /*
76  * System calls relating to files.
77  */
78 
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.109 2007/12/05 07:06:55 ad Exp $");
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99 
100 #include <uvm/uvm_extern.h>
101 
/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */

/* Local helpers: descriptor scanners and wait-list cleanup. */
static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

/* Global state for select()/poll(). */
kmutex_t	select_lock;	/* guards l_selflag, l_selwait, sel_* fields */
kcondvar_t	select_cv;	/* all blocked selectors/pollers sleep here */
int		nselcoll;	/* collision generation count, bumped on wakeup */
/*
 * Read system call.
 */
/* ARGSUSED */
int
sys_read(lwp_t *l, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int)		fd;
		syscallarg(void *)	buf;
		syscallarg(size_t)	nbyte;
	} */ *uap = v;
	int		fd;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	/*
	 * fd_getfile() returns the file with f_lock held (the error
	 * path below releases it explicitly).
	 */
	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FREAD) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	/* Mark the file in use; presumably also drops f_lock — see file(9). */
	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
151 
/*
 * Common code for read(2): transfer up to 'nbyte' bytes from 'fp' into
 * the user buffer 'buf', updating *offset.  On success *retval holds the
 * byte count.  The file is always FILE_UNUSE'd before return, even on
 * error — callers must not release it themselves.
 */
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
	off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	/* Describe the user's buffer as a single-segment uio. */
	aiov.iov_base = (void *)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	/*
	 * If the read was interrupted after some bytes were transferred,
	 * report the partial transfer as success rather than the error.
	 */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;		/* bytes actually read */
	ktrgenio(fd, UIO_READ, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
195 
196 /*
197  * Scatter read system call.
198  */
199 int
200 sys_readv(lwp_t *l, void *v, register_t *retval)
201 {
202 	struct sys_readv_args /* {
203 		syscallarg(int)				fd;
204 		syscallarg(const struct iovec *)	iovp;
205 		syscallarg(int)				iovcnt;
206 	} */ *uap = v;
207 
208 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
209 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
210 }
211 
/*
 * Common code for readv(2) and kernel-internal scatter reads.
 *
 * fd/iovp/iovcnt describe the transfer; 'offset' selects positioned
 * I/O (NULL means use and update the file's own offset).  'flags'
 * may include FOF_IOV_SYSSPACE, meaning iovp already points at
 * kernel-space iovecs and no copyin is performed.  On success,
 * *retval holds the number of bytes transferred.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int		i, error;
	size_t		cnt;
	u_int		iovlen;
	struct file	*fp;
	struct iovec	*ktriov = NULL;
	lwp_t		*l;

	if (iovcnt == 0)
		return EINVAL;

	l = curlwp;

	/* fd_getfile() returns with f_lock held on success. */
	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		mutex_exit(&fp->f_lock);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* Positioned I/O only makes sense on seekable vnodes. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		/* Small vectors live on the stack; large ones are allocated. */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO))  {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	/* A partial transfer interrupted by a signal counts as success. */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;		/* bytes actually read */
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
329 
/*
 * Write system call
 */
int
sys_write(lwp_t *l, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int)			fd;
		syscallarg(const void *)	buf;
		syscallarg(size_t)		nbyte;
	} */ *uap = v;
	int		fd;
	struct file	*fp;

	fd = SCARG(uap, fd);

	/* fd_getfile() returns with f_lock held on success. */
	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FWRITE) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
360 
/*
 * Common code for write(2): transfer up to 'nbyte' bytes from the user
 * buffer 'buf' to 'fp', updating *offset.  On success *retval holds the
 * byte count.  Delivers SIGPIPE on EPIPE, as POSIX requires for writes
 * to broken pipes.  Always FILE_UNUSEs the file before return.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	/* Describe the user's buffer as a single-segment uio. */
	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer interrupted by a signal is a success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: signal the process as well as failing. */
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;		/* bytes actually written */
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
410 
411 /*
412  * Gather write system call
413  */
414 int
415 sys_writev(lwp_t *l, void *v, register_t *retval)
416 {
417 	struct sys_writev_args /* {
418 		syscallarg(int)				fd;
419 		syscallarg(const struct iovec *)	iovp;
420 		syscallarg(int)				iovcnt;
421 	} */ *uap = v;
422 
423 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
424 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
425 }
426 
427 int
428 do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
429     off_t *offset, int flags, register_t *retval)
430 {
431 	struct uio	auio;
432 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
433 	int		i, error;
434 	size_t		cnt;
435 	u_int		iovlen;
436 	struct file	*fp;
437 	struct iovec	*ktriov = NULL;
438 	lwp_t		*l;
439 
440 	l = curlwp;
441 
442 	if (iovcnt == 0)
443 		return EINVAL;
444 
445 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
446 		return EBADF;
447 
448 	if ((fp->f_flag & FWRITE) == 0) {
449 		mutex_exit(&fp->f_lock);
450 		return EBADF;
451 	}
452 
453 	FILE_USE(fp);
454 
455 	if (offset == NULL)
456 		offset = &fp->f_offset;
457 	else {
458 		struct vnode *vp = fp->f_data;
459 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
460 			error = ESPIPE;
461 			goto out;
462 		}
463 		/*
464 		 * Test that the device is seekable ?
465 		 * XXX This works because no file systems actually
466 		 * XXX take any action on the seek operation.
467 		 */
468 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
469 		if (error != 0)
470 			goto out;
471 	}
472 
473 	iovlen = iovcnt * sizeof(struct iovec);
474 	if (flags & FOF_IOV_SYSSPACE)
475 		iov = __UNCONST(iovp);
476 	else {
477 		iov = aiov;
478 		if ((u_int)iovcnt > UIO_SMALLIOV) {
479 			if ((u_int)iovcnt > IOV_MAX) {
480 				error = EINVAL;
481 				goto out;
482 			}
483 			iov = kmem_alloc(iovlen, KM_SLEEP);
484 			if (iov == NULL) {
485 				error = ENOMEM;
486 				goto out;
487 			}
488 			needfree = iov;
489 		}
490 		error = copyin(iovp, iov, iovlen);
491 		if (error)
492 			goto done;
493 	}
494 
495 	auio.uio_iov = iov;
496 	auio.uio_iovcnt = iovcnt;
497 	auio.uio_rw = UIO_WRITE;
498 	auio.uio_vmspace = curproc->p_vmspace;
499 
500 	auio.uio_resid = 0;
501 	for (i = 0; i < iovcnt; i++, iov++) {
502 		auio.uio_resid += iov->iov_len;
503 		/*
504 		 * Writes return ssize_t because -1 is returned on error.
505 		 * Therefore we must restrict the length to SSIZE_MAX to
506 		 * avoid garbage return values.
507 		 */
508 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
509 			error = EINVAL;
510 			goto done;
511 		}
512 	}
513 
514 	/*
515 	 * if tracing, save a copy of iovec
516 	 */
517 	if (ktrpoint(KTR_GENIO))  {
518 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
519 		if (ktriov != NULL)
520 			memcpy(ktriov, auio.uio_iov, iovlen);
521 	}
522 
523 	cnt = auio.uio_resid;
524 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
525 	if (error) {
526 		if (auio.uio_resid != cnt && (error == ERESTART ||
527 		    error == EINTR || error == EWOULDBLOCK))
528 			error = 0;
529 		if (error == EPIPE) {
530 			mutex_enter(&proclist_mutex);
531 			psignal(l->l_proc, SIGPIPE);
532 			mutex_exit(&proclist_mutex);
533 		}
534 	}
535 	cnt -= auio.uio_resid;
536 	*retval = cnt;
537 
538 	if (ktriov != NULL) {
539 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
540 		kmem_free(ktriov, iovlen);
541 	}
542 
543  done:
544 	if (needfree)
545 		kmem_free(needfree, iovlen);
546  out:
547 	FILE_UNUSE(fp, l);
548 	return (error);
549 }
550 
/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */ *uap = v;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* The error path below reads 'com'; keep it defined. */
		com = 0;
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX only touch the close-on-exec flag in the
	 * descriptor table; they never reach the file's ioctl routine.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	/* Small arguments use the on-stack buffer, large ones are allocated. */
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-length IOC_IN: pass the raw pointer value. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Update f_flag under f_lock, then notify the file itself. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* Driver bug: -1 is not a valid errno.  Log it, map to ENOTTY. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
692 
693 /*
694  * Select system call.
695  */
696 int
697 sys_pselect(lwp_t *l, void *v, register_t *retval)
698 {
699 	struct sys_pselect_args /* {
700 		syscallarg(int)				nd;
701 		syscallarg(fd_set *)			in;
702 		syscallarg(fd_set *)			ou;
703 		syscallarg(fd_set *)			ex;
704 		syscallarg(const struct timespec *)	ts;
705 		syscallarg(sigset_t *)			mask;
706 	} */ * const uap = v;
707 	struct timespec	ats;
708 	struct timeval	atv, *tv = NULL;
709 	sigset_t	amask, *mask = NULL;
710 	int		error;
711 
712 	if (SCARG(uap, ts)) {
713 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
714 		if (error)
715 			return error;
716 		atv.tv_sec = ats.tv_sec;
717 		atv.tv_usec = ats.tv_nsec / 1000;
718 		tv = &atv;
719 	}
720 	if (SCARG(uap, mask) != NULL) {
721 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
722 		if (error)
723 			return error;
724 		mask = &amask;
725 	}
726 
727 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
728 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
729 }
730 
/*
 * Validate a select/poll timeout and record the monotonic start time
 * in *sleeptv.  Returns -1 if the timeout is malformed, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	int rv = 0;

	if (itimerfix(tv) != 0)
		rv = -1;
	else
		getmicrouptime(sleeptv);
	return rv;
}
739 
740 int
741 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
742 {
743 	/*
744 	 * We have to recalculate the timeout on every retry.
745 	 */
746 	struct timeval slepttv;
747 	/*
748 	 * reduce tv by elapsed time
749 	 * based on monotonic time scale
750 	 */
751 	getmicrouptime(&slepttv);
752 	timeradd(tv, sleeptv, tv);
753 	timersub(tv, &slepttv, tv);
754 	*sleeptv = slepttv;
755 	return tvtohz(tv);
756 }
757 
758 int
759 sys_select(lwp_t *l, void *v, register_t *retval)
760 {
761 	struct sys_select_args /* {
762 		syscallarg(int)			nd;
763 		syscallarg(fd_set *)		in;
764 		syscallarg(fd_set *)		ou;
765 		syscallarg(fd_set *)		ex;
766 		syscallarg(struct timeval *)	tv;
767 	} */ * const uap = v;
768 	struct timeval atv, *tv = NULL;
769 	int error;
770 
771 	if (SCARG(uap, tv)) {
772 		error = copyin(SCARG(uap, tv), (void *)&atv,
773 			sizeof(atv));
774 		if (error)
775 			return error;
776 		tv = &atv;
777 	}
778 
779 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
780 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
781 }
782 
/*
 * Common code for select()/pselect().
 *
 * Copies in up to three fd_sets, repeatedly scans the descriptors,
 * and blocks on select_cv until an event, timeout, or signal.  The
 * scan loop uses l_selflag + nselcoll to detect wakeups that race
 * with an unlocked scan: if either changed while scanning, rescan
 * instead of sleeping.  'mask', when non-NULL, is installed as the
 * signal mask for the duration of the call (pselect semantics).
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/* Room for three input and three output fd_sets. */
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; treat a NULL set as all-zero. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask (pselect). */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Mark the LWP as scanning and snapshot the collision
		 * counter before dropping the lock; both are rechecked
		 * after the scan to detect a racing selwakeup().
		 */
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		/* Input sets are at slots 0-2, results go to slots 3-5. */
		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup raced with the scan: go around and rescan. */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
883 
/*
 * Scan the three input fd_sets (read/write/except, laid out back to
 * back at ibitp) and record ready descriptors in the corresponding
 * output sets at obitp.  *retval is set to the number of ready
 * descriptors.  Returns EBADF if a set names a closed descriptor.
 */
int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* Poll events corresponding to select's read/write/except sets. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	proc_t *p = l->l_proc;
	struct filedesc	*fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Walk the set bits of this word via ffs(). */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
920 
921 /*
922  * Poll system call.
923  */
924 int
925 sys_poll(lwp_t *l, void *v, register_t *retval)
926 {
927 	struct sys_poll_args /* {
928 		syscallarg(struct pollfd *)	fds;
929 		syscallarg(u_int)		nfds;
930 		syscallarg(int)			timeout;
931 	} */ * const uap = v;
932 	struct timeval	atv, *tv = NULL;
933 
934 	if (SCARG(uap, timeout) != INFTIM) {
935 		atv.tv_sec = SCARG(uap, timeout) / 1000;
936 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
937 		tv = &atv;
938 	}
939 
940 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
941 		tv, NULL);
942 }
943 
944 /*
945  * Poll system call.
946  */
947 int
948 sys_pollts(lwp_t *l, void *v, register_t *retval)
949 {
950 	struct sys_pollts_args /* {
951 		syscallarg(struct pollfd *)		fds;
952 		syscallarg(u_int)			nfds;
953 		syscallarg(const struct timespec *)	ts;
954 		syscallarg(const sigset_t *)		mask;
955 	} */ * const uap = v;
956 	struct timespec	ats;
957 	struct timeval	atv, *tv = NULL;
958 	sigset_t	amask, *mask = NULL;
959 	int		error;
960 
961 	if (SCARG(uap, ts)) {
962 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
963 		if (error)
964 			return error;
965 		atv.tv_sec = ats.tv_sec;
966 		atv.tv_usec = ats.tv_nsec / 1000;
967 		tv = &atv;
968 	}
969 	if (SCARG(uap, mask)) {
970 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
971 		if (error)
972 			return error;
973 		mask = &amask;
974 	}
975 
976 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
977 		tv, mask);
978 }
979 
/*
 * Common code for poll()/pollts().
 *
 * Copies in the pollfd array, repeatedly scans the descriptors, and
 * blocks on select_cv until an event, timeout, or signal.  Uses the
 * same l_selflag + nselcoll race-detection protocol as selcommon().
 * 'mask', when non-NULL, is installed as the signal mask for the
 * duration of the call (pollts semantics).
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Small arrays use the on-stack buffer; large ones are allocated. */
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask (pollts). */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Snapshot the collision counter and mark the LWP as
		 * scanning before dropping the lock; both are rechecked
		 * after the scan to detect a racing selwakeup().
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup raced with the scan: go around and rescan. */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1063 
/*
 * Scan an array of pollfds, filling in each entry's revents.  Unlike
 * selscan(), a bad descriptor is reported in-band as POLLNVAL rather
 * than failing the whole call.  *retval is set to the number of
 * entries with non-zero revents.  Always returns 0.
 */
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t		*p = l->l_proc;
	struct filedesc	*fdp;
	int		i, n;
	struct file	*fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out of range: flag, but keep going. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignored by convention. */
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}
1097 
1098 /*ARGSUSED*/
1099 int
1100 seltrue(dev_t dev, int events, lwp_t *l)
1101 {
1102 
1103 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1104 }
1105 
/*
 * Record a select request.
 *
 * Called from a driver's poll routine with the selecting LWP.  The
 * first waiter is recorded directly in the selinfo; any additional,
 * different LWP sets sel_collision so that selwakeup() broadcasts
 * instead of waking a single LWP.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
1124 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Collision case: bump nselcoll (so scanning LWPs notice and rescan)
 * and broadcast to every blocked selector.  Single-waiter case: wake
 * the recorded LWP directly if it is actually asleep on select_cv,
 * and reset its l_selflag so it rescans instead of blocking.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		/* Force a rescan on the next trip around the scan loop. */
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}
1165 
/*
 * Notify both select()/poll() waiters and kqueue listeners of an
 * event on the selector; 'knhint' is passed through to KNOTE().
 */
void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
1173 
1174 /*
1175  * Remove an LWP from all objects that it is waiting for.
1176  */
1177 static void
1178 selclear(void)
1179 {
1180 	struct selinfo *sip;
1181 	lwp_t *l = curlwp;
1182 
1183 	KASSERT(mutex_owned(&select_lock));
1184 
1185 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1186 		KASSERT(sip->sel_lwp == l);
1187 		sip->sel_lwp = NULL;
1188 	}
1189 }
1190 
/*
 * Initialize the select/poll system calls.
 */
void
selsysinit(void)
{

	/* IPL_VM: selwakeup() may be called from interrupt context. */
	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}
1201 
/*
 * Initialize a selector.
 */
void
selinit(struct selinfo *sip)
{

	/* All-zero is the valid initial state: no waiter, no collision. */
	memset(sip, 0, sizeof(*sip));
}
1211 
/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/* Fast path: unlocked check, safe because no new waiters can arrive. */
	if (sip->sel_lwp == NULL)
		return;

	/* Re-check under the lock before detaching from the waiter's list. */
	mutex_enter(&select_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}
1233