xref: /netbsd-src/sys/kern/sys_generic.c (revision 0920b4f20b78ab1ccd9f2312fbe10deaf000cbf3)
1 /*	$NetBSD: sys_generic.c,v 1.105 2007/08/27 16:23:16 dsl Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * Copyright (c) 1982, 1986, 1989, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  * (c) UNIX System Laboratories, Inc.
43  * All or some portions of this file are derived from material licensed
44  * to the University of California by American Telephone and Telegraph
45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46  * the permission of UNIX System Laboratories, Inc.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  * 3. Neither the name of the University nor the names of its contributors
57  *    may be used to endorse or promote products derived from this software
58  *    without specific prior written permission.
59  *
60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70  * SUCH DAMAGE.
71  *
72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
73  */
74 
75 /*
76  * System calls relating to files.
77  */
78 
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.105 2007/08/27 16:23:16 dsl Exp $");
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99 
100 #include <uvm/uvm_extern.h>
101 
102 /* Flags for lwp::l_selflag. */
103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
104 #define	SEL_SCANNING	1	/* polling descriptors */
105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
106 
107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
109 static void	selclear(void);
110 
111 /* Global state for select()/poll(). */
112 kmutex_t	select_lock;
113 kcondvar_t	select_cv;
114 int		nselcoll;
115 
116 /*
117  * Read system call.
118  */
119 /* ARGSUSED */
120 int
121 sys_read(lwp_t *l, void *v, register_t *retval)
122 {
123 	struct sys_read_args /* {
124 		syscallarg(int)		fd;
125 		syscallarg(void *)	buf;
126 		syscallarg(size_t)	nbyte;
127 	} */ *uap = v;
128 	int		fd;
129 	struct file	*fp;
130 	proc_t		*p;
131 	struct filedesc	*fdp;
132 
133 	fd = SCARG(uap, fd);
134 	p = l->l_proc;
135 	fdp = p->p_fd;
136 
137 	if ((fp = fd_getfile(fdp, fd)) == NULL)
138 		return (EBADF);
139 
140 	if ((fp->f_flag & FREAD) == 0) {
141 		simple_unlock(&fp->f_slock);
142 		return (EBADF);
143 	}
144 
145 	FILE_USE(fp);
146 
147 	/* dofileread() will unuse the descriptor for us */
148 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
149 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
150 }
151 
152 int
153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
154 	off_t *offset, int flags, register_t *retval)
155 {
156 	struct iovec aiov;
157 	struct uio auio;
158 	proc_t *p;
159 	struct vmspace *vm;
160 	size_t cnt;
161 	int error;
162 	p = l->l_proc;
163 
164 	error = proc_vmspace_getref(p, &vm);
165 	if (error) {
166 		goto out;
167 	}
168 
169 	aiov.iov_base = (void *)buf;
170 	aiov.iov_len = nbyte;
171 	auio.uio_iov = &aiov;
172 	auio.uio_iovcnt = 1;
173 	auio.uio_resid = nbyte;
174 	auio.uio_rw = UIO_READ;
175 	auio.uio_vmspace = vm;
176 
177 	/*
178 	 * Reads return ssize_t because -1 is returned on error.  Therefore
179 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
180 	 * values.
181 	 */
182 	if (auio.uio_resid > SSIZE_MAX) {
183 		error = EINVAL;
184 		goto out;
185 	}
186 
187 	cnt = auio.uio_resid;
188 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
189 	if (error)
190 		if (auio.uio_resid != cnt && (error == ERESTART ||
191 		    error == EINTR || error == EWOULDBLOCK))
192 			error = 0;
193 	cnt -= auio.uio_resid;
194 	ktrgenio(fd, UIO_READ, buf, cnt, error);
195 	*retval = cnt;
196  out:
197 	FILE_UNUSE(fp, l);
198 	uvmspace_free(vm);
199 	return (error);
200 }
201 
202 /*
203  * Scatter read system call.
204  */
205 int
206 sys_readv(lwp_t *l, void *v, register_t *retval)
207 {
208 	struct sys_readv_args /* {
209 		syscallarg(int)				fd;
210 		syscallarg(const struct iovec *)	iovp;
211 		syscallarg(int)				iovcnt;
212 	} */ *uap = v;
213 
214 	return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
215 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
216 }
217 
218 int
219 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
220     off_t *offset, int flags, register_t *retval)
221 {
222 	struct proc	*p;
223 	struct uio	auio;
224 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
225 	struct vmspace	*vm;
226 	int		i, error;
227 	size_t		cnt;
228 	u_int		iovlen;
229 	struct file	*fp;
230 	struct filedesc	*fdp;
231 	struct iovec	*ktriov = NULL;
232 
233 	if (iovcnt == 0)
234 		return EINVAL;
235 
236 	p = l->l_proc;
237 	fdp = p->p_fd;
238 
239 	if ((fp = fd_getfile(fdp, fd)) == NULL)
240 		return EBADF;
241 
242 	if ((fp->f_flag & FREAD) == 0) {
243 		simple_unlock(&fp->f_slock);
244 		return EBADF;
245 	}
246 
247 	FILE_USE(fp);
248 
249 	if (offset == NULL)
250 		offset = &fp->f_offset;
251 	else {
252 		struct vnode *vp = fp->f_data;
253 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
254 			error = ESPIPE;
255 			goto out;
256 		}
257 		/*
258 		 * Test that the device is seekable ?
259 		 * XXX This works because no file systems actually
260 		 * XXX take any action on the seek operation.
261 		 */
262 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
263 		if (error != 0)
264 			goto out;
265 	}
266 
267 	error = proc_vmspace_getref(p, &vm);
268 	if (error)
269 		goto out;
270 
271 	iovlen = iovcnt * sizeof(struct iovec);
272 	if (flags & FOF_IOV_SYSSPACE)
273 		iov = __UNCONST(iovp);
274 	else {
275 		iov = aiov;
276 		if ((u_int)iovcnt > UIO_SMALLIOV) {
277 			if ((u_int)iovcnt > IOV_MAX) {
278 				error = EINVAL;
279 				goto out;
280 			}
281 			iov = kmem_alloc(iovlen, KM_SLEEP);
282 			if (iov == NULL) {
283 				error = ENOMEM;
284 				goto out;
285 			}
286 			needfree = iov;
287 		}
288 		error = copyin(iovp, iov, iovlen);
289 		if (error)
290 			goto done;
291 	}
292 
293 	auio.uio_iov = iov;
294 	auio.uio_iovcnt = iovcnt;
295 	auio.uio_rw = UIO_READ;
296 	auio.uio_vmspace = vm;
297 
298 	auio.uio_resid = 0;
299 	for (i = 0; i < iovcnt; i++, iov++) {
300 		auio.uio_resid += iov->iov_len;
301 		/*
302 		 * Reads return ssize_t because -1 is returned on error.
303 		 * Therefore we must restrict the length to SSIZE_MAX to
304 		 * avoid garbage return values.
305 		 */
306 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
307 			error = EINVAL;
308 			goto done;
309 		}
310 	}
311 
312 	/*
313 	 * if tracing, save a copy of iovec
314 	 */
315 	if (ktrpoint(KTR_GENIO))  {
316 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
317 		if (ktriov != NULL)
318 			memcpy(ktriov, auio.uio_iov, iovlen);
319 	}
320 
321 	cnt = auio.uio_resid;
322 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
323 	if (error)
324 		if (auio.uio_resid != cnt && (error == ERESTART ||
325 		    error == EINTR || error == EWOULDBLOCK))
326 			error = 0;
327 	cnt -= auio.uio_resid;
328 	*retval = cnt;
329 
330 	if (ktriov != NULL) {
331 		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
332 		kmem_free(ktriov, iovlen);
333 	}
334 
335  done:
336 	if (needfree)
337 		kmem_free(needfree, iovlen);
338  out:
339 	FILE_UNUSE(fp, l);
340 	uvmspace_free(vm);
341 	return (error);
342 }
343 
344 /*
345  * Write system call
346  */
347 int
348 sys_write(lwp_t *l, void *v, register_t *retval)
349 {
350 	struct sys_write_args /* {
351 		syscallarg(int)			fd;
352 		syscallarg(const void *)	buf;
353 		syscallarg(size_t)		nbyte;
354 	} */ *uap = v;
355 	int		fd;
356 	struct file	*fp;
357 	proc_t		*p;
358 	struct filedesc	*fdp;
359 
360 	fd = SCARG(uap, fd);
361 	p = l->l_proc;
362 	fdp = p->p_fd;
363 
364 	if ((fp = fd_getfile(fdp, fd)) == NULL)
365 		return (EBADF);
366 
367 	if ((fp->f_flag & FWRITE) == 0) {
368 		simple_unlock(&fp->f_slock);
369 		return (EBADF);
370 	}
371 
372 	FILE_USE(fp);
373 
374 	/* dofilewrite() will unuse the descriptor for us */
375 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
376 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
377 }
378 
379 int
380 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
381 	size_t nbyte, off_t *offset, int flags, register_t *retval)
382 {
383 	struct iovec aiov;
384 	struct uio auio;
385 	proc_t *p;
386 	struct vmspace *vm;
387 	size_t cnt;
388 	int error;
389 
390 	p = l->l_proc;
391 	error = proc_vmspace_getref(p, &vm);
392 	if (error) {
393 		goto out;
394 	}
395 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
396 	aiov.iov_len = nbyte;
397 	auio.uio_iov = &aiov;
398 	auio.uio_iovcnt = 1;
399 	auio.uio_resid = nbyte;
400 	auio.uio_rw = UIO_WRITE;
401 	auio.uio_vmspace = vm;
402 
403 	/*
404 	 * Writes return ssize_t because -1 is returned on error.  Therefore
405 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
406 	 * values.
407 	 */
408 	if (auio.uio_resid > SSIZE_MAX) {
409 		error = EINVAL;
410 		goto out;
411 	}
412 
413 	cnt = auio.uio_resid;
414 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
415 	if (error) {
416 		if (auio.uio_resid != cnt && (error == ERESTART ||
417 		    error == EINTR || error == EWOULDBLOCK))
418 			error = 0;
419 		if (error == EPIPE) {
420 			mutex_enter(&proclist_mutex);
421 			psignal(p, SIGPIPE);
422 			mutex_exit(&proclist_mutex);
423 		}
424 	}
425 	cnt -= auio.uio_resid;
426 	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
427 	*retval = cnt;
428  out:
429 	FILE_UNUSE(fp, l);
430 	uvmspace_free(vm);
431 	return (error);
432 }
433 
434 /*
435  * Gather write system call
436  */
437 int
438 sys_writev(lwp_t *l, void *v, register_t *retval)
439 {
440 	struct sys_writev_args /* {
441 		syscallarg(int)				fd;
442 		syscallarg(const struct iovec *)	iovp;
443 		syscallarg(int)				iovcnt;
444 	} */ *uap = v;
445 
446 	return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
447 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
448 }
449 
450 int
451 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
452     off_t *offset, int flags, register_t *retval)
453 {
454 	struct proc	*p;
455 	struct uio	auio;
456 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
457 	struct vmspace	*vm;
458 	int		i, error;
459 	size_t		cnt;
460 	u_int		iovlen;
461 	struct file	*fp;
462 	struct filedesc	*fdp;
463 	struct iovec	*ktriov = NULL;
464 
465 	if (iovcnt == 0)
466 		return EINVAL;
467 
468 	p = l->l_proc;
469 	fdp = p->p_fd;
470 
471 	if ((fp = fd_getfile(fdp, fd)) == NULL)
472 		return EBADF;
473 
474 	if ((fp->f_flag & FWRITE) == 0) {
475 		simple_unlock(&fp->f_slock);
476 		return EBADF;
477 	}
478 
479 	FILE_USE(fp);
480 
481 	if (offset == NULL)
482 		offset = &fp->f_offset;
483 	else {
484 		struct vnode *vp = fp->f_data;
485 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
486 			error = ESPIPE;
487 			goto out;
488 		}
489 		/*
490 		 * Test that the device is seekable ?
491 		 * XXX This works because no file systems actually
492 		 * XXX take any action on the seek operation.
493 		 */
494 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
495 		if (error != 0)
496 			goto out;
497 	}
498 
499 	error = proc_vmspace_getref(p, &vm);
500 	if (error)
501 		goto out;
502 
503 	iovlen = iovcnt * sizeof(struct iovec);
504 	if (flags & FOF_IOV_SYSSPACE)
505 		iov = __UNCONST(iovp);
506 	else {
507 		iov = aiov;
508 		if ((u_int)iovcnt > UIO_SMALLIOV) {
509 			if ((u_int)iovcnt > IOV_MAX) {
510 				error = EINVAL;
511 				goto out;
512 			}
513 			iov = kmem_alloc(iovlen, KM_SLEEP);
514 			if (iov == NULL) {
515 				error = ENOMEM;
516 				goto out;
517 			}
518 			needfree = iov;
519 		}
520 		error = copyin(iovp, iov, iovlen);
521 		if (error)
522 			goto done;
523 	}
524 
525 	auio.uio_iov = iov;
526 	auio.uio_iovcnt = iovcnt;
527 	auio.uio_rw = UIO_WRITE;
528 	auio.uio_vmspace = vm;
529 
530 	auio.uio_resid = 0;
531 	for (i = 0; i < iovcnt; i++, iov++) {
532 		auio.uio_resid += iov->iov_len;
533 		/*
534 		 * Writes return ssize_t because -1 is returned on error.
535 		 * Therefore we must restrict the length to SSIZE_MAX to
536 		 * avoid garbage return values.
537 		 */
538 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
539 			error = EINVAL;
540 			goto done;
541 		}
542 	}
543 
544 	/*
545 	 * if tracing, save a copy of iovec
546 	 */
547 	if (ktrpoint(KTR_GENIO))  {
548 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
549 		if (ktriov != NULL)
550 			memcpy(ktriov, auio.uio_iov, iovlen);
551 	}
552 
553 	cnt = auio.uio_resid;
554 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
555 	if (error) {
556 		if (auio.uio_resid != cnt && (error == ERESTART ||
557 		    error == EINTR || error == EWOULDBLOCK))
558 			error = 0;
559 		if (error == EPIPE) {
560 			mutex_enter(&proclist_mutex);
561 			psignal(p, SIGPIPE);
562 			mutex_exit(&proclist_mutex);
563 		}
564 	}
565 	cnt -= auio.uio_resid;
566 	*retval = cnt;
567 
568 	if (ktriov != NULL) {
569 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
570 		kmem_free(ktriov, iovlen);
571 	}
572 
573  done:
574 	if (needfree)
575 		kmem_free(needfree, iovlen);
576  out:
577 	FILE_UNUSE(fp, l);
578 	uvmspace_free(vm);
579 	return (error);
580 }
581 
582 /*
583  * Ioctl system call
584  */
585 /* ARGSUSED */
/*
 * Generic ioctl dispatch: fetch the command's argument into a kernel
 * buffer (as directed by the IOC_IN/IOC_OUT/IOC_VOID encoding in the
 * command word), hand it to the file's fo_ioctl routine, and copy any
 * result back out to user space.
 */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */ *uap = v;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
	/* Argument buffers up to STK_PARAMS bytes live on the stack. */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* 'com' is not yet fetched; zero it for the report below. */
		com = 0;
		goto out;
	}

	/*
	 * Close-on-exec flags live in the descriptor table, not in the
	 * file itself, so handle them without calling fo_ioctl.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Size zero: pass the user pointer value itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before informing the object. */
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* As above, for the async-I/O flag. */
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/* Normalize driver-private return conventions to errno values. */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
715 
716 /*
717  * Select system call.
718  */
719 int
720 sys_pselect(lwp_t *l, void *v, register_t *retval)
721 {
722 	struct sys_pselect_args /* {
723 		syscallarg(int)				nd;
724 		syscallarg(fd_set *)			in;
725 		syscallarg(fd_set *)			ou;
726 		syscallarg(fd_set *)			ex;
727 		syscallarg(const struct timespec *)	ts;
728 		syscallarg(sigset_t *)			mask;
729 	} */ * const uap = v;
730 	struct timespec	ats;
731 	struct timeval	atv, *tv = NULL;
732 	sigset_t	amask, *mask = NULL;
733 	int		error;
734 
735 	if (SCARG(uap, ts)) {
736 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
737 		if (error)
738 			return error;
739 		atv.tv_sec = ats.tv_sec;
740 		atv.tv_usec = ats.tv_nsec / 1000;
741 		tv = &atv;
742 	}
743 	if (SCARG(uap, mask) != NULL) {
744 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
745 		if (error)
746 			return error;
747 		mask = &amask;
748 	}
749 
750 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
751 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
752 }
753 
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	/* Reject out-of-range timeouts, then record the start time. */
	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
762 
763 int
764 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
765 {
766 	/*
767 	 * We have to recalculate the timeout on every retry.
768 	 */
769 	struct timeval slepttv;
770 	/*
771 	 * reduce tv by elapsed time
772 	 * based on monotonic time scale
773 	 */
774 	getmicrouptime(&slepttv);
775 	timeradd(tv, sleeptv, tv);
776 	timersub(tv, &slepttv, tv);
777 	*sleeptv = slepttv;
778 	return tvtohz(tv);
779 }
780 
781 int
782 sys_select(lwp_t *l, void *v, register_t *retval)
783 {
784 	struct sys_select_args /* {
785 		syscallarg(int)			nd;
786 		syscallarg(fd_set *)		in;
787 		syscallarg(fd_set *)		ou;
788 		syscallarg(fd_set *)		ex;
789 		syscallarg(struct timeval *)	tv;
790 	} */ * const uap = v;
791 	struct timeval atv, *tv = NULL;
792 	int error;
793 
794 	if (SCARG(uap, tv)) {
795 		error = copyin(SCARG(uap, tv), (void *)&atv,
796 			sizeof(atv));
797 		if (error)
798 			return error;
799 		tv = &atv;
800 	}
801 
802 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
803 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
804 }
805 
/*
 * Common code for select() and pselect(): poll the first 'nd'
 * descriptors in the user-supplied in/out/except fd_sets, optionally
 * bounded by a timeout and with a temporary signal mask installed.
 *
 * The six ni-byte regions of 'bits' hold the three input sets
 * (slots 0-2) followed by the three result sets (slots 3-5).
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	/* Bytes per set; allocate when six sets outgrow the stack buffer. */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; treat a NULL set as all zeroes. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the caller's signal mask for the duration of the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/sleep loop: selscan() runs unlocked; l_selflag and the
	 * nselcoll generation count detect wakeups and collisions that
	 * occur while scanning, in which case we rescan rather than
	 * block and risk missing an event.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the result sets (slots 3-5) back out to the user. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
906 
907 int
908 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
909 	register_t *retval)
910 {
911 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
912 			       POLLWRNORM | POLLHUP | POLLERR,
913 			       POLLRDBAND };
914 	proc_t *p = l->l_proc;
915 	struct filedesc	*fdp;
916 	int msk, i, j, fd, n;
917 	fd_mask ibits, obits;
918 	struct file *fp;
919 
920 	fdp = p->p_fd;
921 	n = 0;
922 	for (msk = 0; msk < 3; msk++) {
923 		for (i = 0; i < nfd; i += NFDBITS) {
924 			ibits = *ibitp++;
925 			obits = 0;
926 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
927 				ibits &= ~(1 << j);
928 				if ((fp = fd_getfile(fdp, fd)) == NULL)
929 					return (EBADF);
930 				FILE_USE(fp);
931 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
932 					obits |= (1 << j);
933 					n++;
934 				}
935 				FILE_UNUSE(fp, l);
936 			}
937 			*obitp++ = obits;
938 		}
939 	}
940 	*retval = n;
941 	return (0);
942 }
943 
944 /*
945  * Poll system call.
946  */
947 int
948 sys_poll(lwp_t *l, void *v, register_t *retval)
949 {
950 	struct sys_poll_args /* {
951 		syscallarg(struct pollfd *)	fds;
952 		syscallarg(u_int)		nfds;
953 		syscallarg(int)			timeout;
954 	} */ * const uap = v;
955 	struct timeval	atv, *tv = NULL;
956 
957 	if (SCARG(uap, timeout) != INFTIM) {
958 		atv.tv_sec = SCARG(uap, timeout) / 1000;
959 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
960 		tv = &atv;
961 	}
962 
963 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
964 		tv, NULL);
965 }
966 
967 /*
968  * Poll system call.
969  */
970 int
971 sys_pollts(lwp_t *l, void *v, register_t *retval)
972 {
973 	struct sys_pollts_args /* {
974 		syscallarg(struct pollfd *)		fds;
975 		syscallarg(u_int)			nfds;
976 		syscallarg(const struct timespec *)	ts;
977 		syscallarg(const sigset_t *)		mask;
978 	} */ * const uap = v;
979 	struct timespec	ats;
980 	struct timeval	atv, *tv = NULL;
981 	sigset_t	amask, *mask = NULL;
982 	int		error;
983 
984 	if (SCARG(uap, ts)) {
985 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
986 		if (error)
987 			return error;
988 		atv.tv_sec = ats.tv_sec;
989 		atv.tv_usec = ats.tv_nsec / 1000;
990 		tv = &atv;
991 	}
992 	if (SCARG(uap, mask)) {
993 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
994 		if (error)
995 			return error;
996 		mask = &amask;
997 	}
998 
999 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1000 		tv, mask);
1001 }
1002 
/*
 * Common code for poll() and pollts(): poll 'nfds' pollfd records,
 * optionally bounded by a timeout and with a temporary signal mask
 * installed while waiting.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	/* Use the stack buffer unless the pollfd array is too large. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the caller's signal mask for the duration of the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/sleep loop: pollscan() runs unlocked; l_selflag and the
	 * nselcoll generation count detect wakeups and collisions that
	 * occur while scanning, in which case we rescan rather than
	 * block and risk missing an event.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the updated revents back out, even on timeout. */
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1086 
1087 int
1088 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1089 {
1090 	proc_t		*p = l->l_proc;
1091 	struct filedesc	*fdp;
1092 	int		i, n;
1093 	struct file	*fp;
1094 
1095 	fdp = p->p_fd;
1096 	n = 0;
1097 	for (i = 0; i < nfd; i++, fds++) {
1098 		if (fds->fd >= fdp->fd_nfiles) {
1099 			fds->revents = POLLNVAL;
1100 			n++;
1101 		} else if (fds->fd < 0) {
1102 			fds->revents = 0;
1103 		} else {
1104 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1105 				fds->revents = POLLNVAL;
1106 				n++;
1107 			} else {
1108 				FILE_USE(fp);
1109 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1110 				    fds->events | POLLERR | POLLHUP, l);
1111 				if (fds->revents != 0)
1112 					n++;
1113 				FILE_UNUSE(fp, l);
1114 			}
1115 		}
1116 	}
1117 	*retval = n;
1118 	return (0);
1119 }
1120 
1121 /*ARGSUSED*/
1122 int
1123 seltrue(dev_t dev, int events, lwp_t *l)
1124 {
1125 
1126 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1127 }
1128 
1129 /*
1130  * Record a select request.
1131  */
1132 void
1133 selrecord(lwp_t *selector, struct selinfo *sip)
1134 {
1135 
1136 	mutex_enter(&select_lock);
1137 	if (sip->sel_lwp == NULL) {
1138 		/* First named waiter, although there may be more. */
1139 		sip->sel_lwp = selector;
1140 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1141 	} else if (sip->sel_lwp != selector) {
1142 		/* Multiple waiters. */
1143 		sip->sel_collision = true;
1144 	}
1145 	mutex_exit(&select_lock);
1146 }
1147 
1148 /*
1149  * Do a wakeup when a selectable event occurs.
1150  */
1151 void
1152 selwakeup(struct selinfo *sip)
1153 {
1154 	lwp_t *l;
1155 
1156 	mutex_enter(&select_lock);
1157 	if (sip->sel_collision) {
1158 		/* Multiple waiters - just notify everybody. */
1159 		nselcoll++;
1160 		sip->sel_collision = false;
1161 		cv_broadcast(&select_cv);
1162 	} else if (sip->sel_lwp != NULL) {
1163 		/* Only one LWP waiting. */
1164 		l = sip->sel_lwp;
1165 		if (l->l_selflag == SEL_BLOCKING) {
1166 			/*
1167 			 * If it's sleeping, wake it up.  If not, it's
1168 			 * already awake but hasn't yet removed itself
1169 			 * from the selector.  We reset the state below
1170 			 * so that we only attempt to do this once.
1171 			 */
1172 			lwp_lock(l);
1173 			if (l->l_wchan == &select_cv) {
1174 				/* lwp_unsleep() releases the LWP lock. */
1175 				lwp_unsleep(l);
1176 			} else
1177 				lwp_unlock(l);
1178 		} else {
1179 			/*
1180 			 * Not yet asleep.  Reset its state below so that
1181 			 * it will go around again.
1182 			 */
1183 		}
1184 		l->l_selflag = SEL_RESET;
1185 	}
1186 	mutex_exit(&select_lock);
1187 }
1188 
/*
 * Notify both kinds of listeners on a selinfo: wake any select/poll
 * waiters, then deliver the hint to attached knotes (kqueue).
 */
void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
1196 
1197 /*
1198  * Remove an LWP from all objects that it is waiting for.
1199  */
1200 static void
1201 selclear(void)
1202 {
1203 	struct selinfo *sip;
1204 	lwp_t *l = curlwp;
1205 
1206 	KASSERT(mutex_owned(&select_lock));
1207 
1208 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1209 		KASSERT(sip->sel_lwp == l);
1210 		sip->sel_lwp = NULL;
1211 	}
1212 }
1213 
1214 /*
1215  * Initialize the select/poll system calls.
1216  */
1217 void
1218 selsysinit(void)
1219 {
1220 
1221 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
1222 	cv_init(&select_cv, "select");
1223 }
1224