xref: /netbsd-src/sys/kern/sys_generic.c (revision 503611ba29d4c920cb1878a9ece7ebd1e0ac2e16)
1 /*	$NetBSD: sys_generic.c,v 1.92 2006/09/03 06:34:34 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.92 2006/09/03 06:34:34 christos Exp $");
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
68 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
69 
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct lwp *l, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct proc	*p;
86 	struct filedesc	*fdp;
87 
88 	fd = SCARG(uap, fd);
89 	p = l->l_proc;
90 	fdp = p->p_fd;
91 
92 	if ((fp = fd_getfile(fdp, fd)) == NULL)
93 		return (EBADF);
94 
95 	if ((fp->f_flag & FREAD) == 0) {
96 		simple_unlock(&fp->f_slock);
97 		return (EBADF);
98 	}
99 
100 	FILE_USE(fp);
101 
102 	/* dofileread() will unuse the descriptor for us */
103 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
104 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
105 }
106 
107 int
108 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
109 	off_t *offset, int flags, register_t *retval)
110 {
111 	struct iovec aiov;
112 	struct uio auio;
113 	struct proc *p;
114 	struct vmspace *vm;
115 	size_t cnt;
116 	int error;
117 #ifdef KTRACE
118 	struct iovec	ktriov = { .iov_base = NULL, };
119 #endif
120 	p = l->l_proc;
121 
122 	error = proc_vmspace_getref(p, &vm);
123 	if (error) {
124 		goto out;
125 	}
126 
127 	aiov.iov_base = (caddr_t)buf;
128 	aiov.iov_len = nbyte;
129 	auio.uio_iov = &aiov;
130 	auio.uio_iovcnt = 1;
131 	auio.uio_resid = nbyte;
132 	auio.uio_rw = UIO_READ;
133 	auio.uio_vmspace = vm;
134 
135 	/*
136 	 * Reads return ssize_t because -1 is returned on error.  Therefore
137 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
138 	 * values.
139 	 */
140 	if (auio.uio_resid > SSIZE_MAX) {
141 		error = EINVAL;
142 		goto out;
143 	}
144 
145 #ifdef KTRACE
146 	/*
147 	 * if tracing, save a copy of iovec
148 	 */
149 	if (KTRPOINT(p, KTR_GENIO))
150 		ktriov = aiov;
151 #endif
152 	cnt = auio.uio_resid;
153 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
154 	if (error)
155 		if (auio.uio_resid != cnt && (error == ERESTART ||
156 		    error == EINTR || error == EWOULDBLOCK))
157 			error = 0;
158 	cnt -= auio.uio_resid;
159 #ifdef KTRACE
160 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
161 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
162 #endif
163 	*retval = cnt;
164  out:
165 	FILE_UNUSE(fp, l);
166 	uvmspace_free(vm);
167 	return (error);
168 }
169 
170 /*
171  * Scatter read system call.
172  */
173 int
174 sys_readv(struct lwp *l, void *v, register_t *retval)
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int)				fd;
178 		syscallarg(const struct iovec *)	iovp;
179 		syscallarg(int)				iovcnt;
180 	} */ *uap = v;
181 	struct filedesc	*fdp;
182 	struct file *fp;
183 	struct proc *p;
184 	int fd;
185 
186 	fd = SCARG(uap, fd);
187 	p = l->l_proc;
188 	fdp = p->p_fd;
189 
190 	if ((fp = fd_getfile(fdp, fd)) == NULL)
191 		return (EBADF);
192 
193 	if ((fp->f_flag & FREAD) == 0) {
194 		simple_unlock(&fp->f_slock);
195 		return (EBADF);
196 	}
197 
198 	FILE_USE(fp);
199 
200 	/* dofilereadv() will unuse the descriptor for us */
201 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
202 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
203 }
204 
205 int
206 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
207 	int iovcnt, off_t *offset, int flags, register_t *retval)
208 {
209 	struct proc *p;
210 	struct uio	auio;
211 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
212 	struct vmspace	*vm;
213 	int		i, error;
214 	size_t		cnt;
215 	u_int		iovlen;
216 #ifdef KTRACE
217 	struct iovec	*ktriov;
218 #endif
219 
220 	p = l->l_proc;
221 	error = proc_vmspace_getref(p, &vm);
222 	if (error) {
223 		goto out;
224 	}
225 
226 #ifdef KTRACE
227 	ktriov = NULL;
228 #endif
229 	/* note: can't use iovlen until iovcnt is validated */
230 	iovlen = iovcnt * sizeof(struct iovec);
231 	if ((u_int)iovcnt > UIO_SMALLIOV) {
232 		if ((u_int)iovcnt > IOV_MAX) {
233 			error = EINVAL;
234 			goto out;
235 		}
236 		iov = malloc(iovlen, M_IOV, M_WAITOK);
237 		needfree = iov;
238 	} else if ((u_int)iovcnt > 0) {
239 		iov = aiov;
240 		needfree = NULL;
241 	} else {
242 		error = EINVAL;
243 		goto out;
244 	}
245 
246 	auio.uio_iov = iov;
247 	auio.uio_iovcnt = iovcnt;
248 	auio.uio_rw = UIO_READ;
249 	auio.uio_vmspace = vm;
250 	error = copyin(iovp, iov, iovlen);
251 	if (error)
252 		goto done;
253 	auio.uio_resid = 0;
254 	for (i = 0; i < iovcnt; i++) {
255 		auio.uio_resid += iov->iov_len;
256 		/*
257 		 * Reads return ssize_t because -1 is returned on error.
258 		 * Therefore we must restrict the length to SSIZE_MAX to
259 		 * avoid garbage return values.
260 		 */
261 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
262 			error = EINVAL;
263 			goto done;
264 		}
265 		iov++;
266 	}
267 #ifdef KTRACE
268 	/*
269 	 * if tracing, save a copy of iovec
270 	 */
271 	if (KTRPOINT(p, KTR_GENIO))  {
272 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
273 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
274 	}
275 #endif
276 	cnt = auio.uio_resid;
277 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
278 	if (error)
279 		if (auio.uio_resid != cnt && (error == ERESTART ||
280 		    error == EINTR || error == EWOULDBLOCK))
281 			error = 0;
282 	cnt -= auio.uio_resid;
283 #ifdef KTRACE
284 	if (ktriov != NULL) {
285 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
286 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
287 		free(ktriov, M_TEMP);
288 	}
289 #endif
290 	*retval = cnt;
291  done:
292 	if (needfree)
293 		free(needfree, M_IOV);
294  out:
295 	FILE_UNUSE(fp, l);
296 	uvmspace_free(vm);
297 	return (error);
298 }
299 
300 /*
301  * Write system call
302  */
303 int
304 sys_write(struct lwp *l, void *v, register_t *retval)
305 {
306 	struct sys_write_args /* {
307 		syscallarg(int)			fd;
308 		syscallarg(const void *)	buf;
309 		syscallarg(size_t)		nbyte;
310 	} */ *uap = v;
311 	int		fd;
312 	struct file	*fp;
313 	struct proc	*p;
314 	struct filedesc	*fdp;
315 
316 	fd = SCARG(uap, fd);
317 	p = l->l_proc;
318 	fdp = p->p_fd;
319 
320 	if ((fp = fd_getfile(fdp, fd)) == NULL)
321 		return (EBADF);
322 
323 	if ((fp->f_flag & FWRITE) == 0) {
324 		simple_unlock(&fp->f_slock);
325 		return (EBADF);
326 	}
327 
328 	FILE_USE(fp);
329 
330 	/* dofilewrite() will unuse the descriptor for us */
331 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
332 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
333 }
334 
335 int
336 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
337 	size_t nbyte, off_t *offset, int flags, register_t *retval)
338 {
339 	struct iovec aiov;
340 	struct uio auio;
341 	struct proc *p;
342 	struct vmspace *vm;
343 	size_t cnt;
344 	int error;
345 #ifdef KTRACE
346 	struct iovec	ktriov = { .iov_base = NULL, };
347 #endif
348 
349 	p = l->l_proc;
350 	error = proc_vmspace_getref(p, &vm);
351 	if (error) {
352 		goto out;
353 	}
354 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
355 	aiov.iov_len = nbyte;
356 	auio.uio_iov = &aiov;
357 	auio.uio_iovcnt = 1;
358 	auio.uio_resid = nbyte;
359 	auio.uio_rw = UIO_WRITE;
360 	auio.uio_vmspace = vm;
361 
362 	/*
363 	 * Writes return ssize_t because -1 is returned on error.  Therefore
364 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
365 	 * values.
366 	 */
367 	if (auio.uio_resid > SSIZE_MAX) {
368 		error = EINVAL;
369 		goto out;
370 	}
371 
372 #ifdef KTRACE
373 	/*
374 	 * if tracing, save a copy of iovec
375 	 */
376 	if (KTRPOINT(p, KTR_GENIO))
377 		ktriov = aiov;
378 #endif
379 	cnt = auio.uio_resid;
380 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
381 	if (error) {
382 		if (auio.uio_resid != cnt && (error == ERESTART ||
383 		    error == EINTR || error == EWOULDBLOCK))
384 			error = 0;
385 		if (error == EPIPE)
386 			psignal(p, SIGPIPE);
387 	}
388 	cnt -= auio.uio_resid;
389 #ifdef KTRACE
390 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
391 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
392 #endif
393 	*retval = cnt;
394  out:
395 	FILE_UNUSE(fp, l);
396 	uvmspace_free(vm);
397 	return (error);
398 }
399 
400 /*
401  * Gather write system call
402  */
403 int
404 sys_writev(struct lwp *l, void *v, register_t *retval)
405 {
406 	struct sys_writev_args /* {
407 		syscallarg(int)				fd;
408 		syscallarg(const struct iovec *)	iovp;
409 		syscallarg(int)				iovcnt;
410 	} */ *uap = v;
411 	int		fd;
412 	struct file	*fp;
413 	struct proc	*p;
414 	struct filedesc	*fdp;
415 
416 	fd = SCARG(uap, fd);
417 	p = l->l_proc;
418 	fdp = p->p_fd;
419 
420 	if ((fp = fd_getfile(fdp, fd)) == NULL)
421 		return (EBADF);
422 
423 	if ((fp->f_flag & FWRITE) == 0) {
424 		simple_unlock(&fp->f_slock);
425 		return (EBADF);
426 	}
427 
428 	FILE_USE(fp);
429 
430 	/* dofilewritev() will unuse the descriptor for us */
431 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
432 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
433 }
434 
435 int
436 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
437 	int iovcnt, off_t *offset, int flags, register_t *retval)
438 {
439 	struct proc	*p;
440 	struct uio	auio;
441 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
442 	struct vmspace	*vm;
443 	int		i, error;
444 	size_t		cnt;
445 	u_int		iovlen;
446 #ifdef KTRACE
447 	struct iovec	*ktriov;
448 #endif
449 
450 	p = l->l_proc;
451 	error = proc_vmspace_getref(p, &vm);
452 	if (error) {
453 		goto out;
454 	}
455 #ifdef KTRACE
456 	ktriov = NULL;
457 #endif
458 	/* note: can't use iovlen until iovcnt is validated */
459 	iovlen = iovcnt * sizeof(struct iovec);
460 	if ((u_int)iovcnt > UIO_SMALLIOV) {
461 		if ((u_int)iovcnt > IOV_MAX) {
462 			error = EINVAL;
463 			goto out;
464 		}
465 		iov = malloc(iovlen, M_IOV, M_WAITOK);
466 		needfree = iov;
467 	} else if ((u_int)iovcnt > 0) {
468 		iov = aiov;
469 		needfree = NULL;
470 	} else {
471 		error = EINVAL;
472 		goto out;
473 	}
474 
475 	auio.uio_iov = iov;
476 	auio.uio_iovcnt = iovcnt;
477 	auio.uio_rw = UIO_WRITE;
478 	auio.uio_vmspace = vm;
479 	error = copyin(iovp, iov, iovlen);
480 	if (error)
481 		goto done;
482 	auio.uio_resid = 0;
483 	for (i = 0; i < iovcnt; i++) {
484 		auio.uio_resid += iov->iov_len;
485 		/*
486 		 * Writes return ssize_t because -1 is returned on error.
487 		 * Therefore we must restrict the length to SSIZE_MAX to
488 		 * avoid garbage return values.
489 		 */
490 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
491 			error = EINVAL;
492 			goto done;
493 		}
494 		iov++;
495 	}
496 #ifdef KTRACE
497 	/*
498 	 * if tracing, save a copy of iovec
499 	 */
500 	if (KTRPOINT(p, KTR_GENIO))  {
501 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
502 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
503 	}
504 #endif
505 	cnt = auio.uio_resid;
506 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
507 	if (error) {
508 		if (auio.uio_resid != cnt && (error == ERESTART ||
509 		    error == EINTR || error == EWOULDBLOCK))
510 			error = 0;
511 		if (error == EPIPE)
512 			psignal(p, SIGPIPE);
513 	}
514 	cnt -= auio.uio_resid;
515 #ifdef KTRACE
516 	if (ktriov != NULL) {
517 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
518 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
519 		free(ktriov, M_TEMP);
520 	}
521 #endif
522 	*retval = cnt;
523  done:
524 	if (needfree)
525 		free(needfree, M_IOV);
526  out:
527 	FILE_UNUSE(fp, l);
528 	uvmspace_free(vm);
529 	return (error);
530 }
531 
532 /*
533  * Ioctl system call
534  */
535 /* ARGSUSED */
536 int
537 sys_ioctl(struct lwp *l, void *v, register_t *retval)
538 {
539 	struct sys_ioctl_args /* {
540 		syscallarg(int)		fd;
541 		syscallarg(u_long)	com;
542 		syscallarg(caddr_t)	data;
543 	} */ *uap = v;
544 	struct file	*fp;
545 	struct proc	*p;
546 	struct filedesc	*fdp;
547 	u_long		com;
548 	int		error;
549 	u_int		size;
550 	caddr_t		data, memp;
551 #define	STK_PARAMS	128
552 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
553 
554 	error = 0;
555 	p = l->l_proc;
556 	fdp = p->p_fd;
557 
558 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
559 		return (EBADF);
560 
561 	FILE_USE(fp);
562 
563 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
564 		error = EBADF;
565 		com = 0;
566 		goto out;
567 	}
568 
569 	switch (com = SCARG(uap, com)) {
570 	case FIONCLEX:
571 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
572 		goto out;
573 
574 	case FIOCLEX:
575 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
576 		goto out;
577 	}
578 
579 	/*
580 	 * Interpret high order word to find amount of data to be
581 	 * copied to/from the user's address space.
582 	 */
583 	size = IOCPARM_LEN(com);
584 	if (size > IOCPARM_MAX) {
585 		error = ENOTTY;
586 		goto out;
587 	}
588 	memp = NULL;
589 	if (size > sizeof(stkbuf)) {
590 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
591 		data = memp;
592 	} else
593 		data = (caddr_t)stkbuf;
594 	if (com&IOC_IN) {
595 		if (size) {
596 			error = copyin(SCARG(uap, data), data, size);
597 			if (error) {
598 				if (memp)
599 					free(memp, M_IOCTLOPS);
600 				goto out;
601 			}
602 #ifdef KTRACE
603 			if (KTRPOINT(p, KTR_GENIO)) {
604 				struct iovec iov;
605 				iov.iov_base = SCARG(uap, data);
606 				iov.iov_len = size;
607 				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
608 					size, 0);
609 			}
610 #endif
611 		} else
612 			*(caddr_t *)data = SCARG(uap, data);
613 	} else if ((com&IOC_OUT) && size)
614 		/*
615 		 * Zero the buffer so the user always
616 		 * gets back something deterministic.
617 		 */
618 		memset(data, 0, size);
619 	else if (com&IOC_VOID)
620 		*(caddr_t *)data = SCARG(uap, data);
621 
622 	switch (com) {
623 
624 	case FIONBIO:
625 		if (*(int *)data != 0)
626 			fp->f_flag |= FNONBLOCK;
627 		else
628 			fp->f_flag &= ~FNONBLOCK;
629 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
630 		break;
631 
632 	case FIOASYNC:
633 		if (*(int *)data != 0)
634 			fp->f_flag |= FASYNC;
635 		else
636 			fp->f_flag &= ~FASYNC;
637 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
638 		break;
639 
640 	default:
641 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
642 		/*
643 		 * Copy any data to user, size was
644 		 * already set and checked above.
645 		 */
646 		if (error == 0 && (com&IOC_OUT) && size) {
647 			error = copyout(data, SCARG(uap, data), size);
648 #ifdef KTRACE
649 			if (KTRPOINT(p, KTR_GENIO)) {
650 				struct iovec iov;
651 				iov.iov_base = SCARG(uap, data);
652 				iov.iov_len = size;
653 				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
654 					size, error);
655 			}
656 #endif
657 		}
658 		break;
659 	}
660 	if (memp)
661 		free(memp, M_IOCTLOPS);
662  out:
663 	FILE_UNUSE(fp, l);
664 	switch (error) {
665 	case -1:
666 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
667 		    "pid=%d comm=%s\n",
668 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
669 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
670 		    p->p_pid, p->p_comm);
671 		/* FALLTHROUGH */
672 	case EPASSTHROUGH:
673 		error = ENOTTY;
674 		/* FALLTHROUGH */
675 	default:
676 		return (error);
677 	}
678 }
679 
int	selwait;	/* its address is the select/poll sleep channel */
int	nselcoll;	/* incremented by selwakeup() on a collision wakeup */
681 
682 /*
683  * Select system call.
684  */
685 int
686 sys_pselect(struct lwp *l, void *v, register_t *retval)
687 {
688 	struct sys_pselect_args /* {
689 		syscallarg(int)				nd;
690 		syscallarg(fd_set *)			in;
691 		syscallarg(fd_set *)			ou;
692 		syscallarg(fd_set *)			ex;
693 		syscallarg(const struct timespec *)	ts;
694 		syscallarg(sigset_t *)			mask;
695 	} */ * const uap = v;
696 	struct timespec	ats;
697 	struct timeval	atv, *tv = NULL;
698 	sigset_t	amask, *mask = NULL;
699 	int		error;
700 
701 	if (SCARG(uap, ts)) {
702 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
703 		if (error)
704 			return error;
705 		atv.tv_sec = ats.tv_sec;
706 		atv.tv_usec = ats.tv_nsec / 1000;
707 		tv = &atv;
708 	}
709 	if (SCARG(uap, mask) != NULL) {
710 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
711 		if (error)
712 			return error;
713 		mask = &amask;
714 	}
715 
716 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
717 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
718 }
719 
/*
 * Prepare a select/poll timeout: run the timeout through itimerfix()
 * and, if it is accepted, store the current uptime in *sleeptv as the
 * reference point for gettimeleft().
 *
 * Returns 0 on success, -1 if itimerfix() rejects the timeout.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv) == 0) {
		getmicrouptime(sleeptv);
		return 0;
	}
	return -1;
}
728 
729 int
730 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
731 {
732 	/*
733 	 * We have to recalculate the timeout on every retry.
734 	 */
735 	struct timeval slepttv;
736 	/*
737 	 * reduce tv by elapsed time
738 	 * based on monotonic time scale
739 	 */
740 	getmicrouptime(&slepttv);
741 	timeradd(tv, sleeptv, tv);
742 	timersub(tv, &slepttv, tv);
743 	*sleeptv = slepttv;
744 	return tvtohz(tv);
745 }
746 
747 int
748 sys_select(struct lwp *l, void *v, register_t *retval)
749 {
750 	struct sys_select_args /* {
751 		syscallarg(int)			nd;
752 		syscallarg(fd_set *)		in;
753 		syscallarg(fd_set *)		ou;
754 		syscallarg(fd_set *)		ex;
755 		syscallarg(struct timeval *)	tv;
756 	} */ * const uap = v;
757 	struct timeval atv, *tv = NULL;
758 	int error;
759 
760 	if (SCARG(uap, tv)) {
761 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
762 			sizeof(atv));
763 		if (error)
764 			return error;
765 		tv = &atv;
766 	}
767 
768 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
769 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
770 }
771 
772 int
773 selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
774 	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
775 {
776 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
777 			    sizeof(fd_mask) * 6];
778 	struct proc	* const p = l->l_proc;
779 	caddr_t		bits;
780 	int		s, ncoll, error, timo;
781 	size_t		ni;
782 	sigset_t	oldmask;
783 	struct timeval  sleeptv;
784 
785 	error = 0;
786 	if (nd < 0)
787 		return (EINVAL);
788 	if (nd > p->p_fd->fd_nfiles) {
789 		/* forgiving; slightly wrong */
790 		nd = p->p_fd->fd_nfiles;
791 	}
792 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
793 	if (ni * 6 > sizeof(smallbits))
794 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
795 	else
796 		bits = smallbits;
797 
798 #define	getbits(name, x)						\
799 	if (u_ ## name) {						\
800 		error = copyin(u_ ## name, bits + ni * x, ni);		\
801 		if (error)						\
802 			goto done;					\
803 	} else								\
804 		memset(bits + ni * x, 0, ni);
805 	getbits(in, 0);
806 	getbits(ou, 1);
807 	getbits(ex, 2);
808 #undef	getbits
809 
810 	timo = 0;
811 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
812 		error = EINVAL;
813 		goto done;
814 	}
815 
816 	if (mask)
817 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
818 
819  retry:
820 	ncoll = nselcoll;
821 	l->l_flag |= L_SELECT;
822 	error = selscan(l, (fd_mask *)(bits + ni * 0),
823 			   (fd_mask *)(bits + ni * 3), nd, retval);
824 	if (error || *retval)
825 		goto done;
826 	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
827 		goto done;
828 	s = splsched();
829 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
830 		splx(s);
831 		goto retry;
832 	}
833 	l->l_flag &= ~L_SELECT;
834 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
835 	splx(s);
836 	if (error == 0)
837 		goto retry;
838  done:
839 	if (mask)
840 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
841 	l->l_flag &= ~L_SELECT;
842 	/* select is not restarted after signals... */
843 	if (error == ERESTART)
844 		error = EINTR;
845 	if (error == EWOULDBLOCK)
846 		error = 0;
847 	if (error == 0) {
848 
849 #define	putbits(name, x)						\
850 		if (u_ ## name) {					\
851 			error = copyout(bits + ni * x, u_ ## name, ni); \
852 			if (error)					\
853 				goto out;				\
854 		}
855 		putbits(in, 3);
856 		putbits(ou, 4);
857 		putbits(ex, 5);
858 #undef putbits
859 	}
860  out:
861 	if (ni * 6 > sizeof(smallbits))
862 		free(bits, M_TEMP);
863 	return (error);
864 }
865 
866 int
867 selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
868 	register_t *retval)
869 {
870 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
871 			       POLLWRNORM | POLLHUP | POLLERR,
872 			       POLLRDBAND };
873 	struct proc *p = l->l_proc;
874 	struct filedesc	*fdp;
875 	int msk, i, j, fd, n;
876 	fd_mask ibits, obits;
877 	struct file *fp;
878 
879 	fdp = p->p_fd;
880 	n = 0;
881 	for (msk = 0; msk < 3; msk++) {
882 		for (i = 0; i < nfd; i += NFDBITS) {
883 			ibits = *ibitp++;
884 			obits = 0;
885 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
886 				ibits &= ~(1 << j);
887 				if ((fp = fd_getfile(fdp, fd)) == NULL)
888 					return (EBADF);
889 				FILE_USE(fp);
890 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
891 					obits |= (1 << j);
892 					n++;
893 				}
894 				FILE_UNUSE(fp, l);
895 			}
896 			*obitp++ = obits;
897 		}
898 	}
899 	*retval = n;
900 	return (0);
901 }
902 
903 /*
904  * Poll system call.
905  */
906 int
907 sys_poll(struct lwp *l, void *v, register_t *retval)
908 {
909 	struct sys_poll_args /* {
910 		syscallarg(struct pollfd *)	fds;
911 		syscallarg(u_int)		nfds;
912 		syscallarg(int)			timeout;
913 	} */ * const uap = v;
914 	struct timeval	atv, *tv = NULL;
915 
916 	if (SCARG(uap, timeout) != INFTIM) {
917 		atv.tv_sec = SCARG(uap, timeout) / 1000;
918 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
919 		tv = &atv;
920 	}
921 
922 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
923 		tv, NULL);
924 }
925 
926 /*
927  * Poll system call.
928  */
929 int
930 sys_pollts(struct lwp *l, void *v, register_t *retval)
931 {
932 	struct sys_pollts_args /* {
933 		syscallarg(struct pollfd *)		fds;
934 		syscallarg(u_int)			nfds;
935 		syscallarg(const struct timespec *)	ts;
936 		syscallarg(const sigset_t *)		mask;
937 	} */ * const uap = v;
938 	struct timespec	ats;
939 	struct timeval	atv, *tv = NULL;
940 	sigset_t	amask, *mask = NULL;
941 	int		error;
942 
943 	if (SCARG(uap, ts)) {
944 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
945 		if (error)
946 			return error;
947 		atv.tv_sec = ats.tv_sec;
948 		atv.tv_usec = ats.tv_nsec / 1000;
949 		tv = &atv;
950 	}
951 	if (SCARG(uap, mask)) {
952 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
953 		if (error)
954 			return error;
955 		mask = &amask;
956 	}
957 
958 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
959 		tv, mask);
960 }
961 
962 int
963 pollcommon(struct lwp *l, register_t *retval,
964 	struct pollfd *u_fds, u_int nfds,
965 	struct timeval *tv, sigset_t *mask)
966 {
967 	char		smallbits[32 * sizeof(struct pollfd)];
968 	struct proc	* const p = l->l_proc;
969 	caddr_t		bits;
970 	sigset_t	oldmask;
971 	int		s, ncoll, error, timo;
972 	size_t		ni;
973 	struct timeval	sleeptv;
974 
975 	if (nfds > p->p_fd->fd_nfiles) {
976 		/* forgiving; slightly wrong */
977 		nfds = p->p_fd->fd_nfiles;
978 	}
979 	ni = nfds * sizeof(struct pollfd);
980 	if (ni > sizeof(smallbits))
981 		bits = malloc(ni, M_TEMP, M_WAITOK);
982 	else
983 		bits = smallbits;
984 
985 	error = copyin(u_fds, bits, ni);
986 	if (error)
987 		goto done;
988 
989 	timo = 0;
990 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
991 		error = EINVAL;
992 		goto done;
993 	}
994 
995 	if (mask != NULL)
996 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
997 
998  retry:
999 	ncoll = nselcoll;
1000 	l->l_flag |= L_SELECT;
1001 	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
1002 	if (error || *retval)
1003 		goto done;
1004 	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
1005 		goto done;
1006 	s = splsched();
1007 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
1008 		splx(s);
1009 		goto retry;
1010 	}
1011 	l->l_flag &= ~L_SELECT;
1012 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
1013 	splx(s);
1014 	if (error == 0)
1015 		goto retry;
1016  done:
1017 	if (mask != NULL)
1018 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
1019 	l->l_flag &= ~L_SELECT;
1020 	/* poll is not restarted after signals... */
1021 	if (error == ERESTART)
1022 		error = EINTR;
1023 	if (error == EWOULDBLOCK)
1024 		error = 0;
1025 	if (error == 0) {
1026 		error = copyout(bits, u_fds, ni);
1027 		if (error)
1028 			goto out;
1029 	}
1030  out:
1031 	if (ni > sizeof(smallbits))
1032 		free(bits, M_TEMP);
1033 	return (error);
1034 }
1035 
1036 int
1037 pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
1038 {
1039 	struct proc	*p = l->l_proc;
1040 	struct filedesc	*fdp;
1041 	int		i, n;
1042 	struct file	*fp;
1043 
1044 	fdp = p->p_fd;
1045 	n = 0;
1046 	for (i = 0; i < nfd; i++, fds++) {
1047 		if (fds->fd >= fdp->fd_nfiles) {
1048 			fds->revents = POLLNVAL;
1049 			n++;
1050 		} else if (fds->fd < 0) {
1051 			fds->revents = 0;
1052 		} else {
1053 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1054 				fds->revents = POLLNVAL;
1055 				n++;
1056 			} else {
1057 				FILE_USE(fp);
1058 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1059 				    fds->events | POLLERR | POLLHUP, l);
1060 				if (fds->revents != 0)
1061 					n++;
1062 				FILE_UNUSE(fp, l);
1063 			}
1064 		}
1065 	}
1066 	*retval = n;
1067 	return (0);
1068 }
1069 
/*
 * Poll routine that reports normal read and write readiness whenever
 * it is asked for: returns the subset of "events" made up of POLLIN,
 * POLLOUT, POLLRDNORM and POLLWRNORM.  "dev" and "l" are not used.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct lwp *l)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
1077 
1078 /*
1079  * Record a select request.
1080  */
1081 void
1082 selrecord(struct lwp *selector, struct selinfo *sip)
1083 {
1084 	struct lwp	*l;
1085 	struct proc	*p;
1086 	pid_t		mypid;
1087 
1088 	mypid = selector->l_proc->p_pid;
1089 	if (sip->sel_pid == mypid)
1090 		return;
1091 	if (sip->sel_pid && (p = pfind(sip->sel_pid))) {
1092 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1093 			if (l->l_wchan == (caddr_t)&selwait) {
1094 				sip->sel_collision = 1;
1095 				return;
1096 			}
1097 		}
1098 	}
1099 
1100 	sip->sel_pid = mypid;
1101 }
1102 
1103 /*
1104  * Do a wakeup when a selectable event occurs.
1105  */
1106 void
1107 selwakeup(sip)
1108 	struct selinfo *sip;
1109 {
1110 	struct lwp *l;
1111 	struct proc *p;
1112 	int s;
1113 
1114 	if (sip->sel_pid == 0)
1115 		return;
1116 	if (sip->sel_collision) {
1117 		sip->sel_pid = 0;
1118 		nselcoll++;
1119 		sip->sel_collision = 0;
1120 		wakeup((caddr_t)&selwait);
1121 		return;
1122 	}
1123 	p = pfind(sip->sel_pid);
1124 	sip->sel_pid = 0;
1125 	if (p != NULL) {
1126 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1127 			SCHED_LOCK(s);
1128 			if (l->l_wchan == (caddr_t)&selwait) {
1129 				if (l->l_stat == LSSLEEP)
1130 					setrunnable(l);
1131 				else
1132 					unsleep(l);
1133 			} else if (l->l_flag & L_SELECT)
1134 				l->l_flag &= ~L_SELECT;
1135 			SCHED_UNLOCK(s);
1136 		}
1137 	}
1138 }
1139