xref: /netbsd-src/sys/kern/sys_generic.c (revision f81322cf185a4db50f71fcf7701f20198272620e)
1 /*	$NetBSD: sys_generic.c,v 1.85 2006/03/01 12:38:21 yamt Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.85 2006/03/01 12:38:21 yamt Exp $");
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
68 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
69 
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct lwp *l, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct proc	*p;
86 	struct filedesc	*fdp;
87 
88 	fd = SCARG(uap, fd);
89 	p = l->l_proc;
90 	fdp = p->p_fd;
91 
92 	if ((fp = fd_getfile(fdp, fd)) == NULL)
93 		return (EBADF);
94 
95 	if ((fp->f_flag & FREAD) == 0) {
96 		simple_unlock(&fp->f_slock);
97 		return (EBADF);
98 	}
99 
100 	FILE_USE(fp);
101 
102 	/* dofileread() will unuse the descriptor for us */
103 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
104 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
105 }
106 
107 int
108 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
109 	off_t *offset, int flags, register_t *retval)
110 {
111 	struct iovec aiov;
112 	struct uio auio;
113 	struct proc *p;
114 	struct vmspace *vm;
115 	size_t cnt;
116 	int error;
117 #ifdef KTRACE
118 	struct iovec	ktriov = {0};
119 #endif
120 	p = l->l_proc;
121 
122 	error = proc_vmspace_getref(p, &vm);
123 	if (error) {
124 		goto out;
125 	}
126 
127 	aiov.iov_base = (caddr_t)buf;
128 	aiov.iov_len = nbyte;
129 	auio.uio_iov = &aiov;
130 	auio.uio_iovcnt = 1;
131 	auio.uio_resid = nbyte;
132 	auio.uio_rw = UIO_READ;
133 	auio.uio_vmspace = vm;
134 
135 	/*
136 	 * Reads return ssize_t because -1 is returned on error.  Therefore
137 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
138 	 * values.
139 	 */
140 	if (auio.uio_resid > SSIZE_MAX) {
141 		error = EINVAL;
142 		goto out;
143 	}
144 
145 #ifdef KTRACE
146 	/*
147 	 * if tracing, save a copy of iovec
148 	 */
149 	if (KTRPOINT(p, KTR_GENIO))
150 		ktriov = aiov;
151 #endif
152 	cnt = auio.uio_resid;
153 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
154 	if (error)
155 		if (auio.uio_resid != cnt && (error == ERESTART ||
156 		    error == EINTR || error == EWOULDBLOCK))
157 			error = 0;
158 	cnt -= auio.uio_resid;
159 #ifdef KTRACE
160 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
161 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
162 #endif
163 	*retval = cnt;
164  out:
165 	FILE_UNUSE(fp, l);
166 	uvmspace_free(vm);
167 	return (error);
168 }
169 
170 /*
171  * Scatter read system call.
172  */
173 int
174 sys_readv(struct lwp *l, void *v, register_t *retval)
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int)				fd;
178 		syscallarg(const struct iovec *)	iovp;
179 		syscallarg(int)				iovcnt;
180 	} */ *uap = v;
181 	struct filedesc	*fdp;
182 	struct file *fp;
183 	struct proc *p;
184 	int fd;
185 
186 	fd = SCARG(uap, fd);
187 	p = l->l_proc;
188 	fdp = p->p_fd;
189 
190 	if ((fp = fd_getfile(fdp, fd)) == NULL)
191 		return (EBADF);
192 
193 	if ((fp->f_flag & FREAD) == 0) {
194 		simple_unlock(&fp->f_slock);
195 		return (EBADF);
196 	}
197 
198 	FILE_USE(fp);
199 
200 	/* dofilereadv() will unuse the descriptor for us */
201 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
202 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
203 }
204 
205 int
206 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
207 	int iovcnt, off_t *offset, int flags, register_t *retval)
208 {
209 	struct proc *p;
210 	struct uio	auio;
211 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
212 	struct vmspace	*vm;
213 	int		i, error;
214 	size_t		cnt;
215 	u_int		iovlen;
216 #ifdef KTRACE
217 	struct iovec	*ktriov;
218 #endif
219 
220 	p = l->l_proc;
221 	error = proc_vmspace_getref(p, &vm);
222 	if (error) {
223 		goto out;
224 	}
225 
226 #ifdef KTRACE
227 	ktriov = NULL;
228 #endif
229 	/* note: can't use iovlen until iovcnt is validated */
230 	iovlen = iovcnt * sizeof(struct iovec);
231 	if ((u_int)iovcnt > UIO_SMALLIOV) {
232 		if ((u_int)iovcnt > IOV_MAX) {
233 			error = EINVAL;
234 			goto out;
235 		}
236 		iov = malloc(iovlen, M_IOV, M_WAITOK);
237 		needfree = iov;
238 	} else if ((u_int)iovcnt > 0) {
239 		iov = aiov;
240 		needfree = NULL;
241 	} else {
242 		error = EINVAL;
243 		goto out;
244 	}
245 
246 	auio.uio_iov = iov;
247 	auio.uio_iovcnt = iovcnt;
248 	auio.uio_rw = UIO_READ;
249 	auio.uio_vmspace = vm;
250 	error = copyin(iovp, iov, iovlen);
251 	if (error)
252 		goto done;
253 	auio.uio_resid = 0;
254 	for (i = 0; i < iovcnt; i++) {
255 		auio.uio_resid += iov->iov_len;
256 		/*
257 		 * Reads return ssize_t because -1 is returned on error.
258 		 * Therefore we must restrict the length to SSIZE_MAX to
259 		 * avoid garbage return values.
260 		 */
261 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
262 			error = EINVAL;
263 			goto done;
264 		}
265 		iov++;
266 	}
267 #ifdef KTRACE
268 	/*
269 	 * if tracing, save a copy of iovec
270 	 */
271 	if (KTRPOINT(p, KTR_GENIO))  {
272 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
273 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
274 	}
275 #endif
276 	cnt = auio.uio_resid;
277 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
278 	if (error)
279 		if (auio.uio_resid != cnt && (error == ERESTART ||
280 		    error == EINTR || error == EWOULDBLOCK))
281 			error = 0;
282 	cnt -= auio.uio_resid;
283 #ifdef KTRACE
284 	if (ktriov != NULL) {
285 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
286 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
287 		free(ktriov, M_TEMP);
288 	}
289 #endif
290 	*retval = cnt;
291  done:
292 	if (needfree)
293 		free(needfree, M_IOV);
294  out:
295 	FILE_UNUSE(fp, l);
296 	uvmspace_free(vm);
297 	return (error);
298 }
299 
300 /*
301  * Write system call
302  */
303 int
304 sys_write(struct lwp *l, void *v, register_t *retval)
305 {
306 	struct sys_write_args /* {
307 		syscallarg(int)			fd;
308 		syscallarg(const void *)	buf;
309 		syscallarg(size_t)		nbyte;
310 	} */ *uap = v;
311 	int		fd;
312 	struct file	*fp;
313 	struct proc	*p;
314 	struct filedesc	*fdp;
315 
316 	fd = SCARG(uap, fd);
317 	p = l->l_proc;
318 	fdp = p->p_fd;
319 
320 	if ((fp = fd_getfile(fdp, fd)) == NULL)
321 		return (EBADF);
322 
323 	if ((fp->f_flag & FWRITE) == 0) {
324 		simple_unlock(&fp->f_slock);
325 		return (EBADF);
326 	}
327 
328 	FILE_USE(fp);
329 
330 	/* dofilewrite() will unuse the descriptor for us */
331 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
332 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
333 }
334 
335 int
336 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
337 	size_t nbyte, off_t *offset, int flags, register_t *retval)
338 {
339 	struct iovec aiov;
340 	struct uio auio;
341 	struct proc *p;
342 	struct vmspace *vm;
343 	size_t cnt;
344 	int error;
345 #ifdef KTRACE
346 	struct iovec	ktriov = {0};
347 #endif
348 
349 	p = l->l_proc;
350 	error = proc_vmspace_getref(p, &vm);
351 	if (error) {
352 		goto out;
353 	}
354 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
355 	aiov.iov_len = nbyte;
356 	auio.uio_iov = &aiov;
357 	auio.uio_iovcnt = 1;
358 	auio.uio_resid = nbyte;
359 	auio.uio_rw = UIO_WRITE;
360 	auio.uio_vmspace = vm;
361 
362 	/*
363 	 * Writes return ssize_t because -1 is returned on error.  Therefore
364 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
365 	 * values.
366 	 */
367 	if (auio.uio_resid > SSIZE_MAX) {
368 		error = EINVAL;
369 		goto out;
370 	}
371 
372 #ifdef KTRACE
373 	/*
374 	 * if tracing, save a copy of iovec
375 	 */
376 	if (KTRPOINT(p, KTR_GENIO))
377 		ktriov = aiov;
378 #endif
379 	cnt = auio.uio_resid;
380 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
381 	if (error) {
382 		if (auio.uio_resid != cnt && (error == ERESTART ||
383 		    error == EINTR || error == EWOULDBLOCK))
384 			error = 0;
385 		if (error == EPIPE)
386 			psignal(p, SIGPIPE);
387 	}
388 	cnt -= auio.uio_resid;
389 #ifdef KTRACE
390 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
391 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
392 #endif
393 	*retval = cnt;
394  out:
395 	FILE_UNUSE(fp, l);
396 	uvmspace_free(vm);
397 	return (error);
398 }
399 
400 /*
401  * Gather write system call
402  */
403 int
404 sys_writev(struct lwp *l, void *v, register_t *retval)
405 {
406 	struct sys_writev_args /* {
407 		syscallarg(int)				fd;
408 		syscallarg(const struct iovec *)	iovp;
409 		syscallarg(int)				iovcnt;
410 	} */ *uap = v;
411 	int		fd;
412 	struct file	*fp;
413 	struct proc	*p;
414 	struct filedesc	*fdp;
415 
416 	fd = SCARG(uap, fd);
417 	p = l->l_proc;
418 	fdp = p->p_fd;
419 
420 	if ((fp = fd_getfile(fdp, fd)) == NULL)
421 		return (EBADF);
422 
423 	if ((fp->f_flag & FWRITE) == 0) {
424 		simple_unlock(&fp->f_slock);
425 		return (EBADF);
426 	}
427 
428 	FILE_USE(fp);
429 
430 	/* dofilewritev() will unuse the descriptor for us */
431 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
432 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
433 }
434 
435 int
436 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
437 	int iovcnt, off_t *offset, int flags, register_t *retval)
438 {
439 	struct proc	*p;
440 	struct uio	auio;
441 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
442 	struct vmspace	*vm;
443 	int		i, error;
444 	size_t		cnt;
445 	u_int		iovlen;
446 #ifdef KTRACE
447 	struct iovec	*ktriov;
448 #endif
449 
450 	p = l->l_proc;
451 	error = proc_vmspace_getref(p, &vm);
452 	if (error) {
453 		goto out;
454 	}
455 #ifdef KTRACE
456 	ktriov = NULL;
457 #endif
458 	/* note: can't use iovlen until iovcnt is validated */
459 	iovlen = iovcnt * sizeof(struct iovec);
460 	if ((u_int)iovcnt > UIO_SMALLIOV) {
461 		if ((u_int)iovcnt > IOV_MAX) {
462 			error = EINVAL;
463 			goto out;
464 		}
465 		iov = malloc(iovlen, M_IOV, M_WAITOK);
466 		needfree = iov;
467 	} else if ((u_int)iovcnt > 0) {
468 		iov = aiov;
469 		needfree = NULL;
470 	} else {
471 		error = EINVAL;
472 		goto out;
473 	}
474 
475 	auio.uio_iov = iov;
476 	auio.uio_iovcnt = iovcnt;
477 	auio.uio_rw = UIO_WRITE;
478 	auio.uio_vmspace = vm;
479 	error = copyin(iovp, iov, iovlen);
480 	if (error)
481 		goto done;
482 	auio.uio_resid = 0;
483 	for (i = 0; i < iovcnt; i++) {
484 		auio.uio_resid += iov->iov_len;
485 		/*
486 		 * Writes return ssize_t because -1 is returned on error.
487 		 * Therefore we must restrict the length to SSIZE_MAX to
488 		 * avoid garbage return values.
489 		 */
490 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
491 			error = EINVAL;
492 			goto done;
493 		}
494 		iov++;
495 	}
496 #ifdef KTRACE
497 	/*
498 	 * if tracing, save a copy of iovec
499 	 */
500 	if (KTRPOINT(p, KTR_GENIO))  {
501 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
502 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
503 	}
504 #endif
505 	cnt = auio.uio_resid;
506 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
507 	if (error) {
508 		if (auio.uio_resid != cnt && (error == ERESTART ||
509 		    error == EINTR || error == EWOULDBLOCK))
510 			error = 0;
511 		if (error == EPIPE)
512 			psignal(p, SIGPIPE);
513 	}
514 	cnt -= auio.uio_resid;
515 #ifdef KTRACE
516 	if (ktriov != NULL) {
517 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
518 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
519 		free(ktriov, M_TEMP);
520 	}
521 #endif
522 	*retval = cnt;
523  done:
524 	if (needfree)
525 		free(needfree, M_IOV);
526  out:
527 	FILE_UNUSE(fp, l);
528 	uvmspace_free(vm);
529 	return (error);
530 }
531 
532 /*
533  * Ioctl system call
534  */
535 /* ARGSUSED */
536 int
537 sys_ioctl(struct lwp *l, void *v, register_t *retval)
538 {
539 	struct sys_ioctl_args /* {
540 		syscallarg(int)		fd;
541 		syscallarg(u_long)	com;
542 		syscallarg(caddr_t)	data;
543 	} */ *uap = v;
544 	struct file	*fp;
545 	struct proc	*p;
546 	struct filedesc	*fdp;
547 	u_long		com;
548 	int		error;
549 	u_int		size;
550 	caddr_t		data, memp;
551 #define	STK_PARAMS	128
552 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
553 
554 	error = 0;
555 	p = l->l_proc;
556 	fdp = p->p_fd;
557 
558 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
559 		return (EBADF);
560 
561 	FILE_USE(fp);
562 
563 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
564 		error = EBADF;
565 		com = 0;
566 		goto out;
567 	}
568 
569 	switch (com = SCARG(uap, com)) {
570 	case FIONCLEX:
571 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
572 		goto out;
573 
574 	case FIOCLEX:
575 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
576 		goto out;
577 	}
578 
579 	/*
580 	 * Interpret high order word to find amount of data to be
581 	 * copied to/from the user's address space.
582 	 */
583 	size = IOCPARM_LEN(com);
584 	if (size > IOCPARM_MAX) {
585 		error = ENOTTY;
586 		goto out;
587 	}
588 	memp = NULL;
589 	if (size > sizeof(stkbuf)) {
590 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
591 		data = memp;
592 	} else
593 		data = (caddr_t)stkbuf;
594 	if (com&IOC_IN) {
595 		if (size) {
596 			error = copyin(SCARG(uap, data), data, size);
597 			if (error) {
598 				if (memp)
599 					free(memp, M_IOCTLOPS);
600 				goto out;
601 			}
602 #ifdef KTRACE
603 			if (KTRPOINT(p, KTR_GENIO)) {
604 				struct iovec iov;
605 				iov.iov_base = SCARG(uap, data);
606 				iov.iov_len = size;
607 				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
608 					size, 0);
609 			}
610 #endif
611 		} else
612 			*(caddr_t *)data = SCARG(uap, data);
613 	} else if ((com&IOC_OUT) && size)
614 		/*
615 		 * Zero the buffer so the user always
616 		 * gets back something deterministic.
617 		 */
618 		memset(data, 0, size);
619 	else if (com&IOC_VOID)
620 		*(caddr_t *)data = SCARG(uap, data);
621 
622 	switch (com) {
623 
624 	case FIONBIO:
625 		if (*(int *)data != 0)
626 			fp->f_flag |= FNONBLOCK;
627 		else
628 			fp->f_flag &= ~FNONBLOCK;
629 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
630 		break;
631 
632 	case FIOASYNC:
633 		if (*(int *)data != 0)
634 			fp->f_flag |= FASYNC;
635 		else
636 			fp->f_flag &= ~FASYNC;
637 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
638 		break;
639 
640 	default:
641 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
642 		/*
643 		 * Copy any data to user, size was
644 		 * already set and checked above.
645 		 */
646 		if (error == 0 && (com&IOC_OUT) && size) {
647 			error = copyout(data, SCARG(uap, data), size);
648 #ifdef KTRACE
649 			if (KTRPOINT(p, KTR_GENIO)) {
650 				struct iovec iov;
651 				iov.iov_base = SCARG(uap, data);
652 				iov.iov_len = size;
653 				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
654 					size, error);
655 			}
656 #endif
657 		}
658 		break;
659 	}
660 	if (memp)
661 		free(memp, M_IOCTLOPS);
662  out:
663 	FILE_UNUSE(fp, l);
664 	switch (error) {
665 	case -1:
666 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
667 		    "pid=%d comm=%s\n",
668 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
669 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
670 		    p->p_pid, p->p_comm);
671 		/* FALLTHROUGH */
672 	case EPASSTHROUGH:
673 		error = ENOTTY;
674 		/* FALLTHROUGH */
675 	default:
676 		return (error);
677 	}
678 }
679 
int	selwait;	/* its address is the sleep channel for select/poll */
int	nselcoll;	/* bumped by selwakeup() on every collision wakeup */
681 
682 /*
683  * Select system call.
684  */
685 int
686 sys_pselect(struct lwp *l, void *v, register_t *retval)
687 {
688 	struct sys_pselect_args /* {
689 		syscallarg(int)				nd;
690 		syscallarg(fd_set *)			in;
691 		syscallarg(fd_set *)			ou;
692 		syscallarg(fd_set *)			ex;
693 		syscallarg(const struct timespec *)	ts;
694 		syscallarg(sigset_t *)			mask;
695 	} */ * const uap = v;
696 	struct timespec	ats;
697 	struct timeval	atv, *tv = NULL;
698 	sigset_t	amask, *mask = NULL;
699 	int		error;
700 
701 	if (SCARG(uap, ts)) {
702 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
703 		if (error)
704 			return error;
705 		atv.tv_sec = ats.tv_sec;
706 		atv.tv_usec = ats.tv_nsec / 1000;
707 		tv = &atv;
708 	}
709 	if (SCARG(uap, mask) != NULL) {
710 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
711 		if (error)
712 			return error;
713 		mask = &amask;
714 	}
715 
716 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
717 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
718 }
719 
720 int
721 sys_select(struct lwp *l, void *v, register_t *retval)
722 {
723 	struct sys_select_args /* {
724 		syscallarg(int)			nd;
725 		syscallarg(fd_set *)		in;
726 		syscallarg(fd_set *)		ou;
727 		syscallarg(fd_set *)		ex;
728 		syscallarg(struct timeval *)	tv;
729 	} */ * const uap = v;
730 	struct timeval atv, *tv = NULL;
731 	int error;
732 
733 	if (SCARG(uap, tv)) {
734 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
735 			sizeof(atv));
736 		if (error)
737 			return error;
738 		tv = &atv;
739 	}
740 
741 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
742 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
743 }
744 
745 int
746 selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
747 	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
748 {
749 	struct proc	* const p = l->l_proc;
750 	caddr_t		bits;
751 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
752 			    sizeof(fd_mask) * 6];
753 	int		s, ncoll, error, timo;
754 	size_t		ni;
755 	sigset_t	oldmask;
756 
757 	error = 0;
758 	if (nd < 0)
759 		return (EINVAL);
760 	if (nd > p->p_fd->fd_nfiles) {
761 		/* forgiving; slightly wrong */
762 		nd = p->p_fd->fd_nfiles;
763 	}
764 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
765 	if (ni * 6 > sizeof(smallbits))
766 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
767 	else
768 		bits = smallbits;
769 
770 #define	getbits(name, x)						\
771 	if (u_ ## name) {						\
772 		error = copyin(u_ ## name, bits + ni * x, ni);		\
773 		if (error)						\
774 			goto done;					\
775 	} else								\
776 		memset(bits + ni * x, 0, ni);
777 	getbits(in, 0);
778 	getbits(ou, 1);
779 	getbits(ex, 2);
780 #undef	getbits
781 
782 	timo = 0;
783 	if (tv) {
784 		if (itimerfix(tv)) {
785 			error = EINVAL;
786 			goto done;
787 		}
788 		s = splclock();
789 		timeradd(tv, &time, tv);
790 		splx(s);
791 	}
792 	if (mask)
793 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
794 
795  retry:
796 	ncoll = nselcoll;
797 	l->l_flag |= L_SELECT;
798 	error = selscan(l, (fd_mask *)(bits + ni * 0),
799 			   (fd_mask *)(bits + ni * 3), nd, retval);
800 	if (error || *retval)
801 		goto done;
802 	if (tv) {
803 		/*
804 		 * We have to recalculate the timeout on every retry.
805 		 */
806 		timo = hzto(tv);
807 		if (timo <= 0)
808 			goto done;
809 	}
810 	s = splsched();
811 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
812 		splx(s);
813 		goto retry;
814 	}
815 	l->l_flag &= ~L_SELECT;
816 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
817 	splx(s);
818 	if (error == 0)
819 		goto retry;
820  done:
821 	if (mask)
822 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
823 	l->l_flag &= ~L_SELECT;
824 	/* select is not restarted after signals... */
825 	if (error == ERESTART)
826 		error = EINTR;
827 	if (error == EWOULDBLOCK)
828 		error = 0;
829 	if (error == 0) {
830 
831 #define	putbits(name, x)						\
832 		if (u_ ## name) {					\
833 			error = copyout(bits + ni * x, u_ ## name, ni); \
834 			if (error)					\
835 				goto out;				\
836 		}
837 		putbits(in, 3);
838 		putbits(ou, 4);
839 		putbits(ex, 5);
840 #undef putbits
841 	}
842  out:
843 	if (ni * 6 > sizeof(smallbits))
844 		free(bits, M_TEMP);
845 	return (error);
846 }
847 
848 int
849 selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
850 	register_t *retval)
851 {
852 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
853 			       POLLWRNORM | POLLHUP | POLLERR,
854 			       POLLRDBAND };
855 	struct proc *p = l->l_proc;
856 	struct filedesc	*fdp;
857 	int msk, i, j, fd, n;
858 	fd_mask ibits, obits;
859 	struct file *fp;
860 
861 	fdp = p->p_fd;
862 	n = 0;
863 	for (msk = 0; msk < 3; msk++) {
864 		for (i = 0; i < nfd; i += NFDBITS) {
865 			ibits = *ibitp++;
866 			obits = 0;
867 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
868 				ibits &= ~(1 << j);
869 				if ((fp = fd_getfile(fdp, fd)) == NULL)
870 					return (EBADF);
871 				FILE_USE(fp);
872 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
873 					obits |= (1 << j);
874 					n++;
875 				}
876 				FILE_UNUSE(fp, l);
877 			}
878 			*obitp++ = obits;
879 		}
880 	}
881 	*retval = n;
882 	return (0);
883 }
884 
885 /*
886  * Poll system call.
887  */
888 int
889 sys_poll(struct lwp *l, void *v, register_t *retval)
890 {
891 	struct sys_poll_args /* {
892 		syscallarg(struct pollfd *)	fds;
893 		syscallarg(u_int)		nfds;
894 		syscallarg(int)			timeout;
895 	} */ * const uap = v;
896 	struct timeval	atv, *tv = NULL;
897 
898 	if (SCARG(uap, timeout) != INFTIM) {
899 		atv.tv_sec = SCARG(uap, timeout) / 1000;
900 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
901 		tv = &atv;
902 	}
903 
904 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
905 		tv, NULL);
906 }
907 
908 /*
909  * Poll system call.
910  */
911 int
912 sys_pollts(struct lwp *l, void *v, register_t *retval)
913 {
914 	struct sys_pollts_args /* {
915 		syscallarg(struct pollfd *)		fds;
916 		syscallarg(u_int)			nfds;
917 		syscallarg(const struct timespec *)	ts;
918 		syscallarg(const sigset_t *)		mask;
919 	} */ * const uap = v;
920 	struct timespec	ats;
921 	struct timeval	atv, *tv = NULL;
922 	sigset_t	amask, *mask = NULL;
923 	int		error;
924 
925 	if (SCARG(uap, ts)) {
926 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
927 		if (error)
928 			return error;
929 		atv.tv_sec = ats.tv_sec;
930 		atv.tv_usec = ats.tv_nsec / 1000;
931 		tv = &atv;
932 	}
933 	if (SCARG(uap, mask)) {
934 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
935 		if (error)
936 			return error;
937 		mask = &amask;
938 	}
939 
940 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
941 		tv, mask);
942 }
943 
944 int
945 pollcommon(struct lwp *l, register_t *retval,
946 	struct pollfd *u_fds, u_int nfds,
947 	struct timeval *tv, sigset_t *mask)
948 {
949 	struct proc	* const p = l->l_proc;
950 	caddr_t		bits;
951 	char		smallbits[32 * sizeof(struct pollfd)];
952 	sigset_t	oldmask;
953 	int		s, ncoll, error, timo;
954 	size_t		ni;
955 
956 	if (nfds > p->p_fd->fd_nfiles) {
957 		/* forgiving; slightly wrong */
958 		nfds = p->p_fd->fd_nfiles;
959 	}
960 	ni = nfds * sizeof(struct pollfd);
961 	if (ni > sizeof(smallbits))
962 		bits = malloc(ni, M_TEMP, M_WAITOK);
963 	else
964 		bits = smallbits;
965 
966 	error = copyin(u_fds, bits, ni);
967 	if (error)
968 		goto done;
969 
970 	timo = 0;
971 	if (tv) {
972 		if (itimerfix(tv)) {
973 			error = EINVAL;
974 			goto done;
975 		}
976 		s = splclock();
977 		timeradd(tv, &time, tv);
978 		splx(s);
979 	}
980 	if (mask != NULL)
981 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
982 
983  retry:
984 	ncoll = nselcoll;
985 	l->l_flag |= L_SELECT;
986 	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
987 	if (error || *retval)
988 		goto done;
989 	if (tv) {
990 		/*
991 		 * We have to recalculate the timeout on every retry.
992 		 */
993 		timo = hzto(tv);
994 		if (timo <= 0)
995 			goto done;
996 	}
997 	s = splsched();
998 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
999 		splx(s);
1000 		goto retry;
1001 	}
1002 	l->l_flag &= ~L_SELECT;
1003 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
1004 	splx(s);
1005 	if (error == 0)
1006 		goto retry;
1007  done:
1008 	if (mask != NULL)
1009 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
1010 	l->l_flag &= ~L_SELECT;
1011 	/* poll is not restarted after signals... */
1012 	if (error == ERESTART)
1013 		error = EINTR;
1014 	if (error == EWOULDBLOCK)
1015 		error = 0;
1016 	if (error == 0) {
1017 		error = copyout(bits, u_fds, ni);
1018 		if (error)
1019 			goto out;
1020 	}
1021  out:
1022 	if (ni > sizeof(smallbits))
1023 		free(bits, M_TEMP);
1024 	return (error);
1025 }
1026 
1027 int
1028 pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
1029 {
1030 	struct proc	*p = l->l_proc;
1031 	struct filedesc	*fdp;
1032 	int		i, n;
1033 	struct file	*fp;
1034 
1035 	fdp = p->p_fd;
1036 	n = 0;
1037 	for (i = 0; i < nfd; i++, fds++) {
1038 		if (fds->fd >= fdp->fd_nfiles) {
1039 			fds->revents = POLLNVAL;
1040 			n++;
1041 		} else if (fds->fd < 0) {
1042 			fds->revents = 0;
1043 		} else {
1044 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1045 				fds->revents = POLLNVAL;
1046 				n++;
1047 			} else {
1048 				FILE_USE(fp);
1049 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1050 				    fds->events | POLLERR | POLLHUP, l);
1051 				if (fds->revents != 0)
1052 					n++;
1053 				FILE_UNUSE(fp, l);
1054 			}
1055 		}
1056 	}
1057 	*retval = n;
1058 	return (0);
1059 }
1060 
/*
 * Return the subset of the requested events that are ordinary read or
 * write readiness events (POLLIN, POLLOUT, POLLRDNORM, POLLWRNORM).
 * dev and l are ignored.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct lwp *l)
{
	const int	rw_events = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & rw_events);
}
1068 
1069 /*
1070  * Record a select request.
1071  */
1072 void
1073 selrecord(struct lwp *selector, struct selinfo *sip)
1074 {
1075 	struct lwp	*l;
1076 	struct proc	*p;
1077 	pid_t		mypid;
1078 
1079 	mypid = selector->l_proc->p_pid;
1080 	if (sip->sel_pid == mypid)
1081 		return;
1082 	if (sip->sel_pid && (p = pfind(sip->sel_pid))) {
1083 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1084 			if (l->l_wchan == (caddr_t)&selwait) {
1085 				sip->sel_collision = 1;
1086 				return;
1087 			}
1088 		}
1089 	}
1090 
1091 	sip->sel_pid = mypid;
1092 }
1093 
1094 /*
1095  * Do a wakeup when a selectable event occurs.
1096  */
1097 void
1098 selwakeup(sip)
1099 	struct selinfo *sip;
1100 {
1101 	struct lwp *l;
1102 	struct proc *p;
1103 	int s;
1104 
1105 	if (sip->sel_pid == 0)
1106 		return;
1107 	if (sip->sel_collision) {
1108 		sip->sel_pid = 0;
1109 		nselcoll++;
1110 		sip->sel_collision = 0;
1111 		wakeup((caddr_t)&selwait);
1112 		return;
1113 	}
1114 	p = pfind(sip->sel_pid);
1115 	sip->sel_pid = 0;
1116 	if (p != NULL) {
1117 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1118 			SCHED_LOCK(s);
1119 			if (l->l_wchan == (caddr_t)&selwait) {
1120 				if (l->l_stat == LSSLEEP)
1121 					setrunnable(l);
1122 				else
1123 					unsleep(l);
1124 			} else if (l->l_flag & L_SELECT)
1125 				l->l_flag &= ~L_SELECT;
1126 			SCHED_UNLOCK(s);
1127 		}
1128 	}
1129 }
1130