xref: /netbsd-src/sys/kern/sys_generic.c (revision d48f14661dda8638fee055ba15d35bdfb29b9fa8)
1 /*	$NetBSD: sys_generic.c,v 1.86 2006/06/07 22:33:40 kardel Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.86 2006/06/07 22:33:40 kardel Exp $");
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
68 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
69 
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct lwp *l, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct proc	*p;
86 	struct filedesc	*fdp;
87 
88 	fd = SCARG(uap, fd);
89 	p = l->l_proc;
90 	fdp = p->p_fd;
91 
92 	if ((fp = fd_getfile(fdp, fd)) == NULL)
93 		return (EBADF);
94 
95 	if ((fp->f_flag & FREAD) == 0) {
96 		simple_unlock(&fp->f_slock);
97 		return (EBADF);
98 	}
99 
100 	FILE_USE(fp);
101 
102 	/* dofileread() will unuse the descriptor for us */
103 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
104 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
105 }
106 
107 int
108 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
109 	off_t *offset, int flags, register_t *retval)
110 {
111 	struct iovec aiov;
112 	struct uio auio;
113 	struct proc *p;
114 	struct vmspace *vm;
115 	size_t cnt;
116 	int error;
117 #ifdef KTRACE
118 	struct iovec	ktriov = {0};
119 #endif
120 	p = l->l_proc;
121 
122 	error = proc_vmspace_getref(p, &vm);
123 	if (error) {
124 		goto out;
125 	}
126 
127 	aiov.iov_base = (caddr_t)buf;
128 	aiov.iov_len = nbyte;
129 	auio.uio_iov = &aiov;
130 	auio.uio_iovcnt = 1;
131 	auio.uio_resid = nbyte;
132 	auio.uio_rw = UIO_READ;
133 	auio.uio_vmspace = vm;
134 
135 	/*
136 	 * Reads return ssize_t because -1 is returned on error.  Therefore
137 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
138 	 * values.
139 	 */
140 	if (auio.uio_resid > SSIZE_MAX) {
141 		error = EINVAL;
142 		goto out;
143 	}
144 
145 #ifdef KTRACE
146 	/*
147 	 * if tracing, save a copy of iovec
148 	 */
149 	if (KTRPOINT(p, KTR_GENIO))
150 		ktriov = aiov;
151 #endif
152 	cnt = auio.uio_resid;
153 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
154 	if (error)
155 		if (auio.uio_resid != cnt && (error == ERESTART ||
156 		    error == EINTR || error == EWOULDBLOCK))
157 			error = 0;
158 	cnt -= auio.uio_resid;
159 #ifdef KTRACE
160 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
161 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
162 #endif
163 	*retval = cnt;
164  out:
165 	FILE_UNUSE(fp, l);
166 	uvmspace_free(vm);
167 	return (error);
168 }
169 
170 /*
171  * Scatter read system call.
172  */
173 int
174 sys_readv(struct lwp *l, void *v, register_t *retval)
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int)				fd;
178 		syscallarg(const struct iovec *)	iovp;
179 		syscallarg(int)				iovcnt;
180 	} */ *uap = v;
181 	struct filedesc	*fdp;
182 	struct file *fp;
183 	struct proc *p;
184 	int fd;
185 
186 	fd = SCARG(uap, fd);
187 	p = l->l_proc;
188 	fdp = p->p_fd;
189 
190 	if ((fp = fd_getfile(fdp, fd)) == NULL)
191 		return (EBADF);
192 
193 	if ((fp->f_flag & FREAD) == 0) {
194 		simple_unlock(&fp->f_slock);
195 		return (EBADF);
196 	}
197 
198 	FILE_USE(fp);
199 
200 	/* dofilereadv() will unuse the descriptor for us */
201 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
202 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
203 }
204 
205 int
206 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
207 	int iovcnt, off_t *offset, int flags, register_t *retval)
208 {
209 	struct proc *p;
210 	struct uio	auio;
211 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
212 	struct vmspace	*vm;
213 	int		i, error;
214 	size_t		cnt;
215 	u_int		iovlen;
216 #ifdef KTRACE
217 	struct iovec	*ktriov;
218 #endif
219 
220 	p = l->l_proc;
221 	error = proc_vmspace_getref(p, &vm);
222 	if (error) {
223 		goto out;
224 	}
225 
226 #ifdef KTRACE
227 	ktriov = NULL;
228 #endif
229 	/* note: can't use iovlen until iovcnt is validated */
230 	iovlen = iovcnt * sizeof(struct iovec);
231 	if ((u_int)iovcnt > UIO_SMALLIOV) {
232 		if ((u_int)iovcnt > IOV_MAX) {
233 			error = EINVAL;
234 			goto out;
235 		}
236 		iov = malloc(iovlen, M_IOV, M_WAITOK);
237 		needfree = iov;
238 	} else if ((u_int)iovcnt > 0) {
239 		iov = aiov;
240 		needfree = NULL;
241 	} else {
242 		error = EINVAL;
243 		goto out;
244 	}
245 
246 	auio.uio_iov = iov;
247 	auio.uio_iovcnt = iovcnt;
248 	auio.uio_rw = UIO_READ;
249 	auio.uio_vmspace = vm;
250 	error = copyin(iovp, iov, iovlen);
251 	if (error)
252 		goto done;
253 	auio.uio_resid = 0;
254 	for (i = 0; i < iovcnt; i++) {
255 		auio.uio_resid += iov->iov_len;
256 		/*
257 		 * Reads return ssize_t because -1 is returned on error.
258 		 * Therefore we must restrict the length to SSIZE_MAX to
259 		 * avoid garbage return values.
260 		 */
261 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
262 			error = EINVAL;
263 			goto done;
264 		}
265 		iov++;
266 	}
267 #ifdef KTRACE
268 	/*
269 	 * if tracing, save a copy of iovec
270 	 */
271 	if (KTRPOINT(p, KTR_GENIO))  {
272 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
273 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
274 	}
275 #endif
276 	cnt = auio.uio_resid;
277 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
278 	if (error)
279 		if (auio.uio_resid != cnt && (error == ERESTART ||
280 		    error == EINTR || error == EWOULDBLOCK))
281 			error = 0;
282 	cnt -= auio.uio_resid;
283 #ifdef KTRACE
284 	if (ktriov != NULL) {
285 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
286 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
287 		free(ktriov, M_TEMP);
288 	}
289 #endif
290 	*retval = cnt;
291  done:
292 	if (needfree)
293 		free(needfree, M_IOV);
294  out:
295 	FILE_UNUSE(fp, l);
296 	uvmspace_free(vm);
297 	return (error);
298 }
299 
300 /*
301  * Write system call
302  */
303 int
304 sys_write(struct lwp *l, void *v, register_t *retval)
305 {
306 	struct sys_write_args /* {
307 		syscallarg(int)			fd;
308 		syscallarg(const void *)	buf;
309 		syscallarg(size_t)		nbyte;
310 	} */ *uap = v;
311 	int		fd;
312 	struct file	*fp;
313 	struct proc	*p;
314 	struct filedesc	*fdp;
315 
316 	fd = SCARG(uap, fd);
317 	p = l->l_proc;
318 	fdp = p->p_fd;
319 
320 	if ((fp = fd_getfile(fdp, fd)) == NULL)
321 		return (EBADF);
322 
323 	if ((fp->f_flag & FWRITE) == 0) {
324 		simple_unlock(&fp->f_slock);
325 		return (EBADF);
326 	}
327 
328 	FILE_USE(fp);
329 
330 	/* dofilewrite() will unuse the descriptor for us */
331 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
332 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
333 }
334 
335 int
336 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
337 	size_t nbyte, off_t *offset, int flags, register_t *retval)
338 {
339 	struct iovec aiov;
340 	struct uio auio;
341 	struct proc *p;
342 	struct vmspace *vm;
343 	size_t cnt;
344 	int error;
345 #ifdef KTRACE
346 	struct iovec	ktriov = {0};
347 #endif
348 
349 	p = l->l_proc;
350 	error = proc_vmspace_getref(p, &vm);
351 	if (error) {
352 		goto out;
353 	}
354 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
355 	aiov.iov_len = nbyte;
356 	auio.uio_iov = &aiov;
357 	auio.uio_iovcnt = 1;
358 	auio.uio_resid = nbyte;
359 	auio.uio_rw = UIO_WRITE;
360 	auio.uio_vmspace = vm;
361 
362 	/*
363 	 * Writes return ssize_t because -1 is returned on error.  Therefore
364 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
365 	 * values.
366 	 */
367 	if (auio.uio_resid > SSIZE_MAX) {
368 		error = EINVAL;
369 		goto out;
370 	}
371 
372 #ifdef KTRACE
373 	/*
374 	 * if tracing, save a copy of iovec
375 	 */
376 	if (KTRPOINT(p, KTR_GENIO))
377 		ktriov = aiov;
378 #endif
379 	cnt = auio.uio_resid;
380 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
381 	if (error) {
382 		if (auio.uio_resid != cnt && (error == ERESTART ||
383 		    error == EINTR || error == EWOULDBLOCK))
384 			error = 0;
385 		if (error == EPIPE)
386 			psignal(p, SIGPIPE);
387 	}
388 	cnt -= auio.uio_resid;
389 #ifdef KTRACE
390 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
391 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
392 #endif
393 	*retval = cnt;
394  out:
395 	FILE_UNUSE(fp, l);
396 	uvmspace_free(vm);
397 	return (error);
398 }
399 
400 /*
401  * Gather write system call
402  */
403 int
404 sys_writev(struct lwp *l, void *v, register_t *retval)
405 {
406 	struct sys_writev_args /* {
407 		syscallarg(int)				fd;
408 		syscallarg(const struct iovec *)	iovp;
409 		syscallarg(int)				iovcnt;
410 	} */ *uap = v;
411 	int		fd;
412 	struct file	*fp;
413 	struct proc	*p;
414 	struct filedesc	*fdp;
415 
416 	fd = SCARG(uap, fd);
417 	p = l->l_proc;
418 	fdp = p->p_fd;
419 
420 	if ((fp = fd_getfile(fdp, fd)) == NULL)
421 		return (EBADF);
422 
423 	if ((fp->f_flag & FWRITE) == 0) {
424 		simple_unlock(&fp->f_slock);
425 		return (EBADF);
426 	}
427 
428 	FILE_USE(fp);
429 
430 	/* dofilewritev() will unuse the descriptor for us */
431 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
432 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
433 }
434 
435 int
436 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
437 	int iovcnt, off_t *offset, int flags, register_t *retval)
438 {
439 	struct proc	*p;
440 	struct uio	auio;
441 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
442 	struct vmspace	*vm;
443 	int		i, error;
444 	size_t		cnt;
445 	u_int		iovlen;
446 #ifdef KTRACE
447 	struct iovec	*ktriov;
448 #endif
449 
450 	p = l->l_proc;
451 	error = proc_vmspace_getref(p, &vm);
452 	if (error) {
453 		goto out;
454 	}
455 #ifdef KTRACE
456 	ktriov = NULL;
457 #endif
458 	/* note: can't use iovlen until iovcnt is validated */
459 	iovlen = iovcnt * sizeof(struct iovec);
460 	if ((u_int)iovcnt > UIO_SMALLIOV) {
461 		if ((u_int)iovcnt > IOV_MAX) {
462 			error = EINVAL;
463 			goto out;
464 		}
465 		iov = malloc(iovlen, M_IOV, M_WAITOK);
466 		needfree = iov;
467 	} else if ((u_int)iovcnt > 0) {
468 		iov = aiov;
469 		needfree = NULL;
470 	} else {
471 		error = EINVAL;
472 		goto out;
473 	}
474 
475 	auio.uio_iov = iov;
476 	auio.uio_iovcnt = iovcnt;
477 	auio.uio_rw = UIO_WRITE;
478 	auio.uio_vmspace = vm;
479 	error = copyin(iovp, iov, iovlen);
480 	if (error)
481 		goto done;
482 	auio.uio_resid = 0;
483 	for (i = 0; i < iovcnt; i++) {
484 		auio.uio_resid += iov->iov_len;
485 		/*
486 		 * Writes return ssize_t because -1 is returned on error.
487 		 * Therefore we must restrict the length to SSIZE_MAX to
488 		 * avoid garbage return values.
489 		 */
490 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
491 			error = EINVAL;
492 			goto done;
493 		}
494 		iov++;
495 	}
496 #ifdef KTRACE
497 	/*
498 	 * if tracing, save a copy of iovec
499 	 */
500 	if (KTRPOINT(p, KTR_GENIO))  {
501 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
502 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
503 	}
504 #endif
505 	cnt = auio.uio_resid;
506 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
507 	if (error) {
508 		if (auio.uio_resid != cnt && (error == ERESTART ||
509 		    error == EINTR || error == EWOULDBLOCK))
510 			error = 0;
511 		if (error == EPIPE)
512 			psignal(p, SIGPIPE);
513 	}
514 	cnt -= auio.uio_resid;
515 #ifdef KTRACE
516 	if (ktriov != NULL) {
517 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
518 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
519 		free(ktriov, M_TEMP);
520 	}
521 #endif
522 	*retval = cnt;
523  done:
524 	if (needfree)
525 		free(needfree, M_IOV);
526  out:
527 	FILE_UNUSE(fp, l);
528 	uvmspace_free(vm);
529 	return (error);
530 }
531 
532 /*
533  * Ioctl system call
534  */
535 /* ARGSUSED */
/*
 * Implements ioctl(2): decode the command word, marshal the argument
 * between userland and a kernel buffer as the command's IOC_* direction
 * bits dictate, and dispatch to the file's fo_ioctl method.  FIOCLEX,
 * FIONCLEX, FIONBIO and FIOASYNC are handled (partly) in-line here.
 */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(caddr_t)	data;
	} */ *uap = v;
	struct file	*fp;
	struct proc	*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	caddr_t		data, memp;
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the error report at "out" well-defined */
		goto out;
	}

	/* Close-on-exec requests only touch the descriptor table. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	/* Large arguments go to the heap, small ones onto the stack. */
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Fetch the input argument from userland. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				goto out;
			}
#ifdef KTRACE
			/* Trace the inbound data as a generic I/O write. */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Toggle non-blocking I/O on the open file. */
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Toggle async (SIGIO) notification on the open file. */
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			/* Trace the outbound data as a generic I/O read. */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* Driver bug: old-style -1 return; log it, then ENOTTY. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		/* Drivers return EPASSTHROUGH for unrecognized commands. */
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
679 
/*
 * Wait channel slept on by selcommon()/pollcommon(), and the global
 * collision counter bumped by selwakeup() when multiple processes
 * select on the same descriptor.
 */
int	selwait, nselcoll;
681 
682 /*
683  * Select system call.
684  */
685 int
686 sys_pselect(struct lwp *l, void *v, register_t *retval)
687 {
688 	struct sys_pselect_args /* {
689 		syscallarg(int)				nd;
690 		syscallarg(fd_set *)			in;
691 		syscallarg(fd_set *)			ou;
692 		syscallarg(fd_set *)			ex;
693 		syscallarg(const struct timespec *)	ts;
694 		syscallarg(sigset_t *)			mask;
695 	} */ * const uap = v;
696 	struct timespec	ats;
697 	struct timeval	atv, *tv = NULL;
698 	sigset_t	amask, *mask = NULL;
699 	int		error;
700 
701 	if (SCARG(uap, ts)) {
702 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
703 		if (error)
704 			return error;
705 		atv.tv_sec = ats.tv_sec;
706 		atv.tv_usec = ats.tv_nsec / 1000;
707 		tv = &atv;
708 	}
709 	if (SCARG(uap, mask) != NULL) {
710 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
711 		if (error)
712 			return error;
713 		mask = &amask;
714 	}
715 
716 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
717 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
718 }
719 
720 int
721 sys_select(struct lwp *l, void *v, register_t *retval)
722 {
723 	struct sys_select_args /* {
724 		syscallarg(int)			nd;
725 		syscallarg(fd_set *)		in;
726 		syscallarg(fd_set *)		ou;
727 		syscallarg(fd_set *)		ex;
728 		syscallarg(struct timeval *)	tv;
729 	} */ * const uap = v;
730 	struct timeval atv, *tv = NULL;
731 	int error;
732 
733 	if (SCARG(uap, tv)) {
734 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
735 			sizeof(atv));
736 		if (error)
737 			return error;
738 		tv = &atv;
739 	}
740 
741 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
742 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
743 }
744 
745 int
746 selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
747 	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
748 {
749 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
750 			    sizeof(fd_mask) * 6];
751 	struct proc	* const p = l->l_proc;
752 	caddr_t		bits;
753 	int		s, ncoll, error, timo;
754 	size_t		ni;
755 	sigset_t	oldmask;
756 
757 	error = 0;
758 	if (nd < 0)
759 		return (EINVAL);
760 	if (nd > p->p_fd->fd_nfiles) {
761 		/* forgiving; slightly wrong */
762 		nd = p->p_fd->fd_nfiles;
763 	}
764 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
765 	if (ni * 6 > sizeof(smallbits))
766 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
767 	else
768 		bits = smallbits;
769 
770 #define	getbits(name, x)						\
771 	if (u_ ## name) {						\
772 		error = copyin(u_ ## name, bits + ni * x, ni);		\
773 		if (error)						\
774 			goto done;					\
775 	} else								\
776 		memset(bits + ni * x, 0, ni);
777 	getbits(in, 0);
778 	getbits(ou, 1);
779 	getbits(ex, 2);
780 #undef	getbits
781 
782 	timo = 0;
783 	if (tv && itimerfix(tv)) {
784 		error = EINVAL;
785 		goto done;
786 	}
787 	if (mask)
788 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
789 
790  retry:
791 	ncoll = nselcoll;
792 	l->l_flag |= L_SELECT;
793 	error = selscan(l, (fd_mask *)(bits + ni * 0),
794 			   (fd_mask *)(bits + ni * 3), nd, retval);
795 	if (error || *retval)
796 		goto done;
797 	if (tv) {
798 		/*
799 		 * We have to recalculate the timeout on every retry.
800 		 */
801 		timo = tvtohz(tv);
802 		if (timo <= 0)
803 			goto done;
804 	}
805 	s = splsched();
806 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
807 		splx(s);
808 		goto retry;
809 	}
810 	l->l_flag &= ~L_SELECT;
811 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
812 	splx(s);
813 	if (error == 0)
814 		goto retry;
815  done:
816 	if (mask)
817 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
818 	l->l_flag &= ~L_SELECT;
819 	/* select is not restarted after signals... */
820 	if (error == ERESTART)
821 		error = EINTR;
822 	if (error == EWOULDBLOCK)
823 		error = 0;
824 	if (error == 0) {
825 
826 #define	putbits(name, x)						\
827 		if (u_ ## name) {					\
828 			error = copyout(bits + ni * x, u_ ## name, ni); \
829 			if (error)					\
830 				goto out;				\
831 		}
832 		putbits(in, 3);
833 		putbits(ou, 4);
834 		putbits(ex, 5);
835 #undef putbits
836 	}
837  out:
838 	if (ni * 6 > sizeof(smallbits))
839 		free(bits, M_TEMP);
840 	return (error);
841 }
842 
/*
 * Scan the three select(2) descriptor sets: for each bit set in the
 * input bitmaps, poll the corresponding file and set the matching bit
 * in the output bitmaps if it is ready.  The count of ready
 * descriptors is returned via *retval; EBADF is returned if a set
 * names a descriptor that is not open.
 */
int
selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* Poll events checked for the read/write/except sets in turn. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	struct proc *p = l->l_proc;
	struct filedesc	*fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Walk the set bits; ffs() is 1-based, hence --j. */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
879 
880 /*
881  * Poll system call.
882  */
883 int
884 sys_poll(struct lwp *l, void *v, register_t *retval)
885 {
886 	struct sys_poll_args /* {
887 		syscallarg(struct pollfd *)	fds;
888 		syscallarg(u_int)		nfds;
889 		syscallarg(int)			timeout;
890 	} */ * const uap = v;
891 	struct timeval	atv, *tv = NULL;
892 
893 	if (SCARG(uap, timeout) != INFTIM) {
894 		atv.tv_sec = SCARG(uap, timeout) / 1000;
895 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
896 		tv = &atv;
897 	}
898 
899 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
900 		tv, NULL);
901 }
902 
903 /*
904  * Poll system call.
905  */
906 int
907 sys_pollts(struct lwp *l, void *v, register_t *retval)
908 {
909 	struct sys_pollts_args /* {
910 		syscallarg(struct pollfd *)		fds;
911 		syscallarg(u_int)			nfds;
912 		syscallarg(const struct timespec *)	ts;
913 		syscallarg(const sigset_t *)		mask;
914 	} */ * const uap = v;
915 	struct timespec	ats;
916 	struct timeval	atv, *tv = NULL;
917 	sigset_t	amask, *mask = NULL;
918 	int		error;
919 
920 	if (SCARG(uap, ts)) {
921 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
922 		if (error)
923 			return error;
924 		atv.tv_sec = ats.tv_sec;
925 		atv.tv_usec = ats.tv_nsec / 1000;
926 		tv = &atv;
927 	}
928 	if (SCARG(uap, mask)) {
929 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
930 		if (error)
931 			return error;
932 		mask = &amask;
933 	}
934 
935 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
936 		tv, mask);
937 }
938 
939 int
940 pollcommon(struct lwp *l, register_t *retval,
941 	struct pollfd *u_fds, u_int nfds,
942 	struct timeval *tv, sigset_t *mask)
943 {
944 	char		smallbits[32 * sizeof(struct pollfd)];
945 	struct proc	* const p = l->l_proc;
946 	caddr_t		bits;
947 	sigset_t	oldmask;
948 	int		s, ncoll, error, timo;
949 	size_t		ni;
950 
951 	if (nfds > p->p_fd->fd_nfiles) {
952 		/* forgiving; slightly wrong */
953 		nfds = p->p_fd->fd_nfiles;
954 	}
955 	ni = nfds * sizeof(struct pollfd);
956 	if (ni > sizeof(smallbits))
957 		bits = malloc(ni, M_TEMP, M_WAITOK);
958 	else
959 		bits = smallbits;
960 
961 	error = copyin(u_fds, bits, ni);
962 	if (error)
963 		goto done;
964 
965 	timo = 0;
966 	if (tv && itimerfix(tv)) {
967 		error = EINVAL;
968 		goto done;
969 	}
970 	if (mask != NULL)
971 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
972 
973  retry:
974 	ncoll = nselcoll;
975 	l->l_flag |= L_SELECT;
976 	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
977 	if (error || *retval)
978 		goto done;
979 	if (tv) {
980 		/*
981 		 * We have to recalculate the timeout on every retry.
982 		 */
983 		timo = tvtohz(tv);
984 		if (timo <= 0)
985 			goto done;
986 	}
987 	s = splsched();
988 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
989 		splx(s);
990 		goto retry;
991 	}
992 	l->l_flag &= ~L_SELECT;
993 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
994 	splx(s);
995 	if (error == 0)
996 		goto retry;
997  done:
998 	if (mask != NULL)
999 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
1000 	l->l_flag &= ~L_SELECT;
1001 	/* poll is not restarted after signals... */
1002 	if (error == ERESTART)
1003 		error = EINTR;
1004 	if (error == EWOULDBLOCK)
1005 		error = 0;
1006 	if (error == 0) {
1007 		error = copyout(bits, u_fds, ni);
1008 		if (error)
1009 			goto out;
1010 	}
1011  out:
1012 	if (ni > sizeof(smallbits))
1013 		free(bits, M_TEMP);
1014 	return (error);
1015 }
1016 
1017 int
1018 pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
1019 {
1020 	struct proc	*p = l->l_proc;
1021 	struct filedesc	*fdp;
1022 	int		i, n;
1023 	struct file	*fp;
1024 
1025 	fdp = p->p_fd;
1026 	n = 0;
1027 	for (i = 0; i < nfd; i++, fds++) {
1028 		if (fds->fd >= fdp->fd_nfiles) {
1029 			fds->revents = POLLNVAL;
1030 			n++;
1031 		} else if (fds->fd < 0) {
1032 			fds->revents = 0;
1033 		} else {
1034 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1035 				fds->revents = POLLNVAL;
1036 				n++;
1037 			} else {
1038 				FILE_USE(fp);
1039 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1040 				    fds->events | POLLERR | POLLHUP, l);
1041 				if (fds->revents != 0)
1042 					n++;
1043 				FILE_UNUSE(fp, l);
1044 			}
1045 		}
1046 	}
1047 	*retval = n;
1048 	return (0);
1049 }
1050 
1051 /*ARGSUSED*/
1052 int
1053 seltrue(dev_t dev, int events, struct lwp *l)
1054 {
1055 
1056 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1057 }
1058 
1059 /*
1060  * Record a select request.
1061  */
void
selrecord(struct lwp *selector, struct selinfo *sip)
{
	struct lwp	*l;
	struct proc	*p;
	pid_t		mypid;

	mypid = selector->l_proc->p_pid;
	/* Our process is already recorded here: nothing to do. */
	if (sip->sel_pid == mypid)
		return;
	/*
	 * Another process is recorded.  If any of its LWPs is currently
	 * asleep on the shared select wait channel, flag a collision so
	 * that selwakeup() broadcasts instead of waking just one pid.
	 */
	if (sip->sel_pid && (p = pfind(sip->sel_pid))) {
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_wchan == (caddr_t)&selwait) {
				sip->sel_collision = 1;
				return;
			}
		}
	}

	/* Record ourselves as the (sole) selecting process. */
	sip->sel_pid = mypid;
}
1083 
1084 /*
1085  * Do a wakeup when a selectable event occurs.
1086  */
1087 void
1088 selwakeup(sip)
1089 	struct selinfo *sip;
1090 {
1091 	struct lwp *l;
1092 	struct proc *p;
1093 	int s;
1094 
1095 	if (sip->sel_pid == 0)
1096 		return;
1097 	if (sip->sel_collision) {
1098 		sip->sel_pid = 0;
1099 		nselcoll++;
1100 		sip->sel_collision = 0;
1101 		wakeup((caddr_t)&selwait);
1102 		return;
1103 	}
1104 	p = pfind(sip->sel_pid);
1105 	sip->sel_pid = 0;
1106 	if (p != NULL) {
1107 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1108 			SCHED_LOCK(s);
1109 			if (l->l_wchan == (caddr_t)&selwait) {
1110 				if (l->l_stat == LSSLEEP)
1111 					setrunnable(l);
1112 				else
1113 					unsleep(l);
1114 			} else if (l->l_flag & L_SELECT)
1115 				l->l_flag &= ~L_SELECT;
1116 			SCHED_UNLOCK(s);
1117 		}
1118 	}
1119 }
1120