xref: /netbsd-src/sys/kern/sys_generic.c (revision ce2c90c7c172d95d2402a5b3d96d8f8e6d138a21)
1 /*	$NetBSD: sys_generic.c,v 1.94 2006/10/13 16:53:36 dogcow Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.94 2006/10/13 16:53:36 dogcow Exp $");
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
68 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
69 
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct lwp *l, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct proc	*p;
86 	struct filedesc	*fdp;
87 
88 	fd = SCARG(uap, fd);
89 	p = l->l_proc;
90 	fdp = p->p_fd;
91 
92 	if ((fp = fd_getfile(fdp, fd)) == NULL)
93 		return (EBADF);
94 
95 	if ((fp->f_flag & FREAD) == 0) {
96 		simple_unlock(&fp->f_slock);
97 		return (EBADF);
98 	}
99 
100 	FILE_USE(fp);
101 
102 	/* dofileread() will unuse the descriptor for us */
103 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
104 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
105 }
106 
107 int
108 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
109 	off_t *offset, int flags, register_t *retval)
110 {
111 	struct iovec aiov;
112 	struct uio auio;
113 	struct proc *p;
114 	struct vmspace *vm;
115 	size_t cnt;
116 	int error;
117 #ifdef KTRACE
118 	struct iovec	ktriov = { .iov_base = NULL, };
119 #else
120 	do { if (&fd) {} } while (/* CONSTCOND */ 0); /* shut up -Wunused */
121 #endif
122 	p = l->l_proc;
123 
124 	error = proc_vmspace_getref(p, &vm);
125 	if (error) {
126 		goto out;
127 	}
128 
129 	aiov.iov_base = (caddr_t)buf;
130 	aiov.iov_len = nbyte;
131 	auio.uio_iov = &aiov;
132 	auio.uio_iovcnt = 1;
133 	auio.uio_resid = nbyte;
134 	auio.uio_rw = UIO_READ;
135 	auio.uio_vmspace = vm;
136 
137 	/*
138 	 * Reads return ssize_t because -1 is returned on error.  Therefore
139 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
140 	 * values.
141 	 */
142 	if (auio.uio_resid > SSIZE_MAX) {
143 		error = EINVAL;
144 		goto out;
145 	}
146 
147 #ifdef KTRACE
148 	/*
149 	 * if tracing, save a copy of iovec
150 	 */
151 	if (KTRPOINT(p, KTR_GENIO))
152 		ktriov = aiov;
153 #endif
154 	cnt = auio.uio_resid;
155 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
156 	if (error)
157 		if (auio.uio_resid != cnt && (error == ERESTART ||
158 		    error == EINTR || error == EWOULDBLOCK))
159 			error = 0;
160 	cnt -= auio.uio_resid;
161 #ifdef KTRACE
162 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
163 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
164 #endif
165 	*retval = cnt;
166  out:
167 	FILE_UNUSE(fp, l);
168 	uvmspace_free(vm);
169 	return (error);
170 }
171 
172 /*
173  * Scatter read system call.
174  */
175 int
176 sys_readv(struct lwp *l, void *v, register_t *retval)
177 {
178 	struct sys_readv_args /* {
179 		syscallarg(int)				fd;
180 		syscallarg(const struct iovec *)	iovp;
181 		syscallarg(int)				iovcnt;
182 	} */ *uap = v;
183 	struct filedesc	*fdp;
184 	struct file *fp;
185 	struct proc *p;
186 	int fd;
187 
188 	fd = SCARG(uap, fd);
189 	p = l->l_proc;
190 	fdp = p->p_fd;
191 
192 	if ((fp = fd_getfile(fdp, fd)) == NULL)
193 		return (EBADF);
194 
195 	if ((fp->f_flag & FREAD) == 0) {
196 		simple_unlock(&fp->f_slock);
197 		return (EBADF);
198 	}
199 
200 	FILE_USE(fp);
201 
202 	/* dofilereadv() will unuse the descriptor for us */
203 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
204 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
205 }
206 
207 int
208 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
209 	int iovcnt, off_t *offset, int flags, register_t *retval)
210 {
211 	struct proc *p;
212 	struct uio	auio;
213 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
214 	struct vmspace	*vm;
215 	int		i, error;
216 	size_t		cnt;
217 	u_int		iovlen;
218 #ifdef KTRACE
219 	struct iovec	*ktriov;
220 #else
221 	do { if (&fd) {} } while (/* CONSTCOND */ 0); /* shut up -Wunused */
222 #endif
223 
224 	p = l->l_proc;
225 	error = proc_vmspace_getref(p, &vm);
226 	if (error) {
227 		goto out;
228 	}
229 
230 #ifdef KTRACE
231 	ktriov = NULL;
232 #endif
233 	/* note: can't use iovlen until iovcnt is validated */
234 	iovlen = iovcnt * sizeof(struct iovec);
235 	if ((u_int)iovcnt > UIO_SMALLIOV) {
236 		if ((u_int)iovcnt > IOV_MAX) {
237 			error = EINVAL;
238 			goto out;
239 		}
240 		iov = malloc(iovlen, M_IOV, M_WAITOK);
241 		needfree = iov;
242 	} else if ((u_int)iovcnt > 0) {
243 		iov = aiov;
244 		needfree = NULL;
245 	} else {
246 		error = EINVAL;
247 		goto out;
248 	}
249 
250 	auio.uio_iov = iov;
251 	auio.uio_iovcnt = iovcnt;
252 	auio.uio_rw = UIO_READ;
253 	auio.uio_vmspace = vm;
254 	error = copyin(iovp, iov, iovlen);
255 	if (error)
256 		goto done;
257 	auio.uio_resid = 0;
258 	for (i = 0; i < iovcnt; i++) {
259 		auio.uio_resid += iov->iov_len;
260 		/*
261 		 * Reads return ssize_t because -1 is returned on error.
262 		 * Therefore we must restrict the length to SSIZE_MAX to
263 		 * avoid garbage return values.
264 		 */
265 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
266 			error = EINVAL;
267 			goto done;
268 		}
269 		iov++;
270 	}
271 #ifdef KTRACE
272 	/*
273 	 * if tracing, save a copy of iovec
274 	 */
275 	if (KTRPOINT(p, KTR_GENIO))  {
276 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
277 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
278 	}
279 #endif
280 	cnt = auio.uio_resid;
281 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
282 	if (error)
283 		if (auio.uio_resid != cnt && (error == ERESTART ||
284 		    error == EINTR || error == EWOULDBLOCK))
285 			error = 0;
286 	cnt -= auio.uio_resid;
287 #ifdef KTRACE
288 	if (ktriov != NULL) {
289 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
290 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
291 		free(ktriov, M_TEMP);
292 	}
293 #endif
294 	*retval = cnt;
295  done:
296 	if (needfree)
297 		free(needfree, M_IOV);
298  out:
299 	FILE_UNUSE(fp, l);
300 	uvmspace_free(vm);
301 	return (error);
302 }
303 
304 /*
305  * Write system call
306  */
307 int
308 sys_write(struct lwp *l, void *v, register_t *retval)
309 {
310 	struct sys_write_args /* {
311 		syscallarg(int)			fd;
312 		syscallarg(const void *)	buf;
313 		syscallarg(size_t)		nbyte;
314 	} */ *uap = v;
315 	int		fd;
316 	struct file	*fp;
317 	struct proc	*p;
318 	struct filedesc	*fdp;
319 
320 	fd = SCARG(uap, fd);
321 	p = l->l_proc;
322 	fdp = p->p_fd;
323 
324 	if ((fp = fd_getfile(fdp, fd)) == NULL)
325 		return (EBADF);
326 
327 	if ((fp->f_flag & FWRITE) == 0) {
328 		simple_unlock(&fp->f_slock);
329 		return (EBADF);
330 	}
331 
332 	FILE_USE(fp);
333 
334 	/* dofilewrite() will unuse the descriptor for us */
335 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
336 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
337 }
338 
339 int
340 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
341 	size_t nbyte, off_t *offset, int flags, register_t *retval)
342 {
343 	struct iovec aiov;
344 	struct uio auio;
345 	struct proc *p;
346 	struct vmspace *vm;
347 	size_t cnt;
348 	int error;
349 #ifdef KTRACE
350 	struct iovec	ktriov = { .iov_base = NULL, };
351 #else
352 	do { if (&fd) {} } while (/* CONSTCOND */ 0); /* shut up -Wunused */
353 #endif
354 
355 	p = l->l_proc;
356 	error = proc_vmspace_getref(p, &vm);
357 	if (error) {
358 		goto out;
359 	}
360 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
361 	aiov.iov_len = nbyte;
362 	auio.uio_iov = &aiov;
363 	auio.uio_iovcnt = 1;
364 	auio.uio_resid = nbyte;
365 	auio.uio_rw = UIO_WRITE;
366 	auio.uio_vmspace = vm;
367 
368 	/*
369 	 * Writes return ssize_t because -1 is returned on error.  Therefore
370 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
371 	 * values.
372 	 */
373 	if (auio.uio_resid > SSIZE_MAX) {
374 		error = EINVAL;
375 		goto out;
376 	}
377 
378 #ifdef KTRACE
379 	/*
380 	 * if tracing, save a copy of iovec
381 	 */
382 	if (KTRPOINT(p, KTR_GENIO))
383 		ktriov = aiov;
384 #endif
385 	cnt = auio.uio_resid;
386 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
387 	if (error) {
388 		if (auio.uio_resid != cnt && (error == ERESTART ||
389 		    error == EINTR || error == EWOULDBLOCK))
390 			error = 0;
391 		if (error == EPIPE)
392 			psignal(p, SIGPIPE);
393 	}
394 	cnt -= auio.uio_resid;
395 #ifdef KTRACE
396 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
397 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
398 #endif
399 	*retval = cnt;
400  out:
401 	FILE_UNUSE(fp, l);
402 	uvmspace_free(vm);
403 	return (error);
404 }
405 
406 /*
407  * Gather write system call
408  */
409 int
410 sys_writev(struct lwp *l, void *v, register_t *retval)
411 {
412 	struct sys_writev_args /* {
413 		syscallarg(int)				fd;
414 		syscallarg(const struct iovec *)	iovp;
415 		syscallarg(int)				iovcnt;
416 	} */ *uap = v;
417 	int		fd;
418 	struct file	*fp;
419 	struct proc	*p;
420 	struct filedesc	*fdp;
421 
422 	fd = SCARG(uap, fd);
423 	p = l->l_proc;
424 	fdp = p->p_fd;
425 
426 	if ((fp = fd_getfile(fdp, fd)) == NULL)
427 		return (EBADF);
428 
429 	if ((fp->f_flag & FWRITE) == 0) {
430 		simple_unlock(&fp->f_slock);
431 		return (EBADF);
432 	}
433 
434 	FILE_USE(fp);
435 
436 	/* dofilewritev() will unuse the descriptor for us */
437 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
438 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
439 }
440 
441 int
442 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
443 	int iovcnt, off_t *offset, int flags, register_t *retval)
444 {
445 	struct proc	*p;
446 	struct uio	auio;
447 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
448 	struct vmspace	*vm;
449 	int		i, error;
450 	size_t		cnt;
451 	u_int		iovlen;
452 #ifdef KTRACE
453 	struct iovec	*ktriov;
454 #else
455 	do { if (&fd) {} } while (/* CONSTCOND */ 0); /* shut up -Wunused */
456 #endif
457 
458 	p = l->l_proc;
459 	error = proc_vmspace_getref(p, &vm);
460 	if (error) {
461 		goto out;
462 	}
463 #ifdef KTRACE
464 	ktriov = NULL;
465 #endif
466 	/* note: can't use iovlen until iovcnt is validated */
467 	iovlen = iovcnt * sizeof(struct iovec);
468 	if ((u_int)iovcnt > UIO_SMALLIOV) {
469 		if ((u_int)iovcnt > IOV_MAX) {
470 			error = EINVAL;
471 			goto out;
472 		}
473 		iov = malloc(iovlen, M_IOV, M_WAITOK);
474 		needfree = iov;
475 	} else if ((u_int)iovcnt > 0) {
476 		iov = aiov;
477 		needfree = NULL;
478 	} else {
479 		error = EINVAL;
480 		goto out;
481 	}
482 
483 	auio.uio_iov = iov;
484 	auio.uio_iovcnt = iovcnt;
485 	auio.uio_rw = UIO_WRITE;
486 	auio.uio_vmspace = vm;
487 	error = copyin(iovp, iov, iovlen);
488 	if (error)
489 		goto done;
490 	auio.uio_resid = 0;
491 	for (i = 0; i < iovcnt; i++) {
492 		auio.uio_resid += iov->iov_len;
493 		/*
494 		 * Writes return ssize_t because -1 is returned on error.
495 		 * Therefore we must restrict the length to SSIZE_MAX to
496 		 * avoid garbage return values.
497 		 */
498 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
499 			error = EINVAL;
500 			goto done;
501 		}
502 		iov++;
503 	}
504 #ifdef KTRACE
505 	/*
506 	 * if tracing, save a copy of iovec
507 	 */
508 	if (KTRPOINT(p, KTR_GENIO))  {
509 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
510 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
511 	}
512 #endif
513 	cnt = auio.uio_resid;
514 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
515 	if (error) {
516 		if (auio.uio_resid != cnt && (error == ERESTART ||
517 		    error == EINTR || error == EWOULDBLOCK))
518 			error = 0;
519 		if (error == EPIPE)
520 			psignal(p, SIGPIPE);
521 	}
522 	cnt -= auio.uio_resid;
523 #ifdef KTRACE
524 	if (ktriov != NULL) {
525 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
526 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
527 		free(ktriov, M_TEMP);
528 	}
529 #endif
530 	*retval = cnt;
531  done:
532 	if (needfree)
533 		free(needfree, M_IOV);
534  out:
535 	FILE_UNUSE(fp, l);
536 	uvmspace_free(vm);
537 	return (error);
538 }
539 
540 /*
541  * Ioctl system call
542  */
543 /* ARGSUSED */
/*
 * Dispatch an ioctl request: validate the descriptor, marshal the
 * argument between user and kernel space according to the IOC_IN/
 * IOC_OUT/IOC_VOID encoding in the command word, and hand the request
 * to the file's fo_ioctl op.
 */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval __unused)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(caddr_t)	data;
	} */ *uap = v;
	struct file	*fp;
	struct proc	*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;		/* argument size encoded in com */
	caddr_t		data, memp;	/* arg buffer; memp != NULL if heap */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* "out" may print com in its diagnostic; keep it defined. */
		com = 0;
		goto out;
	}

	/* Close-on-exec flags live in the fd layer; no file op needed. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	/* Small arguments use the stack buffer; large ones are malloc'ed. */
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				goto out;
			}
#ifdef KTRACE
			/* trace the user->kernel transfer as a "write" */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* zero-size IOC_IN: pass the pointer itself */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* keep f_flag in sync before notifying the file op */
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			/* trace the kernel->user transfer as a "read" */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
 out:
	FILE_UNUSE(fp, l);
	/* Map driver-internal result codes to what userland expects. */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
687 
/* Sleep channel for select()/poll() waiters, and collision counter. */
int	selwait, nselcoll;
689 
690 /*
691  * Select system call.
692  */
693 int
694 sys_pselect(struct lwp *l, void *v, register_t *retval)
695 {
696 	struct sys_pselect_args /* {
697 		syscallarg(int)				nd;
698 		syscallarg(fd_set *)			in;
699 		syscallarg(fd_set *)			ou;
700 		syscallarg(fd_set *)			ex;
701 		syscallarg(const struct timespec *)	ts;
702 		syscallarg(sigset_t *)			mask;
703 	} */ * const uap = v;
704 	struct timespec	ats;
705 	struct timeval	atv, *tv = NULL;
706 	sigset_t	amask, *mask = NULL;
707 	int		error;
708 
709 	if (SCARG(uap, ts)) {
710 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
711 		if (error)
712 			return error;
713 		atv.tv_sec = ats.tv_sec;
714 		atv.tv_usec = ats.tv_nsec / 1000;
715 		tv = &atv;
716 	}
717 	if (SCARG(uap, mask) != NULL) {
718 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
719 		if (error)
720 			return error;
721 		mask = &amask;
722 	}
723 
724 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
725 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
726 }
727 
/*
 * Validate a timeout and sample the current monotonic time into
 * *sleeptv so gettimeleft() can later subtract the time already
 * slept.  Returns -1 if the timeout is invalid, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
736 
737 int
738 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
739 {
740 	/*
741 	 * We have to recalculate the timeout on every retry.
742 	 */
743 	struct timeval slepttv;
744 	/*
745 	 * reduce tv by elapsed time
746 	 * based on monotonic time scale
747 	 */
748 	getmicrouptime(&slepttv);
749 	timeradd(tv, sleeptv, tv);
750 	timersub(tv, &slepttv, tv);
751 	*sleeptv = slepttv;
752 	return tvtohz(tv);
753 }
754 
755 int
756 sys_select(struct lwp *l, void *v, register_t *retval)
757 {
758 	struct sys_select_args /* {
759 		syscallarg(int)			nd;
760 		syscallarg(fd_set *)		in;
761 		syscallarg(fd_set *)		ou;
762 		syscallarg(fd_set *)		ex;
763 		syscallarg(struct timeval *)	tv;
764 	} */ * const uap = v;
765 	struct timeval atv, *tv = NULL;
766 	int error;
767 
768 	if (SCARG(uap, tv)) {
769 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
770 			sizeof(atv));
771 		if (error)
772 			return error;
773 		tv = &atv;
774 	}
775 
776 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
777 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
778 }
779 
780 int
781 selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
782 	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
783 {
784 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
785 			    sizeof(fd_mask) * 6];
786 	struct proc	* const p = l->l_proc;
787 	caddr_t		bits;
788 	int		s, ncoll, error, timo;
789 	size_t		ni;
790 	sigset_t	oldmask;
791 	struct timeval  sleeptv;
792 
793 	error = 0;
794 	if (nd < 0)
795 		return (EINVAL);
796 	if (nd > p->p_fd->fd_nfiles) {
797 		/* forgiving; slightly wrong */
798 		nd = p->p_fd->fd_nfiles;
799 	}
800 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
801 	if (ni * 6 > sizeof(smallbits))
802 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
803 	else
804 		bits = smallbits;
805 
806 #define	getbits(name, x)						\
807 	if (u_ ## name) {						\
808 		error = copyin(u_ ## name, bits + ni * x, ni);		\
809 		if (error)						\
810 			goto done;					\
811 	} else								\
812 		memset(bits + ni * x, 0, ni);
813 	getbits(in, 0);
814 	getbits(ou, 1);
815 	getbits(ex, 2);
816 #undef	getbits
817 
818 	timo = 0;
819 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
820 		error = EINVAL;
821 		goto done;
822 	}
823 
824 	if (mask)
825 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
826 
827  retry:
828 	ncoll = nselcoll;
829 	l->l_flag |= L_SELECT;
830 	error = selscan(l, (fd_mask *)(bits + ni * 0),
831 			   (fd_mask *)(bits + ni * 3), nd, retval);
832 	if (error || *retval)
833 		goto done;
834 	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
835 		goto done;
836 	s = splsched();
837 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
838 		splx(s);
839 		goto retry;
840 	}
841 	l->l_flag &= ~L_SELECT;
842 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
843 	splx(s);
844 	if (error == 0)
845 		goto retry;
846  done:
847 	if (mask)
848 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
849 	l->l_flag &= ~L_SELECT;
850 	/* select is not restarted after signals... */
851 	if (error == ERESTART)
852 		error = EINTR;
853 	if (error == EWOULDBLOCK)
854 		error = 0;
855 	if (error == 0) {
856 
857 #define	putbits(name, x)						\
858 		if (u_ ## name) {					\
859 			error = copyout(bits + ni * x, u_ ## name, ni); \
860 			if (error)					\
861 				goto out;				\
862 		}
863 		putbits(in, 3);
864 		putbits(ou, 4);
865 		putbits(ex, 5);
866 #undef putbits
867 	}
868  out:
869 	if (ni * 6 > sizeof(smallbits))
870 		free(bits, M_TEMP);
871 	return (error);
872 }
873 
/*
 * Scan the three input descriptor maps (read/write/except, laid out
 * back to back at ibitp) and fill the corresponding output maps at
 * obitp with the descriptors that are ready.  *retval is set to the
 * number of ready bits found; returns EBADF if a set bit names a
 * closed descriptor.
 */
int
selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* poll(2) events that satisfy select's read/write/except sets */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	struct proc *p = l->l_proc;
	struct filedesc	*fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* visit each set bit, lowest first, via ffs() */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
910 
911 /*
912  * Poll system call.
913  */
914 int
915 sys_poll(struct lwp *l, void *v, register_t *retval)
916 {
917 	struct sys_poll_args /* {
918 		syscallarg(struct pollfd *)	fds;
919 		syscallarg(u_int)		nfds;
920 		syscallarg(int)			timeout;
921 	} */ * const uap = v;
922 	struct timeval	atv, *tv = NULL;
923 
924 	if (SCARG(uap, timeout) != INFTIM) {
925 		atv.tv_sec = SCARG(uap, timeout) / 1000;
926 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
927 		tv = &atv;
928 	}
929 
930 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
931 		tv, NULL);
932 }
933 
934 /*
935  * Poll system call.
936  */
937 int
938 sys_pollts(struct lwp *l, void *v, register_t *retval)
939 {
940 	struct sys_pollts_args /* {
941 		syscallarg(struct pollfd *)		fds;
942 		syscallarg(u_int)			nfds;
943 		syscallarg(const struct timespec *)	ts;
944 		syscallarg(const sigset_t *)		mask;
945 	} */ * const uap = v;
946 	struct timespec	ats;
947 	struct timeval	atv, *tv = NULL;
948 	sigset_t	amask, *mask = NULL;
949 	int		error;
950 
951 	if (SCARG(uap, ts)) {
952 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
953 		if (error)
954 			return error;
955 		atv.tv_sec = ats.tv_sec;
956 		atv.tv_usec = ats.tv_nsec / 1000;
957 		tv = &atv;
958 	}
959 	if (SCARG(uap, mask)) {
960 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
961 		if (error)
962 			return error;
963 		mask = &amask;
964 	}
965 
966 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
967 		tv, mask);
968 }
969 
970 int
971 pollcommon(struct lwp *l, register_t *retval,
972 	struct pollfd *u_fds, u_int nfds,
973 	struct timeval *tv, sigset_t *mask)
974 {
975 	char		smallbits[32 * sizeof(struct pollfd)];
976 	struct proc	* const p = l->l_proc;
977 	caddr_t		bits;
978 	sigset_t	oldmask;
979 	int		s, ncoll, error, timo;
980 	size_t		ni;
981 	struct timeval	sleeptv;
982 
983 	if (nfds > p->p_fd->fd_nfiles) {
984 		/* forgiving; slightly wrong */
985 		nfds = p->p_fd->fd_nfiles;
986 	}
987 	ni = nfds * sizeof(struct pollfd);
988 	if (ni > sizeof(smallbits))
989 		bits = malloc(ni, M_TEMP, M_WAITOK);
990 	else
991 		bits = smallbits;
992 
993 	error = copyin(u_fds, bits, ni);
994 	if (error)
995 		goto done;
996 
997 	timo = 0;
998 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
999 		error = EINVAL;
1000 		goto done;
1001 	}
1002 
1003 	if (mask != NULL)
1004 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
1005 
1006  retry:
1007 	ncoll = nselcoll;
1008 	l->l_flag |= L_SELECT;
1009 	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
1010 	if (error || *retval)
1011 		goto done;
1012 	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
1013 		goto done;
1014 	s = splsched();
1015 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
1016 		splx(s);
1017 		goto retry;
1018 	}
1019 	l->l_flag &= ~L_SELECT;
1020 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
1021 	splx(s);
1022 	if (error == 0)
1023 		goto retry;
1024  done:
1025 	if (mask != NULL)
1026 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
1027 	l->l_flag &= ~L_SELECT;
1028 	/* poll is not restarted after signals... */
1029 	if (error == ERESTART)
1030 		error = EINTR;
1031 	if (error == EWOULDBLOCK)
1032 		error = 0;
1033 	if (error == 0) {
1034 		error = copyout(bits, u_fds, ni);
1035 		if (error)
1036 			goto out;
1037 	}
1038  out:
1039 	if (ni > sizeof(smallbits))
1040 		free(bits, M_TEMP);
1041 	return (error);
1042 }
1043 
1044 int
1045 pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
1046 {
1047 	struct proc	*p = l->l_proc;
1048 	struct filedesc	*fdp;
1049 	int		i, n;
1050 	struct file	*fp;
1051 
1052 	fdp = p->p_fd;
1053 	n = 0;
1054 	for (i = 0; i < nfd; i++, fds++) {
1055 		if (fds->fd >= fdp->fd_nfiles) {
1056 			fds->revents = POLLNVAL;
1057 			n++;
1058 		} else if (fds->fd < 0) {
1059 			fds->revents = 0;
1060 		} else {
1061 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1062 				fds->revents = POLLNVAL;
1063 				n++;
1064 			} else {
1065 				FILE_USE(fp);
1066 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1067 				    fds->events | POLLERR | POLLHUP, l);
1068 				if (fds->revents != 0)
1069 					n++;
1070 				FILE_UNUSE(fp, l);
1071 			}
1072 		}
1073 	}
1074 	*retval = n;
1075 	return (0);
1076 }
1077 
1078 /*ARGSUSED*/
1079 int
1080 seltrue(dev_t dev __unused, int events, struct lwp *l __unused)
1081 {
1082 
1083 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1084 }
1085 
1086 /*
1087  * Record a select request.
1088  */
void
selrecord(struct lwp *selector, struct selinfo *sip)
{
	struct lwp	*l;
	struct proc	*p;
	pid_t		mypid;

	mypid = selector->l_proc->p_pid;
	/* Our process is already recorded: nothing to do. */
	if (sip->sel_pid == mypid)
		return;
	/*
	 * Another process is recorded.  If any of its LWPs is still
	 * sleeping on selwait, mark a collision so that selwakeup()
	 * wakes everyone instead of just the recorded pid.
	 */
	if (sip->sel_pid && (p = pfind(sip->sel_pid))) {
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_wchan == (caddr_t)&selwait) {
				sip->sel_collision = 1;
				return;
			}
		}
	}

	/* Slot was free (or stale): record ourselves. */
	sip->sel_pid = mypid;
}
1110 
1111 /*
1112  * Do a wakeup when a selectable event occurs.
1113  */
1114 void
1115 selwakeup(sip)
1116 	struct selinfo *sip;
1117 {
1118 	struct lwp *l;
1119 	struct proc *p;
1120 	int s;
1121 
1122 	if (sip->sel_pid == 0)
1123 		return;
1124 	if (sip->sel_collision) {
1125 		sip->sel_pid = 0;
1126 		nselcoll++;
1127 		sip->sel_collision = 0;
1128 		wakeup((caddr_t)&selwait);
1129 		return;
1130 	}
1131 	p = pfind(sip->sel_pid);
1132 	sip->sel_pid = 0;
1133 	if (p != NULL) {
1134 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1135 			SCHED_LOCK(s);
1136 			if (l->l_wchan == (caddr_t)&selwait) {
1137 				if (l->l_stat == LSSLEEP)
1138 					setrunnable(l);
1139 				else
1140 					unsleep(l);
1141 			} else if (l->l_flag & L_SELECT)
1142 				l->l_flag &= ~L_SELECT;
1143 			SCHED_UNLOCK(s);
1144 		}
1145 	}
1146 }
1147