xref: /netbsd-src/sys/kern/sys_generic.c (revision 366e81869c056a2c569cba90e971c65ad4d009e6)
1 /*	$NetBSD: sys_generic.c,v 1.61 2002/03/17 19:41:07 atatat Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
41  */
42 
43 #include <sys/cdefs.h>
44 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.61 2002/03/17 19:41:07 atatat Exp $");
45 
46 #include "opt_ktrace.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/filedesc.h>
51 #include <sys/ioctl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/socketvar.h>
55 #include <sys/signalvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/stat.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 
65 #include <sys/mount.h>
66 #include <sys/syscallargs.h>
67 
68 int selscan __P((struct proc *, fd_mask *, fd_mask *, int, register_t *));
69 int pollscan __P((struct proc *, struct pollfd *, int, register_t *));
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct proc *p, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct filedesc	*fdp;
86 
87 	fd = SCARG(uap, fd);
88 	fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 
93 	if ((fp->f_flag & FREAD) == 0)
94 		return (EBADF);
95 
96 	FILE_USE(fp);
97 
98 	/* dofileread() will unuse the descriptor for us */
99 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
100 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
101 }
102 
103 int
104 dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
105 	off_t *offset, int flags, register_t *retval)
106 {
107 	struct uio	auio;
108 	struct iovec	aiov;
109 	long		cnt, error;
110 #ifdef KTRACE
111 	struct iovec	ktriov;
112 #endif
113 	error = 0;
114 
115 	aiov.iov_base = (caddr_t)buf;
116 	aiov.iov_len = nbyte;
117 	auio.uio_iov = &aiov;
118 	auio.uio_iovcnt = 1;
119 	auio.uio_resid = nbyte;
120 	auio.uio_rw = UIO_READ;
121 	auio.uio_segflg = UIO_USERSPACE;
122 	auio.uio_procp = p;
123 
124 	/*
125 	 * Reads return ssize_t because -1 is returned on error.  Therefore
126 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
127 	 * values.
128 	 */
129 	if (auio.uio_resid > SSIZE_MAX) {
130 		error = EINVAL;
131 		goto out;
132 	}
133 
134 #ifdef KTRACE
135 	/*
136 	 * if tracing, save a copy of iovec
137 	 */
138 	if (KTRPOINT(p, KTR_GENIO))
139 		ktriov = aiov;
140 #endif
141 	cnt = auio.uio_resid;
142 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
143 	if (error)
144 		if (auio.uio_resid != cnt && (error == ERESTART ||
145 		    error == EINTR || error == EWOULDBLOCK))
146 			error = 0;
147 	cnt -= auio.uio_resid;
148 #ifdef KTRACE
149 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
150 		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
151 #endif
152 	*retval = cnt;
153  out:
154 	FILE_UNUSE(fp, p);
155 	return (error);
156 }
157 
158 /*
159  * Scatter read system call.
160  */
161 int
162 sys_readv(struct proc *p, void *v, register_t *retval)
163 {
164 	struct sys_readv_args /* {
165 		syscallarg(int)				fd;
166 		syscallarg(const struct iovec *)	iovp;
167 		syscallarg(int)				iovcnt;
168 	} */ *uap = v;
169 	int		fd;
170 	struct file	*fp;
171 	struct filedesc	*fdp;
172 
173 	fd = SCARG(uap, fd);
174 	fdp = p->p_fd;
175 
176 	if ((fp = fd_getfile(fdp, fd)) == NULL)
177 		return (EBADF);
178 
179 	if ((fp->f_flag & FREAD) == 0)
180 		return (EBADF);
181 
182 	FILE_USE(fp);
183 
184 	/* dofilereadv() will unuse the descriptor for us */
185 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
186 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
187 }
188 
189 int
190 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
191 	int iovcnt, off_t *offset, int flags, register_t *retval)
192 {
193 	struct uio	auio;
194 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
195 	long		i, cnt, error;
196 	u_int		iovlen;
197 #ifdef KTRACE
198 	struct iovec	*ktriov;
199 #endif
200 
201 	error = 0;
202 #ifdef KTRACE
203 	ktriov = NULL;
204 #endif
205 	/* note: can't use iovlen until iovcnt is validated */
206 	iovlen = iovcnt * sizeof(struct iovec);
207 	if ((u_int)iovcnt > UIO_SMALLIOV) {
208 		if ((u_int)iovcnt > IOV_MAX) {
209 			error = EINVAL;
210 			goto out;
211 		}
212 		iov = malloc(iovlen, M_IOV, M_WAITOK);
213 		needfree = iov;
214 	} else if ((u_int)iovcnt > 0) {
215 		iov = aiov;
216 		needfree = NULL;
217 	} else {
218 		error = EINVAL;
219 		goto out;
220 	}
221 
222 	auio.uio_iov = iov;
223 	auio.uio_iovcnt = iovcnt;
224 	auio.uio_rw = UIO_READ;
225 	auio.uio_segflg = UIO_USERSPACE;
226 	auio.uio_procp = p;
227 	error = copyin(iovp, iov, iovlen);
228 	if (error)
229 		goto done;
230 	auio.uio_resid = 0;
231 	for (i = 0; i < iovcnt; i++) {
232 		auio.uio_resid += iov->iov_len;
233 		/*
234 		 * Reads return ssize_t because -1 is returned on error.
235 		 * Therefore we must restrict the length to SSIZE_MAX to
236 		 * avoid garbage return values.
237 		 */
238 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
239 			error = EINVAL;
240 			goto done;
241 		}
242 		iov++;
243 	}
244 #ifdef KTRACE
245 	/*
246 	 * if tracing, save a copy of iovec
247 	 */
248 	if (KTRPOINT(p, KTR_GENIO))  {
249 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
250 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
251 	}
252 #endif
253 	cnt = auio.uio_resid;
254 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
255 	if (error)
256 		if (auio.uio_resid != cnt && (error == ERESTART ||
257 		    error == EINTR || error == EWOULDBLOCK))
258 			error = 0;
259 	cnt -= auio.uio_resid;
260 #ifdef KTRACE
261 	if (ktriov != NULL) {
262 		if (error == 0)
263 			ktrgenio(p, fd, UIO_READ, ktriov, cnt, error);
264 		free(ktriov, M_TEMP);
265 	}
266 #endif
267 	*retval = cnt;
268  done:
269 	if (needfree)
270 		free(needfree, M_IOV);
271  out:
272 	FILE_UNUSE(fp, p);
273 	return (error);
274 }
275 
276 /*
277  * Write system call
278  */
279 int
280 sys_write(struct proc *p, void *v, register_t *retval)
281 {
282 	struct sys_write_args /* {
283 		syscallarg(int)			fd;
284 		syscallarg(const void *)	buf;
285 		syscallarg(size_t)		nbyte;
286 	} */ *uap = v;
287 	int		fd;
288 	struct file	*fp;
289 	struct filedesc	*fdp;
290 
291 	fd = SCARG(uap, fd);
292 	fdp = p->p_fd;
293 
294 	if ((fp = fd_getfile(fdp, fd)) == NULL)
295 		return (EBADF);
296 
297 	if ((fp->f_flag & FWRITE) == 0)
298 		return (EBADF);
299 
300 	FILE_USE(fp);
301 
302 	/* dofilewrite() will unuse the descriptor for us */
303 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
304 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
305 }
306 
307 int
308 dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
309 	size_t nbyte, off_t *offset, int flags, register_t *retval)
310 {
311 	struct uio	auio;
312 	struct iovec	aiov;
313 	long		cnt, error;
314 #ifdef KTRACE
315 	struct iovec	ktriov;
316 #endif
317 
318 	error = 0;
319 	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
320 	aiov.iov_len = nbyte;
321 	auio.uio_iov = &aiov;
322 	auio.uio_iovcnt = 1;
323 	auio.uio_resid = nbyte;
324 	auio.uio_rw = UIO_WRITE;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	auio.uio_procp = p;
327 
328 	/*
329 	 * Writes return ssize_t because -1 is returned on error.  Therefore
330 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
331 	 * values.
332 	 */
333 	if (auio.uio_resid > SSIZE_MAX) {
334 		error = EINVAL;
335 		goto out;
336 	}
337 
338 #ifdef KTRACE
339 	/*
340 	 * if tracing, save a copy of iovec
341 	 */
342 	if (KTRPOINT(p, KTR_GENIO))
343 		ktriov = aiov;
344 #endif
345 	cnt = auio.uio_resid;
346 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
347 	if (error) {
348 		if (auio.uio_resid != cnt && (error == ERESTART ||
349 		    error == EINTR || error == EWOULDBLOCK))
350 			error = 0;
351 		if (error == EPIPE)
352 			psignal(p, SIGPIPE);
353 	}
354 	cnt -= auio.uio_resid;
355 #ifdef KTRACE
356 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
357 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
358 #endif
359 	*retval = cnt;
360  out:
361 	FILE_UNUSE(fp, p);
362 	return (error);
363 }
364 
365 /*
366  * Gather write system call
367  */
368 int
369 sys_writev(struct proc *p, void *v, register_t *retval)
370 {
371 	struct sys_writev_args /* {
372 		syscallarg(int)				fd;
373 		syscallarg(const struct iovec *)	iovp;
374 		syscallarg(int)				iovcnt;
375 	} */ *uap = v;
376 	int		fd;
377 	struct file	*fp;
378 	struct filedesc	*fdp;
379 
380 	fd = SCARG(uap, fd);
381 	fdp = p->p_fd;
382 
383 	if ((fp = fd_getfile(fdp, fd)) == NULL)
384 		return (EBADF);
385 
386 	if ((fp->f_flag & FWRITE) == 0)
387 		return (EBADF);
388 
389 	FILE_USE(fp);
390 
391 	/* dofilewritev() will unuse the descriptor for us */
392 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
393 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
394 }
395 
396 int
397 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
398 	int iovcnt, off_t *offset, int flags, register_t *retval)
399 {
400 	struct uio	auio;
401 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
402 	long		i, cnt, error;
403 	u_int		iovlen;
404 #ifdef KTRACE
405 	struct iovec	*ktriov;
406 #endif
407 
408 	error = 0;
409 #ifdef KTRACE
410 	ktriov = NULL;
411 #endif
412 	/* note: can't use iovlen until iovcnt is validated */
413 	iovlen = iovcnt * sizeof(struct iovec);
414 	if ((u_int)iovcnt > UIO_SMALLIOV) {
415 		if ((u_int)iovcnt > IOV_MAX)
416 			return (EINVAL);
417 		iov = malloc(iovlen, M_IOV, M_WAITOK);
418 		needfree = iov;
419 	} else if ((u_int)iovcnt > 0) {
420 		iov = aiov;
421 		needfree = NULL;
422 	} else {
423 		error = EINVAL;
424 		goto out;
425 	}
426 
427 	auio.uio_iov = iov;
428 	auio.uio_iovcnt = iovcnt;
429 	auio.uio_rw = UIO_WRITE;
430 	auio.uio_segflg = UIO_USERSPACE;
431 	auio.uio_procp = p;
432 	error = copyin(iovp, iov, iovlen);
433 	if (error)
434 		goto done;
435 	auio.uio_resid = 0;
436 	for (i = 0; i < iovcnt; i++) {
437 		auio.uio_resid += iov->iov_len;
438 		/*
439 		 * Writes return ssize_t because -1 is returned on error.
440 		 * Therefore we must restrict the length to SSIZE_MAX to
441 		 * avoid garbage return values.
442 		 */
443 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
444 			error = EINVAL;
445 			goto done;
446 		}
447 		iov++;
448 	}
449 #ifdef KTRACE
450 	/*
451 	 * if tracing, save a copy of iovec
452 	 */
453 	if (KTRPOINT(p, KTR_GENIO))  {
454 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
455 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
456 	}
457 #endif
458 	cnt = auio.uio_resid;
459 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
460 	if (error) {
461 		if (auio.uio_resid != cnt && (error == ERESTART ||
462 		    error == EINTR || error == EWOULDBLOCK))
463 			error = 0;
464 		if (error == EPIPE)
465 			psignal(p, SIGPIPE);
466 	}
467 	cnt -= auio.uio_resid;
468 #ifdef KTRACE
469 	if (KTRPOINT(p, KTR_GENIO))
470 		if (error == 0) {
471 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
472 		free(ktriov, M_TEMP);
473 	}
474 #endif
475 	*retval = cnt;
476  done:
477 	if (needfree)
478 		free(needfree, M_IOV);
479  out:
480 	FILE_UNUSE(fp, p);
481 	return (error);
482 }
483 
484 /*
485  * Ioctl system call
486  */
487 /* ARGSUSED */
488 int
489 sys_ioctl(struct proc *p, void *v, register_t *retval)
490 {
491 	struct sys_ioctl_args /* {
492 		syscallarg(int)		fd;
493 		syscallarg(u_long)	com;
494 		syscallarg(caddr_t)	data;
495 	} */ *uap = v;
496 	struct file	*fp;
497 	struct filedesc	*fdp;
498 	u_long		com;
499 	int		error;
500 	u_int		size;
501 	caddr_t		data, memp;
502 	int		tmp;
503 #define	STK_PARAMS	128
504 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
505 
506 	error = 0;
507 	fdp = p->p_fd;
508 
509 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
510 		return (EBADF);
511 
512 	FILE_USE(fp);
513 
514 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
515 		error = EBADF;
516 		goto out;
517 	}
518 
519 	switch (com = SCARG(uap, com)) {
520 	case FIONCLEX:
521 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
522 		goto out;
523 
524 	case FIOCLEX:
525 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
526 		goto out;
527 	}
528 
529 	/*
530 	 * Interpret high order word to find amount of data to be
531 	 * copied to/from the user's address space.
532 	 */
533 	size = IOCPARM_LEN(com);
534 	if (size > IOCPARM_MAX) {
535 		error = ENOTTY;
536 		goto out;
537 	}
538 	memp = NULL;
539 	if (size > sizeof(stkbuf)) {
540 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
541 		data = memp;
542 	} else
543 		data = (caddr_t)stkbuf;
544 	if (com&IOC_IN) {
545 		if (size) {
546 			error = copyin(SCARG(uap, data), data, size);
547 			if (error) {
548 				if (memp)
549 					free(memp, M_IOCTLOPS);
550 				goto out;
551 			}
552 		} else
553 			*(caddr_t *)data = SCARG(uap, data);
554 	} else if ((com&IOC_OUT) && size)
555 		/*
556 		 * Zero the buffer so the user always
557 		 * gets back something deterministic.
558 		 */
559 		memset(data, 0, size);
560 	else if (com&IOC_VOID)
561 		*(caddr_t *)data = SCARG(uap, data);
562 
563 	switch (com) {
564 
565 	case FIONBIO:
566 		if ((tmp = *(int *)data) != 0)
567 			fp->f_flag |= FNONBLOCK;
568 		else
569 			fp->f_flag &= ~FNONBLOCK;
570 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
571 		break;
572 
573 	case FIOASYNC:
574 		if ((tmp = *(int *)data) != 0)
575 			fp->f_flag |= FASYNC;
576 		else
577 			fp->f_flag &= ~FASYNC;
578 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
579 		break;
580 
581 	case FIOSETOWN:
582 		tmp = *(int *)data;
583 		if (fp->f_type == DTYPE_SOCKET) {
584 			((struct socket *)fp->f_data)->so_pgid = tmp;
585 			error = 0;
586 			break;
587 		}
588 		if (tmp <= 0) {
589 			tmp = -tmp;
590 		} else {
591 			struct proc *p1 = pfind(tmp);
592 			if (p1 == 0) {
593 				error = ESRCH;
594 				break;
595 			}
596 			tmp = p1->p_pgrp->pg_id;
597 		}
598 		error = (*fp->f_ops->fo_ioctl)
599 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
600 		break;
601 
602 	case FIOGETOWN:
603 		if (fp->f_type == DTYPE_SOCKET) {
604 			error = 0;
605 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
606 			break;
607 		}
608 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
609 		if (error == 0)
610 			*(int *)data = -*(int *)data;
611 		break;
612 
613 	default:
614 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
615 		/*
616 		 * Copy any data to user, size was
617 		 * already set and checked above.
618 		 */
619 		if (error == 0 && (com&IOC_OUT) && size)
620 			error = copyout(data, SCARG(uap, data), size);
621 		break;
622 	}
623 	if (memp)
624 		free(memp, M_IOCTLOPS);
625  out:
626 	FILE_UNUSE(fp, p);
627 	switch (error) {
628 	case -1:
629 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
630 		    "pid=%d comm=%s\n",
631 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
632 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
633 		    p->p_pid, p->p_comm);
634 		/* FALLTHROUGH */
635 	case EPASSTHROUGH:
636 		error = ENOTTY;
637 		/* FALLTHROUGH */
638 	default:
639 		return (error);
640 	}
641 }
642 
/*
 * selwait:  only its address matters; it is the wait channel on which
 *           sys_select() and sys_poll() sleep and selwakeup() wakes.
 * nselcoll: incremented by selwakeup() each time it processes a
 *           collision; sys_select()/sys_poll() compare snapshots of it
 *           to decide whether a rescan is needed.
 */
int	selwait;
int	nselcoll;
644 
645 /*
646  * Select system call.
647  */
648 int
649 sys_select(struct proc *p, void *v, register_t *retval)
650 {
651 	struct sys_select_args /* {
652 		syscallarg(int)			nd;
653 		syscallarg(fd_set *)		in;
654 		syscallarg(fd_set *)		ou;
655 		syscallarg(fd_set *)		ex;
656 		syscallarg(struct timeval *)	tv;
657 	} */ *uap = v;
658 	caddr_t		bits;
659 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
660 			    sizeof(fd_mask) * 6];
661 	struct		timeval atv;
662 	int		s, ncoll, error, timo;
663 	size_t		ni;
664 
665 	error = 0;
666 	if (SCARG(uap, nd) < 0)
667 		return (EINVAL);
668 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
669 		/* forgiving; slightly wrong */
670 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
671 	}
672 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
673 	if (ni * 6 > sizeof(smallbits))
674 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
675 	else
676 		bits = smallbits;
677 
678 #define	getbits(name, x)						\
679 	if (SCARG(uap, name)) {						\
680 		error = copyin(SCARG(uap, name), bits + ni * x, ni);	\
681 		if (error)						\
682 			goto done;					\
683 	} else								\
684 		memset(bits + ni * x, 0, ni);
685 	getbits(in, 0);
686 	getbits(ou, 1);
687 	getbits(ex, 2);
688 #undef	getbits
689 
690 	if (SCARG(uap, tv)) {
691 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
692 			sizeof(atv));
693 		if (error)
694 			goto done;
695 		if (itimerfix(&atv)) {
696 			error = EINVAL;
697 			goto done;
698 		}
699 		s = splclock();
700 		timeradd(&atv, &time, &atv);
701 		splx(s);
702 	} else
703 		timo = 0;
704  retry:
705 	ncoll = nselcoll;
706 	p->p_flag |= P_SELECT;
707 	error = selscan(p, (fd_mask *)(bits + ni * 0),
708 			   (fd_mask *)(bits + ni * 3), SCARG(uap, nd), retval);
709 	if (error || *retval)
710 		goto done;
711 	if (SCARG(uap, tv)) {
712 		/*
713 		 * We have to recalculate the timeout on every retry.
714 		 */
715 		timo = hzto(&atv);
716 		if (timo <= 0)
717 			goto done;
718 	}
719 	s = splsched();
720 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
721 		splx(s);
722 		goto retry;
723 	}
724 	p->p_flag &= ~P_SELECT;
725 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
726 	splx(s);
727 	if (error == 0)
728 		goto retry;
729  done:
730 	p->p_flag &= ~P_SELECT;
731 	/* select is not restarted after signals... */
732 	if (error == ERESTART)
733 		error = EINTR;
734 	if (error == EWOULDBLOCK)
735 		error = 0;
736 	if (error == 0) {
737 
738 #define	putbits(name, x)						\
739 		if (SCARG(uap, name)) {					\
740 			error = copyout(bits + ni * x, SCARG(uap, name), ni); \
741 			if (error)					\
742 				goto out;				\
743 		}
744 		putbits(in, 3);
745 		putbits(ou, 4);
746 		putbits(ex, 5);
747 #undef putbits
748 	}
749  out:
750 	if (ni * 6 > sizeof(smallbits))
751 		free(bits, M_TEMP);
752 	return (error);
753 }
754 
755 int
756 selscan(struct proc *p, fd_mask *ibitp, fd_mask *obitp, int nfd,
757 	register_t *retval)
758 {
759 	struct filedesc	*fdp;
760 	int		msk, i, j, fd, n;
761 	fd_mask		ibits, obits;
762 	struct file	*fp;
763 	static int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
764 			       POLLWRNORM | POLLHUP | POLLERR,
765 			       POLLRDBAND };
766 
767 	fdp = p->p_fd;
768 	n = 0;
769 	for (msk = 0; msk < 3; msk++) {
770 		for (i = 0; i < nfd; i += NFDBITS) {
771 			ibits = *ibitp++;
772 			obits = 0;
773 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
774 				ibits &= ~(1 << j);
775 				if ((fp = fd_getfile(fdp, fd)) == NULL)
776 					return (EBADF);
777 				FILE_USE(fp);
778 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
779 					obits |= (1 << j);
780 					n++;
781 				}
782 				FILE_UNUSE(fp, p);
783 			}
784 			*obitp++ = obits;
785 		}
786 	}
787 	*retval = n;
788 	return (0);
789 }
790 
791 /*
792  * Poll system call.
793  */
794 int
795 sys_poll(struct proc *p, void *v, register_t *retval)
796 {
797 	struct sys_poll_args /* {
798 		syscallarg(struct pollfd *)	fds;
799 		syscallarg(u_int)		nfds;
800 		syscallarg(int)			timeout;
801 	} */ *uap = v;
802 	caddr_t		bits;
803 	char		smallbits[32 * sizeof(struct pollfd)];
804 	struct timeval	atv;
805 	int		s, ncoll, error, timo;
806 	size_t		ni;
807 
808 	error = 0;
809 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
810 		/* forgiving; slightly wrong */
811 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
812 	}
813 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
814 	if (ni > sizeof(smallbits))
815 		bits = malloc(ni, M_TEMP, M_WAITOK);
816 	else
817 		bits = smallbits;
818 
819 	error = copyin(SCARG(uap, fds), bits, ni);
820 	if (error)
821 		goto done;
822 
823 	if (SCARG(uap, timeout) != INFTIM) {
824 		atv.tv_sec = SCARG(uap, timeout) / 1000;
825 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
826 		if (itimerfix(&atv)) {
827 			error = EINVAL;
828 			goto done;
829 		}
830 		s = splclock();
831 		timeradd(&atv, &time, &atv);
832 		splx(s);
833 	} else
834 		timo = 0;
835  retry:
836 	ncoll = nselcoll;
837 	p->p_flag |= P_SELECT;
838 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds), retval);
839 	if (error || *retval)
840 		goto done;
841 	if (SCARG(uap, timeout) != INFTIM) {
842 		/*
843 		 * We have to recalculate the timeout on every retry.
844 		 */
845 		timo = hzto(&atv);
846 		if (timo <= 0)
847 			goto done;
848 	}
849 	s = splsched();
850 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
851 		splx(s);
852 		goto retry;
853 	}
854 	p->p_flag &= ~P_SELECT;
855 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
856 	splx(s);
857 	if (error == 0)
858 		goto retry;
859  done:
860 	p->p_flag &= ~P_SELECT;
861 	/* poll is not restarted after signals... */
862 	if (error == ERESTART)
863 		error = EINTR;
864 	if (error == EWOULDBLOCK)
865 		error = 0;
866 	if (error == 0) {
867 		error = copyout(bits, SCARG(uap, fds), ni);
868 		if (error)
869 			goto out;
870 	}
871  out:
872 	if (ni > sizeof(smallbits))
873 		free(bits, M_TEMP);
874 	return (error);
875 }
876 
877 int
878 pollscan(struct proc *p, struct pollfd *fds, int nfd, register_t *retval)
879 {
880 	struct filedesc	*fdp;
881 	int		i, n;
882 	struct file	*fp;
883 
884 	fdp = p->p_fd;
885 	n = 0;
886 	for (i = 0; i < nfd; i++, fds++) {
887 		if (fds->fd >= fdp->fd_nfiles) {
888 			fds->revents = POLLNVAL;
889 			n++;
890 		} else if (fds->fd < 0) {
891 			fds->revents = 0;
892 		} else {
893 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
894 				fds->revents = POLLNVAL;
895 				n++;
896 			} else {
897 				FILE_USE(fp);
898 				fds->revents = (*fp->f_ops->fo_poll)(fp,
899 				    fds->events | POLLERR | POLLHUP, p);
900 				if (fds->revents != 0)
901 					n++;
902 				FILE_UNUSE(fp, p);
903 			}
904 		}
905 	}
906 	*retval = n;
907 	return (0);
908 }
909 
/*
 * Poll routine that reports the object as always ready for ordinary
 * input and output: of the requested events, the POLLIN, POLLOUT,
 * POLLRDNORM and POLLWRNORM bits are returned.  dev and p are unused.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int	always_ready = POLLIN | POLLOUT |
			    POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
917 
918 /*
919  * Record a select request.
920  */
921 void
922 selrecord(struct proc *selector, struct selinfo *sip)
923 {
924 	struct proc	*p;
925 	pid_t		mypid;
926 
927 	mypid = selector->p_pid;
928 	if (sip->si_pid == mypid)
929 		return;
930 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
931 	    p->p_wchan == (caddr_t)&selwait)
932 		sip->si_flags |= SI_COLL;
933 	else {
934 		sip->si_flags &= ~SI_COLL;
935 		sip->si_pid = mypid;
936 	}
937 }
938 
939 /*
940  * Do a wakeup when a selectable event occurs.
941  */
942 void
943 selwakeup(sip)
944 	struct selinfo *sip;
945 {
946 	struct proc *p;
947 	int s;
948 
949 	if (sip->si_pid == 0)
950 		return;
951 	if (sip->si_flags & SI_COLL) {
952 		nselcoll++;
953 		sip->si_flags &= ~SI_COLL;
954 		wakeup((caddr_t)&selwait);
955 	}
956 	p = pfind(sip->si_pid);
957 	sip->si_pid = 0;
958 	if (p != NULL) {
959 		SCHED_LOCK(s);
960 		if (p->p_wchan == (caddr_t)&selwait) {
961 			if (p->p_stat == SSLEEP)
962 				setrunnable(p);
963 			else
964 				unsleep(p);
965 		} else if (p->p_flag & P_SELECT)
966 			p->p_flag &= ~P_SELECT;
967 		SCHED_UNLOCK(s);
968 	}
969 }
970