xref: /netbsd-src/sys/kern/sys_generic.c (revision 1ca5c1b28139779176bd5c13ad7c5f25c0bcd5f8)
1 /*	$NetBSD: sys_generic.c,v 1.60 2001/11/14 18:43:58 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
41  */
42 
43 #include <sys/cdefs.h>
44 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.60 2001/11/14 18:43:58 christos Exp $");
45 
46 #include "opt_ktrace.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/filedesc.h>
51 #include <sys/ioctl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/socketvar.h>
55 #include <sys/signalvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/stat.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 
65 #include <sys/mount.h>
66 #include <sys/syscallargs.h>
67 
68 int selscan __P((struct proc *, fd_mask *, fd_mask *, int, register_t *));
69 int pollscan __P((struct proc *, struct pollfd *, int, register_t *));
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct proc *p, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct filedesc	*fdp;
86 
87 	fd = SCARG(uap, fd);
88 	fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 
93 	if ((fp->f_flag & FREAD) == 0)
94 		return (EBADF);
95 
96 	FILE_USE(fp);
97 
98 	/* dofileread() will unuse the descriptor for us */
99 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
100 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
101 }
102 
/*
 * Common code for read-style system calls: perform a single-iovec
 * read on an already-referenced file (see sys_read()).
 *
 * p      - calling process
 * fd     - descriptor number, used only for the ktrace record
 * fp     - file, already FILE_USE()'d by the caller
 * buf    - user-space destination buffer
 * nbyte  - number of bytes requested (must not exceed SSIZE_MAX)
 * offset - file offset to read at; the fileops read routine updates it
 *          when flags contains FOF_UPDATE_OFFSET
 * flags  - FOF_* flags for the fileops read routine
 * retval - set to the number of bytes actually transferred
 *
 * Always calls FILE_UNUSE() on fp before returning.
 */
int
dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
	off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	aiov;
	long		cnt, error;
#ifdef KTRACE
	struct iovec	ktriov;
#endif
	error = 0;

	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/*
		 * A partial transfer interrupted by a signal or a
		 * would-block condition is reported as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
157 
158 /*
159  * Scatter read system call.
160  */
161 int
162 sys_readv(struct proc *p, void *v, register_t *retval)
163 {
164 	struct sys_readv_args /* {
165 		syscallarg(int)				fd;
166 		syscallarg(const struct iovec *)	iovp;
167 		syscallarg(int)				iovcnt;
168 	} */ *uap = v;
169 	int		fd;
170 	struct file	*fp;
171 	struct filedesc	*fdp;
172 
173 	fd = SCARG(uap, fd);
174 	fdp = p->p_fd;
175 
176 	if ((fp = fd_getfile(fdp, fd)) == NULL)
177 		return (EBADF);
178 
179 	if ((fp->f_flag & FREAD) == 0)
180 		return (EBADF);
181 
182 	FILE_USE(fp);
183 
184 	/* dofilereadv() will unuse the descriptor for us */
185 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
186 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
187 }
188 
/*
 * Common code for readv-style system calls: perform a multi-iovec
 * read on an already-referenced file (see sys_readv()).
 *
 * The iovec array is copied in from user space; arrays of more than
 * UIO_SMALLIOV elements are held in a temporary malloc()'d buffer.
 * Always calls FILE_UNUSE() on fp before returning; *retval is set
 * to the number of bytes actually transferred.
 */
int
dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
	int iovcnt, off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
	long		i, cnt, error;
	u_int		iovlen;
#ifdef KTRACE
	struct iovec	*ktriov;
#endif

	error = 0;
#ifdef KTRACE
	ktriov = NULL;
#endif
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = malloc(iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 (and negative iovcnt wraps above IOV_MAX) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Total the transfer size, rejecting ssize_t overflow as we go. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/*
		 * A partial transfer interrupted by a signal or a
		 * would-block condition is reported as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	/* Log the traced transfer, then free the saved iovec copy. */
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt, error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
275 
276 /*
277  * Write system call
278  */
279 int
280 sys_write(struct proc *p, void *v, register_t *retval)
281 {
282 	struct sys_write_args /* {
283 		syscallarg(int)			fd;
284 		syscallarg(const void *)	buf;
285 		syscallarg(size_t)		nbyte;
286 	} */ *uap = v;
287 	int		fd;
288 	struct file	*fp;
289 	struct filedesc	*fdp;
290 
291 	fd = SCARG(uap, fd);
292 	fdp = p->p_fd;
293 
294 	if ((fp = fd_getfile(fdp, fd)) == NULL)
295 		return (EBADF);
296 
297 	if ((fp->f_flag & FWRITE) == 0)
298 		return (EBADF);
299 
300 	FILE_USE(fp);
301 
302 	/* dofilewrite() will unuse the descriptor for us */
303 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
304 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
305 }
306 
/*
 * Common code for write-style system calls: perform a single-iovec
 * write on an already-referenced file (see sys_write()).
 *
 * On EPIPE the calling process is sent SIGPIPE, matching write(2)
 * semantics.  Always calls FILE_UNUSE() on fp before returning;
 * *retval is set to the number of bytes actually transferred.
 */
int
dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	aiov;
	long		cnt, error;
#ifdef KTRACE
	struct iovec	ktriov;
#endif

	error = 0;
	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/*
		 * A partial transfer interrupted by a signal or a
		 * would-block condition is reported as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
364 
365 /*
366  * Gather write system call
367  */
368 int
369 sys_writev(struct proc *p, void *v, register_t *retval)
370 {
371 	struct sys_writev_args /* {
372 		syscallarg(int)				fd;
373 		syscallarg(const struct iovec *)	iovp;
374 		syscallarg(int)				iovcnt;
375 	} */ *uap = v;
376 	int		fd;
377 	struct file	*fp;
378 	struct filedesc	*fdp;
379 
380 	fd = SCARG(uap, fd);
381 	fdp = p->p_fd;
382 
383 	if ((fp = fd_getfile(fdp, fd)) == NULL)
384 		return (EBADF);
385 
386 	if ((fp->f_flag & FWRITE) == 0)
387 		return (EBADF);
388 
389 	FILE_USE(fp);
390 
391 	/* dofilewritev() will unuse the descriptor for us */
392 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
393 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
394 }
395 
396 int
397 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
398 	int iovcnt, off_t *offset, int flags, register_t *retval)
399 {
400 	struct uio	auio;
401 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
402 	long		i, cnt, error;
403 	u_int		iovlen;
404 #ifdef KTRACE
405 	struct iovec	*ktriov;
406 #endif
407 
408 	error = 0;
409 #ifdef KTRACE
410 	ktriov = NULL;
411 #endif
412 	/* note: can't use iovlen until iovcnt is validated */
413 	iovlen = iovcnt * sizeof(struct iovec);
414 	if ((u_int)iovcnt > UIO_SMALLIOV) {
415 		if ((u_int)iovcnt > IOV_MAX)
416 			return (EINVAL);
417 		iov = malloc(iovlen, M_IOV, M_WAITOK);
418 		needfree = iov;
419 	} else if ((u_int)iovcnt > 0) {
420 		iov = aiov;
421 		needfree = NULL;
422 	} else {
423 		error = EINVAL;
424 		goto out;
425 	}
426 
427 	auio.uio_iov = iov;
428 	auio.uio_iovcnt = iovcnt;
429 	auio.uio_rw = UIO_WRITE;
430 	auio.uio_segflg = UIO_USERSPACE;
431 	auio.uio_procp = p;
432 	error = copyin(iovp, iov, iovlen);
433 	if (error)
434 		goto done;
435 	auio.uio_resid = 0;
436 	for (i = 0; i < iovcnt; i++) {
437 		auio.uio_resid += iov->iov_len;
438 		/*
439 		 * Writes return ssize_t because -1 is returned on error.
440 		 * Therefore we must restrict the length to SSIZE_MAX to
441 		 * avoid garbage return values.
442 		 */
443 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
444 			error = EINVAL;
445 			goto done;
446 		}
447 		iov++;
448 	}
449 #ifdef KTRACE
450 	/*
451 	 * if tracing, save a copy of iovec
452 	 */
453 	if (KTRPOINT(p, KTR_GENIO))  {
454 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
455 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
456 	}
457 #endif
458 	cnt = auio.uio_resid;
459 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
460 	if (error) {
461 		if (auio.uio_resid != cnt && (error == ERESTART ||
462 		    error == EINTR || error == EWOULDBLOCK))
463 			error = 0;
464 		if (error == EPIPE)
465 			psignal(p, SIGPIPE);
466 	}
467 	cnt -= auio.uio_resid;
468 #ifdef KTRACE
469 	if (KTRPOINT(p, KTR_GENIO))
470 		if (error == 0) {
471 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
472 		free(ktriov, M_TEMP);
473 	}
474 #endif
475 	*retval = cnt;
476  done:
477 	if (needfree)
478 		free(needfree, M_IOV);
479  out:
480 	FILE_UNUSE(fp, p);
481 	return (error);
482 }
483 
/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(caddr_t)	data;
	} */ *uap = v;
	struct file	*fp;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	caddr_t		data, memp;
	int		tmp;
	/* On-stack argument buffer for small ioctl payloads. */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	/* Close-on-exec is a descriptor-table property, handled here. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	/* Use the stack buffer when the payload fits, else malloc. */
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				goto out;
			}
		} else
			/* Zero-size IOC_IN: the user pointer is the datum. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror the non-blocking state in f_flag, then notify. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Mirror the async-I/O state in f_flag, then notify. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets store the pgid directly. */
			((struct socket *)fp->f_data)->so_pgid = tmp;
			error = 0;
			break;
		}
		if (tmp <= 0) {
			/* A non-positive argument names a process group. */
			tmp = -tmp;
		} else {
			/* A positive argument names a process. */
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		if (error == 0)
			*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
629 
/* Sleep channel for select/poll, and collision count (see selwakeup()). */
int	selwait, nselcoll;
631 
/*
 * Select system call.
 *
 * Descriptor masks are kept in one contiguous buffer of six regions,
 * ni bytes each: input read/write/except sets at offsets 0..2*ni and
 * the corresponding result sets at 3*ni..5*ni (selscan() relies on
 * this layout).
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */ *uap = v;
	caddr_t		bits;
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	struct		timeval atv;
	int		s, ncoll, error, timo;
	size_t		ni;

	error = 0;
	if (SCARG(uap, nd) < 0)
		return (EINVAL);
	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nd) = p->p_fd->fd_nfiles;
	}
	/* Bytes per mask, rounded up to whole fd_mask words. */
	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	/* Copy in each supplied set; a NULL user pointer means empty. */
#define	getbits(name, x)						\
	if (SCARG(uap, name)) {						\
		error = copyin(SCARG(uap, name), bits + ni * x, ni);	\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
			sizeof(atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the relative timeout into an absolute deadline. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;
 retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, (fd_mask *)(bits + ni * 0),
			   (fd_mask *)(bits + ni * 3), SCARG(uap, nd), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * Rescan instead of sleeping if a wakeup raced with the scan:
	 * P_SELECT cleared or nselcoll changed means a selwakeup() ran.
	 */
	s = splsched();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {

	/* Copy each result set back out if a user pointer was supplied. */
#define	putbits(name, x)						\
		if (SCARG(uap, name)) {					\
			error = copyout(bits + ni * x, SCARG(uap, name), ni); \
			if (error)					\
				goto out;				\
		}
		putbits(in, 3);
		putbits(ou, 4);
		putbits(ex, 5);
#undef putbits
	}
 out:
	if (ni * 6 > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
741 
/*
 * Scan descriptors for select().  ibitp points at three consecutive
 * input masks (read, write, except) and obitp at the three result
 * masks, as laid out by sys_select().  Returns EBADF if a set bit
 * names a closed descriptor; otherwise stores the number of ready
 * descriptors through retval and returns 0.
 */
int
selscan(struct proc *p, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	struct filedesc	*fdp;
	int		msk, i, j, fd, n;
	fd_mask		ibits, obits;
	struct file	*fp;
	/* Poll events corresponding to the read, write and except sets. */
	static int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Visit each set bit via ffs(), clearing as we go. */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, p);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
777 
/*
 * Poll system call.
 *
 * The pollfd array is copied in, scanned by pollscan(), and copied
 * back out on success.  The sleep/retry protocol mirrors sys_select().
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */ *uap = v;
	caddr_t		bits;
	char		smallbits[32 * sizeof(struct pollfd)];
	struct timeval	atv;
	int		s, ncoll, error, timo;
	size_t		ni;

	error = 0;
	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
	}
	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done;

	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;
 retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, timeout) != INFTIM) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * Rescan instead of sleeping if a wakeup raced with the scan:
	 * P_SELECT cleared or nselcoll changed means a selwakeup() ran.
	 */
	s = splsched();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
 out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
863 
/*
 * Check each pollfd in fds[0..nfd-1], filling in revents: descriptors
 * beyond the table or closed get POLLNVAL (and count as ready),
 * negative fds get revents = 0, and open descriptors are queried via
 * their fileops poll routine with POLLERR|POLLHUP always requested.
 * Stores the number of entries with nonzero revents through retval;
 * always returns 0.
 */
int
pollscan(struct proc *p, struct pollfd *fds, int nfd, register_t *retval)
{
	struct filedesc	*fdp;
	int		i, n;
	struct file	*fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are ignored by convention. */
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, p);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, p);
			}
		}
	}
	*retval = n;
	return (0);
}
896 
897 /*ARGSUSED*/
898 int
899 seltrue(dev_t dev, int events, struct proc *p)
900 {
901 
902 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
903 }
904 
/*
 * Record a select request.
 *
 * A selinfo remembers at most one interested process.  If a different
 * process is already recorded and is asleep on selwait, flag a
 * collision so selwakeup() wakes every selector; otherwise take over
 * the slot for the current selector.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc	*p;
	pid_t		mypid;

	mypid = selector->p_pid;
	if (sip->si_pid == mypid)
		return;		/* already recorded for us */
	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else {
		sip->si_flags &= ~SI_COLL;
		sip->si_pid = mypid;
	}
}
925 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * On a recorded collision (SI_COLL) every process sleeping on selwait
 * is woken; the single recorded process is then made runnable if it is
 * asleep on selwait, or merely has P_SELECT cleared so its select/poll
 * loop rescans instead of sleeping.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct proc *p;
	int s;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* Multiple selectors collided on this selinfo. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p != NULL) {
		SCHED_LOCK(s);
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;
		SCHED_UNLOCK(s);
	}
}
957