/*	$OpenBSD: sys_generic.c,v 1.32 2002/02/02 16:05:58 art Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/mount.h>
#include <sys/syscallargs.h>

int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
int seltrue __P((dev_t, int, struct proc *));
void pollscan __P((struct proc *, struct pollfd *, int, register_t *));

/*
 * Read system call.
 */
/* ARGSUSED */
int
sys_read(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FREAD) == 0)
		return (EBADF);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, retval));
}

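/*
 * Back end for sys_read(): build a single-segment uio describing the
 * user buffer, reject requests larger than SSIZE_MAX, and hand the
 * transfer to the descriptor's fo_read op through the given offset
 * pointer.
 */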
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
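	/*
	 * A transfer that was cut short by a signal or a would-block
	 * condition after some data already moved is reported as a
	 * short read rather than an error.
	 */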
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}

/*
 * Scatter read system call.
 */
int
sys_readv(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FREAD) == 0)
		return (EBADF);

	/* dofilereadv() will unuse the descriptor for us */
	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, retval));
}

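/*
 * Back end for sys_readv(): copy the iovec array in from user space
 * (an on-stack array holds up to UIO_SMALLIOV entries, a malloc'd one
 * up to IOV_MAX), verify that no segment and no running total exceeds
 * SSIZE_MAX, then pass the assembled uio to the descriptor's fo_read op.
 */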
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}

/*
 * Write system call
 */
int
sys_write(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FWRITE) == 0)
		return (EBADF);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, retval));
}

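/*
 * Back end for sys_write(): the write-side twin of dofileread().  The
 * differences are the UIO_WRITE direction, the fo_write op, and that
 * an EPIPE failure also posts SIGPIPE to the writing process.
 */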
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}

/*
 * Gather write system call
 */
int
sys_writev(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FWRITE) == 0)
		return (EBADF);

	/* dofilewritev() will unuse the descriptor for us */
	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, retval));
}

int
dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX)
			return (EINVAL);
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	register int error;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	memp = NULL;
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				return (error);
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

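	/*
	 * Non-blocking and async mode (FIONBIO, FIOASYNC) and descriptor
	 * ownership (FIOSETOWN, FIOGETOWN) are handled generically here;
	 * any other request is passed straight to the file's fo_ioctl op
	 * with the argument already copied in above.
	 */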
	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}

int	selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	fd_set bits[6], *pibits[3], *pobits[3];
	struct timeval atv;
	int s, ncoll, error = 0, timo;
	u_int ni;

	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nd) = p->p_fd->fd_nfiles;
	}
	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
	if (SCARG(uap, nd) > FD_SETSIZE) {
		caddr_t mbits;

		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
		bzero(mbits, ni * 6);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		bzero((caddr_t)bits, sizeof(bits));
		pibits[0] = &bits[0];
		pibits[1] = &bits[1];
		pibits[2] = &bits[2];
		pobits[0] = &bits[3];
		pobits[1] = &bits[4];
		pobits[2] = &bits[5];
	}

#define	getbits(name, x) \
	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
	    (caddr_t)pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;
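	/*
	 * Scan/sleep loop: mark ourselves as selecting (P_SELECT) and
	 * snapshot the collision counter, then scan the descriptors.  If
	 * nothing is ready we sleep on selwait, but if a selwakeup() ran
	 * in the meantime (P_SELECT cleared or nselcoll changed) the
	 * recorded state may be stale, so rescan instead of sleeping.
	 */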
retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
	    (caddr_t)SCARG(uap, name), ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	if (pibits[0] != &bits[0])
		free(pibits[0], M_TEMP);
	return (error);
}

int
selscan(p, ibits, obits, nfd, retval)
	struct proc *p;
	fd_set *ibits, *obits;
	int nfd;
	register_t *retval;
{
	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
	register struct filedesc *fdp = p->p_fd;
	register int msk, i, j, fd;
	register fd_mask bits;
	struct file *fp;
	int ni, n = 0;
	static int flag[3] = { FREAD, FWRITE, 0 };

	/*
	 * If nfd > FD_SETSIZE, the fd_sets contain nfd bits (rounded up
	 * to the next byte); otherwise the fd_sets are normal sized.
	 */
	ni = sizeof(fd_set);
	if (nfd > FD_SETSIZE)
		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);

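	/*
	 * For each of the three event classes, walk the input set one
	 * fd_mask word at a time, using ffs() to pick off the set bits;
	 * each named descriptor is polled through its fo_select op and
	 * marked in the corresponding output set if it is ready.
	 */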
	for (msk = 0; msk < 3; msk++) {
		fd_set *pibits = (fd_set *)&cibits[msk*ni];
		fd_set *pobits = (fd_set *)&cobits[msk*ni];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits->fds_bits[i/NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					FD_SET(fd, pobits);
					n++;
				}
			}
		}
	}
	*retval = n;
	return (0);
}

/*ARGSUSED*/
int
seltrue(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{

	return (1);
}

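/*
 * A struct selinfo remembers at most one selecting process (si_selpid).
 * If a second process selects on the same object while the first is
 * still asleep on selwait, SI_COLL is set instead, and selwakeup() then
 * wakes every selector and bumps nselcoll so they all rescan.
 */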
/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct proc *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->p_pid;
	if (sip->si_selpid == mypid)
		return;
	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_selpid = mypid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}

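/*
 * Poll the descriptors named in a pollfd array: a negative fd gets an
 * empty revents, an fd that is not open is flagged POLLNVAL, and the
 * rest are probed through fo_select for the read/write/error classes
 * requested in events.
 */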
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	int nfd;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int msk, i;
	struct file *fp;
	int x, n = 0;
	static int flag[3] = { FREAD, FWRITE, 0 };
	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };

	/*
	 * XXX: We need to implement the rest of the flags.
	 */
	for (i = 0; i < nfd; i++) {
		/* Check the file descriptor. */
		if (pl[i].fd < 0) {
			pl[i].revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl[i].fd)) == NULL) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}
		for (x = msk = 0; msk < 3; msk++) {
			if (pl[i].events & pflag[msk]) {
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					pl[i].revents |= pflag[msk] &
					    pl[i].events;
					x++;
				}
			}
		}
		if (x)
			n++;
	}
	*retval = n;
}

/*
 * We use the same mechanism as select; only the argument
 * encoding/decoding differs.
 */
int
sys_poll(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_poll_args *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);
	struct timeval atv;
	int timo, ncoll, i, s, error, error2;
	extern int nselcoll, selwait;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
	    maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * SCARG(uap, nfds);

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	for (i = 0; i < SCARG(uap, nfds); i++)
		pl[i].revents = 0;

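	/*
	 * A timeout of -1 means wait indefinitely (tsleep timo of 0);
	 * otherwise convert the millisecond count to an absolute timeval
	 * so the remaining ticks can be recomputed with hzto() on every
	 * pass through the retry loop below.
	 */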
	if (msec != -1) {
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, SCARG(uap, nfds), retval);
	if (*retval)
		goto done;
	if (msec != -1) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
		error = error2;
bad:
	if (pl != pfds)
		free((char *) pl, M_TEMP);
	return (error);
}