xref: /openbsd-src/sys/kern/sys_generic.c (revision b18008b0408a2028b731e7f31eb64cd69fb3767c)
1 /*	$OpenBSD: sys_generic.c,v 1.35 2002/02/08 19:47:50 art Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
67 int seltrue __P((dev_t, int, struct proc *));
68 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(p, v, retval)
76 	struct proc *p;
77 	void *v;
78 	register_t *retval;
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	int fd = SCARG(uap, fd);
86 	struct file *fp;
87 	struct filedesc *fdp = p->p_fd;
88 
89 	if ((fp = fd_getfile(fdp, fd)) == NULL)
90 		return (EBADF);
91 	if ((fp->f_flag & FREAD) == 0)
92 		return (EBADF);
93 
94 	FREF(fp);
95 
96 	/* dofileread() will FRELE the descriptor for us */
97 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
98 	    &fp->f_offset, retval));
99 }
100 
/*
 * Common code for read(2)-style calls: transfer up to nbyte bytes from
 * fp at *offset into the user buffer buf.  On success the byte count
 * actually read is stored through retval.  The caller must have FREF'd
 * fp; this function FRELEs it on every return path.
 */
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Build a single-element uio describing the user buffer. */
	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/*
		 * If some data was transferred before the interruption,
		 * report the partial transfer instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
160 
161 /*
162  * Scatter read system call.
163  */
164 int
165 sys_readv(p, v, retval)
166 	struct proc *p;
167 	void *v;
168 	register_t *retval;
169 {
170 	struct sys_readv_args /* {
171 		syscallarg(int) fd;
172 		syscallarg(const struct iovec *) iovp;
173 		syscallarg(int) iovcnt;
174 	} */ *uap = v;
175 	int fd = SCARG(uap, fd);
176 	struct file *fp;
177 	struct filedesc *fdp = p->p_fd;
178 
179 	if ((fp = fd_getfile(fdp, fd)) == NULL)
180 		return (EBADF);
181 	if ((fp->f_flag & FREAD) == 0)
182 		return (EBADF);
183 
184 	FREF(fp);
185 
186 	/* dofilereadv() will FRELE the descriptor for us */
187 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
188 	    &fp->f_offset, retval));
189 }
190 
/*
 * Common code for readv(2)-style calls: scatter data from fp at
 * *offset into the iovcnt user buffers described by iovp.  On success
 * the number of bytes read is stored through retval.  The caller must
 * have FREF'd fp; this function FRELEs it on every return path.
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];	/* avoids malloc for small iovcnt */
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 (negatives are caught by the IOV_MAX test) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	/* fetch the iovec array itself from user space */
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/*
		 * If some data was transferred before the interruption,
		 * report the partial transfer instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}
281 
282 /*
283  * Write system call
284  */
285 int
286 sys_write(p, v, retval)
287 	struct proc *p;
288 	void *v;
289 	register_t *retval;
290 {
291 	struct sys_write_args /* {
292 		syscallarg(int) fd;
293 		syscallarg(const void *) buf;
294 		syscallarg(size_t) nbyte;
295 	} */ *uap = v;
296 	int fd = SCARG(uap, fd);
297 	struct file *fp;
298 	struct filedesc *fdp = p->p_fd;
299 
300 	if ((fp = fd_getfile(fdp, fd)) == NULL)
301 		return (EBADF);
302 	if ((fp->f_flag & FWRITE) == 0)
303 		return (EBADF);
304 
305 	/* dofilewrite() will unuse the descriptor for us */
306 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
307 	    &fp->f_offset, retval));
308 }
309 
310 int
311 dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
312 	struct proc *p;
313 	int fd;
314 	struct file *fp;
315 	const void *buf;
316 	size_t nbyte;
317 	off_t *offset;
318 	register_t *retval;
319 {
320 	struct uio auio;
321 	struct iovec aiov;
322 	long cnt, error = 0;
323 #ifdef KTRACE
324 	struct iovec ktriov;
325 #endif
326 
327 	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
328 	aiov.iov_len = nbyte;
329 	auio.uio_iov = &aiov;
330 	auio.uio_iovcnt = 1;
331 	auio.uio_resid = nbyte;
332 	auio.uio_rw = UIO_WRITE;
333 	auio.uio_segflg = UIO_USERSPACE;
334 	auio.uio_procp = p;
335 
336 	/*
337 	 * Writes return ssize_t because -1 is returned on error.  Therefore
338 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
339 	 * values.
340 	 */
341 	if (auio.uio_resid > SSIZE_MAX) {
342 		error = EINVAL;
343 		goto out;
344 	}
345 
346 #ifdef KTRACE
347 	/*
348 	 * if tracing, save a copy of iovec
349 	 */
350 	if (KTRPOINT(p, KTR_GENIO))
351 		ktriov = aiov;
352 #endif
353 	cnt = auio.uio_resid;
354 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
355 	if (error) {
356 		if (auio.uio_resid != cnt && (error == ERESTART ||
357 		    error == EINTR || error == EWOULDBLOCK))
358 			error = 0;
359 		if (error == EPIPE)
360 			psignal(p, SIGPIPE);
361 	}
362 	cnt -= auio.uio_resid;
363 #ifdef KTRACE
364 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
365 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
366 #endif
367 	*retval = cnt;
368  out:
369 	return (error);
370 }
371 
372 /*
373  * Gather write system call
374  */
375 int
376 sys_writev(p, v, retval)
377 	struct proc *p;
378 	void *v;
379 	register_t *retval;
380 {
381 	struct sys_writev_args /* {
382 		syscallarg(int) fd;
383 		syscallarg(const struct iovec *) iovp;
384 		syscallarg(int) iovcnt;
385 	} */ *uap = v;
386 	int fd = SCARG(uap, fd);
387 	struct file *fp;
388 	struct filedesc *fdp = p->p_fd;
389 
390 	if ((fp = fd_getfile(fdp, fd)) == NULL)
391 		return (EBADF);
392 	if ((fp->f_flag & FWRITE) == 0)
393 		return (EBADF);
394 
395 	/* dofilewritev() will unuse the descriptor for us */
396 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
397 	    &fp->f_offset, retval));
398 }
399 
400 int
401 dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
402 	struct proc *p;
403 	int fd;
404 	struct file *fp;
405 	const struct iovec *iovp;
406 	int iovcnt;
407 	off_t *offset;
408 	register_t *retval;
409 {
410 	struct uio auio;
411 	struct iovec *iov;
412 	struct iovec *needfree;
413 	struct iovec aiov[UIO_SMALLIOV];
414 	long i, cnt, error = 0;
415 	u_int iovlen;
416 #ifdef KTRACE
417 	struct iovec *ktriov = NULL;
418 #endif
419 
420 	/* note: can't use iovlen until iovcnt is validated */
421 	iovlen = iovcnt * sizeof(struct iovec);
422 	if ((u_int)iovcnt > UIO_SMALLIOV) {
423 		if ((u_int)iovcnt > IOV_MAX)
424 			return (EINVAL);
425 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
426 	} else if ((u_int)iovcnt > 0) {
427 		iov = aiov;
428 		needfree = NULL;
429 	} else {
430 		error = EINVAL;
431 		goto out;
432 	}
433 
434 	auio.uio_iov = iov;
435 	auio.uio_iovcnt = iovcnt;
436 	auio.uio_rw = UIO_WRITE;
437 	auio.uio_segflg = UIO_USERSPACE;
438 	auio.uio_procp = p;
439 	error = copyin(iovp, iov, iovlen);
440 	if (error)
441 		goto done;
442 	auio.uio_resid = 0;
443 	for (i = 0; i < iovcnt; i++) {
444 		auio.uio_resid += iov->iov_len;
445 		/*
446 		 * Writes return ssize_t because -1 is returned on error.
447 		 * Therefore we must restrict the length to SSIZE_MAX to
448 		 * avoid garbage return values.
449 		 */
450 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
451 			error = EINVAL;
452 			goto done;
453 		}
454 		iov++;
455 	}
456 #ifdef KTRACE
457 	/*
458 	 * if tracing, save a copy of iovec
459 	 */
460 	if (KTRPOINT(p, KTR_GENIO))  {
461 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
462 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
463 	}
464 #endif
465 	cnt = auio.uio_resid;
466 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
467 	if (error) {
468 		if (auio.uio_resid != cnt && (error == ERESTART ||
469 		    error == EINTR || error == EWOULDBLOCK))
470 			error = 0;
471 		if (error == EPIPE)
472 			psignal(p, SIGPIPE);
473 	}
474 	cnt -= auio.uio_resid;
475 #ifdef KTRACE
476 	if (ktriov != NULL) {
477 		if (error == 0)
478 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
479 			    error);
480 		free(ktriov, M_TEMP);
481 	}
482 #endif
483 	*retval = cnt;
484  done:
485 	if (needfree)
486 		free(needfree, M_IOV);
487  out:
488 	return (error);
489 }
490 
/*
 * Ioctl system call
 *
 * Decodes the command word, moves the argument between user and
 * kernel space as the IOC_IN/IOC_OUT/IOC_VOID encoding dictates,
 * handles the generic commands here, and passes everything else to
 * the file's fo_ioctl routine.
 */
/* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	register int error;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/* Close-on-exec commands only touch descriptor table flags. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	memp = NULL;
	/* Large argument blocks go to the heap, small ones on the stack. */
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				return (error);
			}
		} else
			/* zero-length IOC_IN: pass the pointer itself */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* sockets record the pgid and signal credentials */
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		/* non-sockets: translate to a TIOCSPGRP on the file */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}
631 
/* Rendezvous channel for select()/poll() sleepers, and collision count. */
int	selwait, nselcoll;
633 
634 /*
635  * Select system call.
636  */
637 int
638 sys_select(p, v, retval)
639 	register struct proc *p;
640 	void *v;
641 	register_t *retval;
642 {
643 	register struct sys_select_args /* {
644 		syscallarg(int) nd;
645 		syscallarg(fd_set *) in;
646 		syscallarg(fd_set *) ou;
647 		syscallarg(fd_set *) ex;
648 		syscallarg(struct timeval *) tv;
649 	} */ *uap = v;
650 	fd_set bits[6], *pibits[3], *pobits[3];
651 	struct timeval atv;
652 	int s, ncoll, error = 0, timo;
653 	u_int ni;
654 
655 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
656 		/* forgiving; slightly wrong */
657 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
658 	}
659 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
660 	if (SCARG(uap, nd) > FD_SETSIZE) {
661 		caddr_t mbits;
662 
663 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
664 		bzero(mbits, ni * 6);
665 		pibits[0] = (fd_set *)&mbits[ni * 0];
666 		pibits[1] = (fd_set *)&mbits[ni * 1];
667 		pibits[2] = (fd_set *)&mbits[ni * 2];
668 		pobits[0] = (fd_set *)&mbits[ni * 3];
669 		pobits[1] = (fd_set *)&mbits[ni * 4];
670 		pobits[2] = (fd_set *)&mbits[ni * 5];
671 	} else {
672 		bzero((caddr_t)bits, sizeof(bits));
673 		pibits[0] = &bits[0];
674 		pibits[1] = &bits[1];
675 		pibits[2] = &bits[2];
676 		pobits[0] = &bits[3];
677 		pobits[1] = &bits[4];
678 		pobits[2] = &bits[5];
679 	}
680 
681 #define	getbits(name, x) \
682 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
683 	    (caddr_t)pibits[x], ni))) \
684 		goto done;
685 	getbits(in, 0);
686 	getbits(ou, 1);
687 	getbits(ex, 2);
688 #undef	getbits
689 
690 	if (SCARG(uap, tv)) {
691 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
692 			sizeof (atv));
693 		if (error)
694 			goto done;
695 		if (itimerfix(&atv)) {
696 			error = EINVAL;
697 			goto done;
698 		}
699 		s = splclock();
700 		timeradd(&atv, &time, &atv);
701 		splx(s);
702 	} else
703 		timo = 0;
704 retry:
705 	ncoll = nselcoll;
706 	p->p_flag |= P_SELECT;
707 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
708 	if (error || *retval)
709 		goto done;
710 	if (SCARG(uap, tv)) {
711 		/*
712 		 * We have to recalculate the timeout on every retry.
713 		 */
714 		timo = hzto(&atv);
715 		if (timo <= 0)
716 			goto done;
717 	}
718 	s = splhigh();
719 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
720 		splx(s);
721 		goto retry;
722 	}
723 	p->p_flag &= ~P_SELECT;
724 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
725 	splx(s);
726 	if (error == 0)
727 		goto retry;
728 done:
729 	p->p_flag &= ~P_SELECT;
730 	/* select is not restarted after signals... */
731 	if (error == ERESTART)
732 		error = EINTR;
733 	if (error == EWOULDBLOCK)
734 		error = 0;
735 #define	putbits(name, x) \
736 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
737 	    (caddr_t)SCARG(uap, name), ni))) \
738 		error = error2;
739 	if (error == 0) {
740 		int error2;
741 
742 		putbits(in, 0);
743 		putbits(ou, 1);
744 		putbits(ex, 2);
745 #undef putbits
746 	}
747 
748 	if (pibits[0] != &bits[0])
749 		free(pibits[0], M_TEMP);
750 	return (error);
751 }
752 
/*
 * Scan the three input fd_set's for ready descriptors, setting the
 * corresponding bits in the output sets.  The number of ready
 * descriptors found is stored through retval.  Returns EBADF if a
 * set bit names a closed descriptor, else 0.
 */
int
selscan(p, ibits, obits, nfd, retval)
	struct proc *p;
	fd_set *ibits, *obits;
	int nfd;
	register_t *retval;
{
	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
	register struct filedesc *fdp = p->p_fd;
	register int msk, i, j, fd;
	register fd_mask bits;
	struct file *fp;
	int ni, n = 0;
	/* fo_select() flag for each of the read/write/except passes */
	static int flag[3] = { FREAD, FWRITE, 0 };

	/*
	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
	 * up to the next byte) otherwise the fd_set's are normal sized.
	 */
	ni = sizeof(fd_set);
	if (nfd > FD_SETSIZE)
		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);

	for (msk = 0; msk < 3; msk++) {
		fd_set *pibits = (fd_set *)&cibits[msk*ni];
		fd_set *pobits = (fd_set *)&cobits[msk*ni];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits->fds_bits[i/NFDBITS];
			/* visit each set bit in this word, lowest first */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					FD_SET(fd, pobits);
					n++;
				}
			}
		}
	}
	*retval = n;
	return (0);
}
796 
/*
 * Generic select routine for devices that are always ready.
 */
/*ARGSUSED*/
int
seltrue(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{

	return (1);
}
807 
/*
 * Record a select request.
 *
 * Remember the selecting process in sip so selwakeup() can find it
 * later.  Only one pid fits in si_selpid; if a different process is
 * already recorded and still sleeping on selwait, mark a collision
 * instead, which makes selwakeup() wake every select() sleeper.
 */
void
selrecord(selector, sip)
	struct proc *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->p_pid;
	if (sip->si_selpid == mypid)
		return;
	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_selpid = mypid;
}
828 
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* several selectors collided on sip: wake them all */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			/* not asleep yet: clearing P_SELECT forces a rescan */
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}
860 
/*
 * Check each pollfd in pl[0..nfd-1] for pending events, filling in
 * its revents field.  The number of descriptors with at least one
 * event (or POLLNVAL) is stored through retval.
 */
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	int nfd;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int msk, i;
	struct file *fp;
	int x, n = 0;
	/* fo_select() flag and the poll event bits it answers for */
	static int flag[3] = { FREAD, FWRITE, 0 };
	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };

	/*
	 * XXX: We need to implement the rest of the flags.
	 */
	for (i = 0; i < nfd; i++) {
		/* Check the file descriptor. */
		if (pl[i].fd < 0) {
			/* negative fd: entry is ignored, per poll(2) */
			pl[i].revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl[i].fd)) == NULL) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}
		for (x = msk = 0; msk < 3; msk++) {
			if (pl[i].events & pflag[msk]) {
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					pl[i].revents |= pflag[msk] &
					    pl[i].events;
					x++;
				}
			}
		}
		if (x)
			n++;
	}
	*retval = n;
}
903 
/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 */
int
sys_poll(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_poll_args *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);
	struct timeval atv;
	int timo, ncoll, i, s, error, error2;
	extern int nselcoll, selwait;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
	    maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * SCARG(uap, nfds);

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	for (i = 0; i < SCARG(uap, nfds); i++)
		pl[i].revents = 0;

	if (msec != -1) {
		/* convert the millisecond timeout to an absolute time */
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until an event */

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, SCARG(uap, nfds), retval);
	if (*retval)
		goto done;
	if (msec != -1) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	/*
	 * A collision (nselcoll changed) or a wakeup that already
	 * cleared P_SELECT means an event may have arrived since the
	 * scan; rescan instead of sleeping.
	 */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* hand the filled-in revents back to the user */
	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
		error = error2;
bad:
	if (pl != pfds)
		free((char *) pl, M_TEMP);
	return (error);
}
992 
993