xref: /openbsd-src/sys/kern/sys_generic.c (revision 43c6a78befebd24f220c7d68787d20a6f035d4fe)
1 /*	$OpenBSD: sys_generic.c,v 1.34 2002/02/08 13:53:28 art Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
67 int seltrue __P((dev_t, int, struct proc *));
68 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(p, v, retval)
76 	struct proc *p;
77 	void *v;
78 	register_t *retval;
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	int fd = SCARG(uap, fd);
86 	struct file *fp;
87 	struct filedesc *fdp = p->p_fd;
88 
89 	if ((fp = fd_getfile(fdp, fd)) == NULL)
90 		return (EBADF);
91 	if ((fp->f_flag & FREAD) == 0)
92 		return (EBADF);
93 
94 	FREF(fp);
95 
96 	/* dofileread() will unuse the descriptor for us */
97 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
98 	    &fp->f_offset, retval));
99 }
100 
/*
 * Common code for read(2): describe the single user buffer with a
 * one-element uio and pass it to the file's fo_read routine,
 * transferring at *offset.
 *
 * The caller has FREF'd fp; this routine FRELEs it on every return
 * path.  On success *retval holds the byte count transferred.
 */
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Build a single-element uio describing the user buffer. */
	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	/*
	 * If data was moved before the call was interrupted or would
	 * have blocked, report the partial transfer as success.
	 */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
160 
161 /*
162  * Scatter read system call.
163  */
164 int
165 sys_readv(p, v, retval)
166 	struct proc *p;
167 	void *v;
168 	register_t *retval;
169 {
170 	struct sys_readv_args /* {
171 		syscallarg(int) fd;
172 		syscallarg(const struct iovec *) iovp;
173 		syscallarg(int) iovcnt;
174 	} */ *uap = v;
175 	int fd = SCARG(uap, fd);
176 	struct file *fp;
177 	struct filedesc *fdp = p->p_fd;
178 
179 	if ((fp = fd_getfile(fdp, fd)) == NULL)
180 		return (EBADF);
181 	if ((fp->f_flag & FREAD) == 0)
182 		return (EBADF);
183 
184 	/* dofilereadv() will unuse the descriptor for us */
185 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
186 	    &fp->f_offset, retval));
187 }
188 
/*
 * Common code for readv(2): copy in and validate the iovec array,
 * then perform the scatter read through the file's fo_read routine,
 * transferring at *offset.
 *
 * On success *retval is set to the number of bytes transferred; an
 * interrupted or would-block partial transfer is reported as success.
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* non-NULL if iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		/* Small vectors use the on-stack array. */
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Total the transfer size, guarding against ssize_t overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	/* Report an interrupted partial transfer as success. */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	return (error);
}
278 
279 /*
280  * Write system call
281  */
282 int
283 sys_write(p, v, retval)
284 	struct proc *p;
285 	void *v;
286 	register_t *retval;
287 {
288 	struct sys_write_args /* {
289 		syscallarg(int) fd;
290 		syscallarg(const void *) buf;
291 		syscallarg(size_t) nbyte;
292 	} */ *uap = v;
293 	int fd = SCARG(uap, fd);
294 	struct file *fp;
295 	struct filedesc *fdp = p->p_fd;
296 
297 	if ((fp = fd_getfile(fdp, fd)) == NULL)
298 		return (EBADF);
299 	if ((fp->f_flag & FWRITE) == 0)
300 		return (EBADF);
301 
302 	/* dofilewrite() will unuse the descriptor for us */
303 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
304 	    &fp->f_offset, retval));
305 }
306 
/*
 * Common code for write(2): describe the single user buffer with a
 * one-element uio and pass it to the file's fo_write routine,
 * transferring at *offset.
 *
 * On success *retval holds the byte count transferred.  EPIPE posts
 * SIGPIPE to the process, matching write(2) semantics.
 */
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Build a single-element uio describing the user buffer. */
	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/* Report an interrupted partial transfer as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	return (error);
}
368 
369 /*
370  * Gather write system call
371  */
372 int
373 sys_writev(p, v, retval)
374 	struct proc *p;
375 	void *v;
376 	register_t *retval;
377 {
378 	struct sys_writev_args /* {
379 		syscallarg(int) fd;
380 		syscallarg(const struct iovec *) iovp;
381 		syscallarg(int) iovcnt;
382 	} */ *uap = v;
383 	int fd = SCARG(uap, fd);
384 	struct file *fp;
385 	struct filedesc *fdp = p->p_fd;
386 
387 	if ((fp = fd_getfile(fdp, fd)) == NULL)
388 		return (EBADF);
389 	if ((fp->f_flag & FWRITE) == 0)
390 		return (EBADF);
391 
392 	/* dofilewritev() will unuse the descriptor for us */
393 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
394 	    &fp->f_offset, retval));
395 }
396 
397 int
398 dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
399 	struct proc *p;
400 	int fd;
401 	struct file *fp;
402 	const struct iovec *iovp;
403 	int iovcnt;
404 	off_t *offset;
405 	register_t *retval;
406 {
407 	struct uio auio;
408 	struct iovec *iov;
409 	struct iovec *needfree;
410 	struct iovec aiov[UIO_SMALLIOV];
411 	long i, cnt, error = 0;
412 	u_int iovlen;
413 #ifdef KTRACE
414 	struct iovec *ktriov = NULL;
415 #endif
416 
417 	/* note: can't use iovlen until iovcnt is validated */
418 	iovlen = iovcnt * sizeof(struct iovec);
419 	if ((u_int)iovcnt > UIO_SMALLIOV) {
420 		if ((u_int)iovcnt > IOV_MAX)
421 			return (EINVAL);
422 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
423 	} else if ((u_int)iovcnt > 0) {
424 		iov = aiov;
425 		needfree = NULL;
426 	} else {
427 		error = EINVAL;
428 		goto out;
429 	}
430 
431 	auio.uio_iov = iov;
432 	auio.uio_iovcnt = iovcnt;
433 	auio.uio_rw = UIO_WRITE;
434 	auio.uio_segflg = UIO_USERSPACE;
435 	auio.uio_procp = p;
436 	error = copyin(iovp, iov, iovlen);
437 	if (error)
438 		goto done;
439 	auio.uio_resid = 0;
440 	for (i = 0; i < iovcnt; i++) {
441 		auio.uio_resid += iov->iov_len;
442 		/*
443 		 * Writes return ssize_t because -1 is returned on error.
444 		 * Therefore we must restrict the length to SSIZE_MAX to
445 		 * avoid garbage return values.
446 		 */
447 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
448 			error = EINVAL;
449 			goto done;
450 		}
451 		iov++;
452 	}
453 #ifdef KTRACE
454 	/*
455 	 * if tracing, save a copy of iovec
456 	 */
457 	if (KTRPOINT(p, KTR_GENIO))  {
458 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
459 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
460 	}
461 #endif
462 	cnt = auio.uio_resid;
463 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
464 	if (error) {
465 		if (auio.uio_resid != cnt && (error == ERESTART ||
466 		    error == EINTR || error == EWOULDBLOCK))
467 			error = 0;
468 		if (error == EPIPE)
469 			psignal(p, SIGPIPE);
470 	}
471 	cnt -= auio.uio_resid;
472 #ifdef KTRACE
473 	if (ktriov != NULL) {
474 		if (error == 0)
475 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
476 			    error);
477 		free(ktriov, M_TEMP);
478 	}
479 #endif
480 	*retval = cnt;
481  done:
482 	if (needfree)
483 		free(needfree, M_IOV);
484  out:
485 	return (error);
486 }
487 
488 /*
489  * Ioctl system call
490  */
491 /* ARGSUSED */
492 int
493 sys_ioctl(p, v, retval)
494 	struct proc *p;
495 	void *v;
496 	register_t *retval;
497 {
498 	register struct sys_ioctl_args /* {
499 		syscallarg(int) fd;
500 		syscallarg(u_long) com;
501 		syscallarg(caddr_t) data;
502 	} */ *uap = v;
503 	register struct file *fp;
504 	register struct filedesc *fdp;
505 	register u_long com;
506 	register int error;
507 	register u_int size;
508 	caddr_t data, memp;
509 	int tmp;
510 #define STK_PARAMS	128
511 	char stkbuf[STK_PARAMS];
512 
513 	fdp = p->p_fd;
514 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
515 		return (EBADF);
516 
517 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
518 		return (EBADF);
519 
520 	switch (com = SCARG(uap, com)) {
521 	case FIONCLEX:
522 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
523 		return (0);
524 	case FIOCLEX:
525 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
526 		return (0);
527 	}
528 
529 	/*
530 	 * Interpret high order word to find amount of data to be
531 	 * copied to/from the user's address space.
532 	 */
533 	size = IOCPARM_LEN(com);
534 	if (size > IOCPARM_MAX)
535 		return (ENOTTY);
536 	memp = NULL;
537 	if (size > sizeof (stkbuf)) {
538 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
539 		data = memp;
540 	} else
541 		data = stkbuf;
542 	if (com&IOC_IN) {
543 		if (size) {
544 			error = copyin(SCARG(uap, data), data, (u_int)size);
545 			if (error) {
546 				if (memp)
547 					free(memp, M_IOCTLOPS);
548 				return (error);
549 			}
550 		} else
551 			*(caddr_t *)data = SCARG(uap, data);
552 	} else if ((com&IOC_OUT) && size)
553 		/*
554 		 * Zero the buffer so the user always
555 		 * gets back something deterministic.
556 		 */
557 		bzero(data, size);
558 	else if (com&IOC_VOID)
559 		*(caddr_t *)data = SCARG(uap, data);
560 
561 	switch (com) {
562 
563 	case FIONBIO:
564 		if ((tmp = *(int *)data) != 0)
565 			fp->f_flag |= FNONBLOCK;
566 		else
567 			fp->f_flag &= ~FNONBLOCK;
568 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
569 		break;
570 
571 	case FIOASYNC:
572 		if ((tmp = *(int *)data) != 0)
573 			fp->f_flag |= FASYNC;
574 		else
575 			fp->f_flag &= ~FASYNC;
576 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
577 		break;
578 
579 	case FIOSETOWN:
580 		tmp = *(int *)data;
581 		if (fp->f_type == DTYPE_SOCKET) {
582 			struct socket *so = (struct socket *)fp->f_data;
583 
584 			so->so_pgid = tmp;
585 			so->so_siguid = p->p_cred->p_ruid;
586 			so->so_sigeuid = p->p_ucred->cr_uid;
587 			error = 0;
588 			break;
589 		}
590 		if (tmp <= 0) {
591 			tmp = -tmp;
592 		} else {
593 			struct proc *p1 = pfind(tmp);
594 			if (p1 == 0) {
595 				error = ESRCH;
596 				break;
597 			}
598 			tmp = p1->p_pgrp->pg_id;
599 		}
600 		error = (*fp->f_ops->fo_ioctl)
601 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
602 		break;
603 
604 	case FIOGETOWN:
605 		if (fp->f_type == DTYPE_SOCKET) {
606 			error = 0;
607 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
608 			break;
609 		}
610 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
611 		*(int *)data = -*(int *)data;
612 		break;
613 
614 	default:
615 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
616 		/*
617 		 * Copy any data to user, size was
618 		 * already set and checked above.
619 		 */
620 		if (error == 0 && (com&IOC_OUT) && size)
621 			error = copyout(data, SCARG(uap, data), (u_int)size);
622 		break;
623 	}
624 	if (memp)
625 		free(memp, M_IOCTLOPS);
626 	return (error);
627 }
628 
/*
 * selwait is the wait channel select/poll sleepers block on;
 * nselcoll counts select collisions (see selrecord()/selwakeup()).
 */
int	selwait, nselcoll;
630 
631 /*
632  * Select system call.
633  */
634 int
635 sys_select(p, v, retval)
636 	register struct proc *p;
637 	void *v;
638 	register_t *retval;
639 {
640 	register struct sys_select_args /* {
641 		syscallarg(int) nd;
642 		syscallarg(fd_set *) in;
643 		syscallarg(fd_set *) ou;
644 		syscallarg(fd_set *) ex;
645 		syscallarg(struct timeval *) tv;
646 	} */ *uap = v;
647 	fd_set bits[6], *pibits[3], *pobits[3];
648 	struct timeval atv;
649 	int s, ncoll, error = 0, timo;
650 	u_int ni;
651 
652 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
653 		/* forgiving; slightly wrong */
654 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
655 	}
656 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
657 	if (SCARG(uap, nd) > FD_SETSIZE) {
658 		caddr_t mbits;
659 
660 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
661 		bzero(mbits, ni * 6);
662 		pibits[0] = (fd_set *)&mbits[ni * 0];
663 		pibits[1] = (fd_set *)&mbits[ni * 1];
664 		pibits[2] = (fd_set *)&mbits[ni * 2];
665 		pobits[0] = (fd_set *)&mbits[ni * 3];
666 		pobits[1] = (fd_set *)&mbits[ni * 4];
667 		pobits[2] = (fd_set *)&mbits[ni * 5];
668 	} else {
669 		bzero((caddr_t)bits, sizeof(bits));
670 		pibits[0] = &bits[0];
671 		pibits[1] = &bits[1];
672 		pibits[2] = &bits[2];
673 		pobits[0] = &bits[3];
674 		pobits[1] = &bits[4];
675 		pobits[2] = &bits[5];
676 	}
677 
678 #define	getbits(name, x) \
679 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
680 	    (caddr_t)pibits[x], ni))) \
681 		goto done;
682 	getbits(in, 0);
683 	getbits(ou, 1);
684 	getbits(ex, 2);
685 #undef	getbits
686 
687 	if (SCARG(uap, tv)) {
688 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
689 			sizeof (atv));
690 		if (error)
691 			goto done;
692 		if (itimerfix(&atv)) {
693 			error = EINVAL;
694 			goto done;
695 		}
696 		s = splclock();
697 		timeradd(&atv, &time, &atv);
698 		splx(s);
699 	} else
700 		timo = 0;
701 retry:
702 	ncoll = nselcoll;
703 	p->p_flag |= P_SELECT;
704 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
705 	if (error || *retval)
706 		goto done;
707 	if (SCARG(uap, tv)) {
708 		/*
709 		 * We have to recalculate the timeout on every retry.
710 		 */
711 		timo = hzto(&atv);
712 		if (timo <= 0)
713 			goto done;
714 	}
715 	s = splhigh();
716 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
717 		splx(s);
718 		goto retry;
719 	}
720 	p->p_flag &= ~P_SELECT;
721 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
722 	splx(s);
723 	if (error == 0)
724 		goto retry;
725 done:
726 	p->p_flag &= ~P_SELECT;
727 	/* select is not restarted after signals... */
728 	if (error == ERESTART)
729 		error = EINTR;
730 	if (error == EWOULDBLOCK)
731 		error = 0;
732 #define	putbits(name, x) \
733 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
734 	    (caddr_t)SCARG(uap, name), ni))) \
735 		error = error2;
736 	if (error == 0) {
737 		int error2;
738 
739 		putbits(in, 0);
740 		putbits(ou, 1);
741 		putbits(ex, 2);
742 #undef putbits
743 	}
744 
745 	if (pibits[0] != &bits[0])
746 		free(pibits[0], M_TEMP);
747 	return (error);
748 }
749 
750 int
751 selscan(p, ibits, obits, nfd, retval)
752 	struct proc *p;
753 	fd_set *ibits, *obits;
754 	int nfd;
755 	register_t *retval;
756 {
757 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
758 	register struct filedesc *fdp = p->p_fd;
759 	register int msk, i, j, fd;
760 	register fd_mask bits;
761 	struct file *fp;
762 	int ni, n = 0;
763 	static int flag[3] = { FREAD, FWRITE, 0 };
764 
765 	/*
766 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
767 	 * up to the next byte) otherwise the fd_set's are normal sized.
768 	 */
769 	ni = sizeof(fd_set);
770 	if (nfd > FD_SETSIZE)
771 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
772 
773 	for (msk = 0; msk < 3; msk++) {
774 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
775 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
776 
777 		for (i = 0; i < nfd; i += NFDBITS) {
778 			bits = pibits->fds_bits[i/NFDBITS];
779 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
780 				bits &= ~(1 << j);
781 				if ((fp = fd_getfile(fdp, fd)) == NULL)
782 					return (EBADF);
783 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
784 					FD_SET(fd, pobits);
785 					n++;
786 				}
787 			}
788 		}
789 	}
790 	*retval = n;
791 	return (0);
792 }
793 
/*
 * Generic select routine for devices that are always ready: report
 * the descriptor as selectable regardless of the requested events.
 */
/*ARGSUSED*/
int
seltrue(dev, events, pr)
	dev_t dev;
	int events;
	struct proc *pr;
{
	/* Unconditionally ready. */
	return (1);
}
804 
805 /*
806  * Record a select request.
807  */
808 void
809 selrecord(selector, sip)
810 	struct proc *selector;
811 	struct selinfo *sip;
812 {
813 	struct proc *p;
814 	pid_t mypid;
815 
816 	mypid = selector->p_pid;
817 	if (sip->si_selpid == mypid)
818 		return;
819 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
820 	    p->p_wchan == (caddr_t)&selwait)
821 		sip->si_flags |= SI_COLL;
822 	else
823 		sip->si_selpid = mypid;
824 }
825 
826 /*
827  * Do a wakeup when a selectable event occurs.
828  */
829 void
830 selwakeup(sip)
831 	register struct selinfo *sip;
832 {
833 	register struct proc *p;
834 	int s;
835 
836 	if (sip->si_selpid == 0)
837 		return;
838 	if (sip->si_flags & SI_COLL) {
839 		nselcoll++;
840 		sip->si_flags &= ~SI_COLL;
841 		wakeup((caddr_t)&selwait);
842 	}
843 	p = pfind(sip->si_selpid);
844 	sip->si_selpid = 0;
845 	if (p != NULL) {
846 		s = splhigh();
847 		if (p->p_wchan == (caddr_t)&selwait) {
848 			if (p->p_stat == SSLEEP)
849 				setrunnable(p);
850 			else
851 				unsleep(p);
852 		} else if (p->p_flag & P_SELECT)
853 			p->p_flag &= ~P_SELECT;
854 		splx(s);
855 	}
856 }
857 
/*
 * Scan the pollfd array once, setting revents for each entry and
 * storing the count of descriptors with any event (or POLLNVAL) in
 * *retval.  The caller has already zeroed each revents field.
 */
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	int nfd;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int msk, i;
	struct file *fp;
	int x, n = 0;
	/* fo_select flag and the poll events it answers for. */
	static int flag[3] = { FREAD, FWRITE, 0 };
	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };

	/*
	 * XXX: We need to implement the rest of the flags.
	 */
	for (i = 0; i < nfd; i++) {
		/* Check the file descriptor. */
		if (pl[i].fd < 0) {
			/* Negative fd: ignored by convention. */
			pl[i].revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl[i].fd)) == NULL) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}
		/* Ask the file about each requested event class. */
		for (x = msk = 0; msk < 3; msk++) {
			if (pl[i].events & pflag[msk]) {
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					pl[i].revents |= pflag[msk] &
					    pl[i].events;
					x++;
				}
			}
		}
		if (x)
			n++;
	}
	*retval = n;
}
900 
901 /*
902  * We are using the same mechanism as select only we encode/decode args
903  * differently.
904  */
905 int
906 sys_poll(p, v, retval)
907 	register struct proc *p;
908 	void *v;
909 	register_t *retval;
910 {
911 	struct sys_poll_args *uap = v;
912 	size_t sz;
913 	struct pollfd pfds[4], *pl = pfds;
914 	int msec = SCARG(uap, timeout);
915 	struct timeval atv;
916 	int timo, ncoll, i, s, error, error2;
917 	extern int nselcoll, selwait;
918 
919 	/* Standards say no more than MAX_OPEN; this is possibly better. */
920 	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
921 	    maxfiles))
922 		return (EINVAL);
923 
924 	sz = sizeof(struct pollfd) * SCARG(uap, nfds);
925 
926 	/* optimize for the default case, of a small nfds value */
927 	if (sz > sizeof(pfds))
928 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
929 
930 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
931 		goto bad;
932 
933 	for (i = 0; i < SCARG(uap, nfds); i++)
934 		pl[i].revents = 0;
935 
936 	if (msec != -1) {
937 		atv.tv_sec = msec / 1000;
938 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
939 
940 		if (itimerfix(&atv)) {
941 			error = EINVAL;
942 			goto done;
943 		}
944 		s = splclock();
945 		timeradd(&atv, &time, &atv);
946 		splx(s);
947 	} else
948 		timo = 0;
949 
950 retry:
951 	ncoll = nselcoll;
952 	p->p_flag |= P_SELECT;
953 	pollscan(p, pl, SCARG(uap, nfds), retval);
954 	if (*retval)
955 		goto done;
956 	if (msec != -1) {
957 		/*
958 		 * We have to recalculate the timeout on every retry.
959 		 */
960 		timo = hzto(&atv);
961 		if (timo <= 0)
962 			goto done;
963 	}
964 	s = splhigh();
965 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
966 		splx(s);
967 		goto retry;
968 	}
969 	p->p_flag &= ~P_SELECT;
970 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
971 	splx(s);
972 	if (error == 0)
973 		goto retry;
974 
975 done:
976 	p->p_flag &= ~P_SELECT;
977 	/* poll is not restarted after signals... */
978 	if (error == ERESTART)
979 		error = EINTR;
980 	if (error == EWOULDBLOCK)
981 		error = 0;
982 	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
983 		error = error2;
984 bad:
985 	if (pl != pfds)
986 		free((char *) pl, M_TEMP);
987 	return (error);
988 }
989 
990