xref: /openbsd-src/sys/kern/sys_generic.c (revision 47911bd667ac77dc523b8a13ef40b012dbffa741)
1 /*	$OpenBSD: sys_generic.c,v 1.41 2002/08/12 14:32:44 aaron Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 #include <uvm/uvm_extern.h>
67 
68 int selscan(struct proc *, fd_set *, fd_set *, int, register_t *);
69 int seltrue(dev_t, int, struct proc *);
70 void pollscan(struct proc *, struct pollfd *, int, register_t *);
71 
72 /*
73  * Read system call.
74  */
75 /* ARGSUSED */
76 int
77 sys_read(p, v, retval)
78 	struct proc *p;
79 	void *v;
80 	register_t *retval;
81 {
82 	struct sys_read_args /* {
83 		syscallarg(int) fd;
84 		syscallarg(void *) buf;
85 		syscallarg(size_t) nbyte;
86 	} */ *uap = v;
87 	int fd = SCARG(uap, fd);
88 	struct file *fp;
89 	struct filedesc *fdp = p->p_fd;
90 
91 	if ((fp = fd_getfile(fdp, fd)) == NULL)
92 		return (EBADF);
93 	if ((fp->f_flag & FREAD) == 0)
94 		return (EBADF);
95 
96 	FREF(fp);
97 
98 	/* dofileread() will FRELE the descriptor for us */
99 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
100 	    &fp->f_offset, retval));
101 }
102 
/*
 * Common code for read(2) and pread-style entry points: perform a
 * single-buffer read through fp's fo_read routine.
 *
 * The caller has already FREF'd fp; this function FRELEs it on every
 * exit path.  On success *retval holds the number of bytes read.
 */
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Describe the user buffer with a one-element uio. */
	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	/*
	 * If the transfer was interrupted after some data moved, report
	 * the partial count as success rather than the error.
	 */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
162 
163 /*
164  * Scatter read system call.
165  */
166 int
167 sys_readv(p, v, retval)
168 	struct proc *p;
169 	void *v;
170 	register_t *retval;
171 {
172 	struct sys_readv_args /* {
173 		syscallarg(int) fd;
174 		syscallarg(const struct iovec *) iovp;
175 		syscallarg(int) iovcnt;
176 	} */ *uap = v;
177 	int fd = SCARG(uap, fd);
178 	struct file *fp;
179 	struct filedesc *fdp = p->p_fd;
180 
181 	if ((fp = fd_getfile(fdp, fd)) == NULL)
182 		return (EBADF);
183 	if ((fp->f_flag & FREAD) == 0)
184 		return (EBADF);
185 
186 	FREF(fp);
187 
188 	/* dofilereadv() will FRELE the descriptor for us */
189 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
190 	    &fp->f_offset, retval));
191 }
192 
/*
 * Common code for readv(2): copy in the iovec array, validate it, and
 * perform the scatter read through fp's fo_read routine.
 *
 * The caller has already FREF'd fp; this function FRELEs it on every
 * exit path.  Small iovec arrays live on the stack (aiov); larger ones
 * (up to IOV_MAX) are malloc'd and freed at "done".
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;	/* non-NULL iff iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 or negative (cast makes negatives huge) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Sum the segment lengths, rejecting any overflow past SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	/*
	 * A partial transfer interrupted by a signal or nonblocking
	 * condition is reported as success with the partial count.
	 */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}
283 
284 /*
285  * Write system call
286  */
287 int
288 sys_write(p, v, retval)
289 	struct proc *p;
290 	void *v;
291 	register_t *retval;
292 {
293 	struct sys_write_args /* {
294 		syscallarg(int) fd;
295 		syscallarg(const void *) buf;
296 		syscallarg(size_t) nbyte;
297 	} */ *uap = v;
298 	int fd = SCARG(uap, fd);
299 	struct file *fp;
300 	struct filedesc *fdp = p->p_fd;
301 
302 	if ((fp = fd_getfile(fdp, fd)) == NULL)
303 		return (EBADF);
304 	if ((fp->f_flag & FWRITE) == 0)
305 		return (EBADF);
306 
307 	FREF(fp);
308 
309 	/* dofilewrite() will FRELE the descriptor for us */
310 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
311 	    &fp->f_offset, retval));
312 }
313 
/*
 * Common code for write(2): perform a single-buffer write through fp's
 * fo_write routine.
 *
 * The caller has already FREF'd fp; this function FRELEs it on every
 * exit path.  Delivers SIGPIPE on EPIPE, matching historic write
 * semantics.  On success *retval holds the number of bytes written.
 */
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Describe the user buffer with a one-element uio. */
	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/*
		 * A partial transfer interrupted by a signal or
		 * nonblocking condition counts as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: signal the writer as well as erroring. */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
376 
377 /*
378  * Gather write system call
379  */
380 int
381 sys_writev(p, v, retval)
382 	struct proc *p;
383 	void *v;
384 	register_t *retval;
385 {
386 	struct sys_writev_args /* {
387 		syscallarg(int) fd;
388 		syscallarg(const struct iovec *) iovp;
389 		syscallarg(int) iovcnt;
390 	} */ *uap = v;
391 	int fd = SCARG(uap, fd);
392 	struct file *fp;
393 	struct filedesc *fdp = p->p_fd;
394 
395 	if ((fp = fd_getfile(fdp, fd)) == NULL)
396 		return (EBADF);
397 	if ((fp->f_flag & FWRITE) == 0)
398 		return (EBADF);
399 
400 	FREF(fp);
401 
402 	/* dofilewritev() will FRELE the descriptor for us */
403 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
404 	    &fp->f_offset, retval));
405 }
406 
/*
 * Common code for writev(2): copy in the iovec array, validate it, and
 * perform the gather write through fp's fo_write routine.
 *
 * The caller has already FREF'd fp; this function FRELEs it on every
 * exit path.  Small iovec arrays live on the stack (aiov); larger ones
 * (up to IOV_MAX) are malloc'd and freed at "done".  Delivers SIGPIPE
 * on EPIPE, matching historic write semantics.
 */
int
dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;	/* non-NULL iff iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 or negative (cast makes negatives huge) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Sum the segment lengths, rejecting any overflow past SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/*
		 * A partial transfer interrupted by a signal or
		 * nonblocking condition counts as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: signal the writer as well as erroring. */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}
500 
501 /*
502  * Ioctl system call
503  */
504 /* ARGSUSED */
/*
 * ioctl(2): decode the command word, marshal the argument between user
 * and kernel space according to the IOC_IN/IOC_OUT/IOC_VOID direction
 * bits, and dispatch to the file's fo_ioctl routine.  A handful of
 * generic commands (close-on-exec, nonblocking, async, pgrp ownership)
 * are handled here rather than in the per-type ioctl routine.
 */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;	/* memp non-NULL iff arg buffer malloc'd */
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];	/* in-line buffer for small args */

	fdp = p->p_fd;
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/*
	 * FIONCLEX/FIOCLEX only touch the descriptor-table flags, so
	 * they are handled before any argument copying or FREF.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	FREF(fp);
	memp = NULL;
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				goto out;
			}
		} else
			/* Zero-size IOC_IN: the "pointer" is the value. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag's FNONBLOCK in sync with the object. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Keep f_flag's FASYNC in sync with the object. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets store the pgid (and signal creds) directly. */
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		/*
		 * Non-sockets take a process group via TIOCSPGRP:
		 * negative means a pgid, positive a pid that is
		 * translated to its process group.
		 */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		/* Non-sockets report a pgrp; negate to FIOGETOWN convention. */
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
out:
	FRELE(fp);
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}
642 
643 int	selwait, nselcoll;
644 
645 /*
646  * Select system call.
647  */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	/* Three input sets followed by three output sets. */
	fd_set bits[6], *pibits[3], *pobits[3];
	struct timeval atv;
	int s, ncoll, error = 0, timo;
	u_int nd, ni;

	nd = SCARG(uap, nd);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	/* ni: bytes needed per fd_set for nd descriptors. */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (nd > FD_SETSIZE) {
		/*
		 * Sets too large for the stack: allocate one zeroed
		 * chunk and carve all six sets out of it.
		 */
		caddr_t mbits;

		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
		bzero(mbits, ni * 6);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		bzero((caddr_t)bits, sizeof(bits));
		pibits[0] = &bits[0];
		pibits[1] = &bits[1];
		pibits[2] = &bits[2];
		pobits[0] = &bits[3];
		pobits[1] = &bits[4];
		pobits[2] = &bits[5];
	}

	/* Copy in each user-supplied set; a NULL pointer means "empty". */
#define	getbits(name, x) \
	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
	    (caddr_t)pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the relative timeout to an absolute deadline. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until a wakeup */
retry:
	/*
	 * Snapshot the collision count and mark ourselves selecting
	 * before scanning, so a wakeup during the scan is detectable.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, pibits[0], pobits[0], nd, retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	/* A cleared P_SELECT or bumped nselcoll means we raced: rescan. */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout expired: report zero ready */
#define	putbits(name, x) \
	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
	    (caddr_t)SCARG(uap, name), ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	/* Free the malloc'd sets (input pointer covers the whole chunk). */
	if (pibits[0] != &bits[0])
		free(pibits[0], M_TEMP);
	return (error);
}
761 
/*
 * Scan the three descriptor sets (read/write/except) and record which
 * descriptors are ready in the corresponding output sets.  *retval is
 * set to the number of ready descriptors; EBADF is returned if a set
 * bit names a closed descriptor.
 */
int
selscan(p, ibits, obits, nfd, retval)
	struct proc *p;
	fd_set *ibits, *obits;
	int nfd;
	register_t *retval;
{
	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
	register struct filedesc *fdp = p->p_fd;
	register int msk, i, j, fd;
	register fd_mask bits;
	struct file *fp;
	int ni, n = 0;
	/* fo_select argument for read / write / exceptional checks. */
	static int flag[3] = { FREAD, FWRITE, 0 };

	/*
	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
	 * up to the next byte) otherwise the fd_set's are normal sized.
	 */
	ni = sizeof(fd_set);
	if (nfd > FD_SETSIZE)
		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);

	for (msk = 0; msk < 3; msk++) {
		fd_set *pibits = (fd_set *)&cibits[msk*ni];
		fd_set *pobits = (fd_set *)&cobits[msk*ni];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits->fds_bits[i/NFDBITS];
			/* Walk the set bits of this word, lowest first. */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				/*
				 * NOTE(review): "1 << j" shifts into the
				 * sign bit when j == NFDBITS-1; consider
				 * ((fd_mask)1 << j) — confirm fd_mask width.
				 */
				bits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FREF(fp);
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					FD_SET(fd, pobits);
					n++;
				}
				FRELE(fp);
			}
		}
	}
	*retval = n;
	return (0);
}
807 
808 /*ARGSUSED*/
809 int
810 seltrue(dev, flag, p)
811 	dev_t dev;
812 	int flag;
813 	struct proc *p;
814 {
815 
816 	return (1);
817 }
818 
819 /*
820  * Record a select request.
821  */
822 void
823 selrecord(selector, sip)
824 	struct proc *selector;
825 	struct selinfo *sip;
826 {
827 	struct proc *p;
828 	pid_t mypid;
829 
830 	mypid = selector->p_pid;
831 	if (sip->si_selpid == mypid)
832 		return;
833 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
834 	    p->p_wchan == (caddr_t)&selwait)
835 		sip->si_flags |= SI_COLL;
836 	else
837 		sip->si_selpid = mypid;
838 }
839 
840 /*
841  * Do a wakeup when a selectable event occurs.
842  */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	/* Nobody recorded: nothing to wake. */
	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/*
		 * Multiple selectors collided on this object: bump the
		 * generation counter and wake every select/poll sleeper.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			/* Asleep on the select channel: wake it directly. */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			/* Still scanning: clearing P_SELECT forces a rescan. */
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}
871 
/*
 * Scan an array of pollfd's once, filling in revents and counting the
 * descriptors with at least one event (POLLNVAL entries count too).
 * Caller is expected to have zeroed revents beforehand (sys_poll does).
 */
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	int nfd;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int msk, i;
	struct file *fp;
	int x, n = 0;
	/* fo_select argument / poll bits for read, write, exceptional. */
	static int flag[3] = { FREAD, FWRITE, 0 };
	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };

	/*
	 * XXX: We need to implement the rest of the flags.
	 */
	for (i = 0; i < nfd; i++) {
		/* Check the file descriptor. */
		if (pl[i].fd < 0) {
			/* Negative fd: ignored by convention. */
			pl[i].revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl[i].fd)) == NULL) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}
		FREF(fp);
		for (x = msk = 0; msk < 3; msk++) {
			/*
			 * NOTE(review): POLLERR is only checked when
			 * requested in events; POSIX says it is always
			 * reported — confirm intended behavior.
			 */
			if (pl[i].events & pflag[msk]) {
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					pl[i].revents |= pflag[msk] &
					    pl[i].events;
					x++;
				}
			}
		}
		FRELE(fp);
		if (x)
			n++;
	}
	*retval = n;
}
916 
917 /*
918  * We are using the same mechanism as select only we encode/decode args
919  * differently.
920  */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args *uap = v;
	size_t sz;
	/* Stack array handles the common small-nfds case without malloc. */
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);	/* milliseconds; -1 = infinite */
	struct timeval atv;
	int timo, ncoll, i, s, error, error2;
	extern int nselcoll, selwait;
	u_int nfds = SCARG(uap, nfds);

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * nfds;

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	/* Clear revents so pollscan only has to OR bits in. */
	for (i = 0; i < nfds; i++)
		pl[i].revents = 0;

	if (msec != -1) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until a wakeup */

retry:
	/*
	 * Snapshot the collision count and mark ourselves selecting
	 * before scanning, so a wakeup during the scan is detectable.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (msec != -1) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	/* A cleared P_SELECT or bumped nselcoll means we raced: rescan. */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout expired: report zero ready */
	/* Hand the revents back even on EINTR, as poll semantics require. */
	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
		error = error2;
bad:
	if (pl != pfds)
		free((char *) pl, M_TEMP);
	return (error);
}
1002 
1003