xref: /openbsd-src/sys/kern/sys_generic.c (revision d424e204e60bfd7ffd01a57d3983f326a20fef28)
1 /*	$OpenBSD: sys_generic.c,v 1.29 2001/05/16 12:52:58 ho Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
67 int seltrue __P((dev_t, int, struct proc *));
68 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(p, v, retval)
76 	struct proc *p;
77 	void *v;
78 	register_t *retval;
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	int fd = SCARG(uap, fd);
86 	struct file *fp;
87 	struct filedesc *fdp = p->p_fd;
88 
89 	if ((u_int)fd >= fdp->fd_nfiles ||
90 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
91 #if notyet
92 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
93 #endif
94 	    (fp->f_flag & FREAD) == 0)
95 		return (EBADF);
96 
97 #if notyet
98 	FILE_USE(fp);
99 #endif
100 	/* dofileread() will unuse the descriptor for us */
101 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
102 	    &fp->f_offset, retval));
103 }
104 
105 int
106 dofileread(p, fd, fp, buf, nbyte, offset, retval)
107 	struct proc *p;
108 	int fd;
109 	struct file *fp;
110 	void *buf;
111 	size_t nbyte;
112 	off_t *offset;
113 	register_t *retval;
114 {
115 	struct uio auio;
116 	struct iovec aiov;
117 	long cnt, error = 0;
118 #ifdef KTRACE
119 	struct iovec ktriov;
120 #endif
121 
122 	aiov.iov_base = (caddr_t)buf;
123 	aiov.iov_len = nbyte;
124 	auio.uio_iov = &aiov;
125 	auio.uio_iovcnt = 1;
126 	auio.uio_resid = nbyte;
127 	auio.uio_rw = UIO_READ;
128 	auio.uio_segflg = UIO_USERSPACE;
129 	auio.uio_procp = p;
130 
131 	/*
132 	 * Reads return ssize_t because -1 is returned on error.  Therefore
133 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
134 	 * values.
135 	 */
136 	if (auio.uio_resid > SSIZE_MAX) {
137 		error = EINVAL;
138 		goto out;
139 	}
140 
141 #ifdef KTRACE
142 	/*
143 	 * if tracing, save a copy of iovec
144 	 */
145 	if (KTRPOINT(p, KTR_GENIO))
146 		ktriov = aiov;
147 #endif
148 	cnt = auio.uio_resid;
149 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
150 	if (error)
151 		if (auio.uio_resid != cnt && (error == ERESTART ||
152 		    error == EINTR || error == EWOULDBLOCK))
153 			error = 0;
154 	cnt -= auio.uio_resid;
155 #ifdef KTRACE
156 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
157 		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
158 #endif
159 	*retval = cnt;
160  out:
161 #if notyet
162 	FILE_UNUSE(fp, p);
163 #endif
164 	return (error);
165 }
166 
167 /*
168  * Scatter read system call.
169  */
170 int
171 sys_readv(p, v, retval)
172 	struct proc *p;
173 	void *v;
174 	register_t *retval;
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int) fd;
178 		syscallarg(const struct iovec *) iovp;
179 		syscallarg(int) iovcnt;
180 	} */ *uap = v;
181 	int fd = SCARG(uap, fd);
182 	struct file *fp;
183 	struct filedesc *fdp = p->p_fd;
184 
185 	if ((u_int)fd >= fdp->fd_nfiles ||
186 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
187 #if notyet
188 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
189 #endif
190 	    (fp->f_flag & FREAD) == 0)
191 		return (EBADF);
192 
193 #if notyet
194 	FILE_USE(fp);
195 #endif
196 	/* dofilereadv() will unuse the descriptor for us */
197 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
198 	    &fp->f_offset, retval));
199 }
200 
/*
 * Common code for readv(): copy in and validate the iovec array,
 * perform the scattered read, and convert a transfer cut short by
 * a signal into success.
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;		/* open file; caller checked FREAD */
	const struct iovec *iovp;	/* user-space iovec array */
	int iovcnt;
	off_t *offset;			/* offset to read at and advance */
	register_t *retval;		/* out: bytes transferred */
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* heap copy to free, or NULL */
	struct iovec aiov[UIO_SMALLIOV];	/* stack copy for small requests */
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		/* Small request: use the on-stack array, no allocation. */
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 (negatives are caught by the u_int casts). */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Total the request size while range-checking each segment. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/* A transfer partly done before a signal counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}
293 
294 /*
295  * Write system call
296  */
297 int
298 sys_write(p, v, retval)
299 	struct proc *p;
300 	void *v;
301 	register_t *retval;
302 {
303 	struct sys_write_args /* {
304 		syscallarg(int) fd;
305 		syscallarg(const void *) buf;
306 		syscallarg(size_t) nbyte;
307 	} */ *uap = v;
308 	int fd = SCARG(uap, fd);
309 	struct file *fp;
310 	struct filedesc *fdp = p->p_fd;
311 
312 	if ((u_int)fd >= fdp->fd_nfiles ||
313 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
314 #if notyet
315 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
316 #endif
317 	    (fp->f_flag & FWRITE) == 0)
318 		return (EBADF);
319 
320 #if notyet
321 	FILE_USE(fp);
322 #endif
323 	/* dofilewrite() will unuse the descriptor for us */
324 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
325 	    &fp->f_offset, retval));
326 }
327 
328 int
329 dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
330 	struct proc *p;
331 	int fd;
332 	struct file *fp;
333 	const void *buf;
334 	size_t nbyte;
335 	off_t *offset;
336 	register_t *retval;
337 {
338 	struct uio auio;
339 	struct iovec aiov;
340 	long cnt, error = 0;
341 #ifdef KTRACE
342 	struct iovec ktriov;
343 #endif
344 
345 	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
346 	aiov.iov_len = nbyte;
347 	auio.uio_iov = &aiov;
348 	auio.uio_iovcnt = 1;
349 	auio.uio_resid = nbyte;
350 	auio.uio_rw = UIO_WRITE;
351 	auio.uio_segflg = UIO_USERSPACE;
352 	auio.uio_procp = p;
353 
354 	/*
355 	 * Writes return ssize_t because -1 is returned on error.  Therefore
356 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
357 	 * values.
358 	 */
359 	if (auio.uio_resid > SSIZE_MAX) {
360 		error = EINVAL;
361 		goto out;
362 	}
363 
364 #ifdef KTRACE
365 	/*
366 	 * if tracing, save a copy of iovec
367 	 */
368 	if (KTRPOINT(p, KTR_GENIO))
369 		ktriov = aiov;
370 #endif
371 	cnt = auio.uio_resid;
372 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
373 	if (error) {
374 		if (auio.uio_resid != cnt && (error == ERESTART ||
375 		    error == EINTR || error == EWOULDBLOCK))
376 			error = 0;
377 		if (error == EPIPE)
378 			psignal(p, SIGPIPE);
379 	}
380 	cnt -= auio.uio_resid;
381 #ifdef KTRACE
382 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
383 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
384 #endif
385 	*retval = cnt;
386  out:
387 #if notyet
388 	FILE_UNUSE(fp, p);
389 #endif
390 	return (error);
391 }
392 
393 /*
394  * Gather write system call
395  */
396 int
397 sys_writev(p, v, retval)
398 	struct proc *p;
399 	void *v;
400 	register_t *retval;
401 {
402 	struct sys_writev_args /* {
403 		syscallarg(int) fd;
404 		syscallarg(const struct iovec *) iovp;
405 		syscallarg(int) iovcnt;
406 	} */ *uap = v;
407 	int fd = SCARG(uap, fd);
408 	struct file *fp;
409 	struct filedesc *fdp = p->p_fd;
410 
411 	if ((u_int)fd >= fdp->fd_nfiles ||
412 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
413 #if notyet
414 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
415 #endif
416 	    (fp->f_flag & FWRITE) == 0)
417 		return (EBADF);
418 
419 #if notyet
420 	FILE_USE(fp);
421 #endif
422 	/* dofilewritev() will unuse the descriptor for us */
423 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
424 	    &fp->f_offset, retval));
425 }
426 
427 int
428 dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
429 	struct proc *p;
430 	int fd;
431 	struct file *fp;
432 	const struct iovec *iovp;
433 	int iovcnt;
434 	off_t *offset;
435 	register_t *retval;
436 {
437 	struct uio auio;
438 	struct iovec *iov;
439 	struct iovec *needfree;
440 	struct iovec aiov[UIO_SMALLIOV];
441 	long i, cnt, error = 0;
442 	u_int iovlen;
443 #ifdef KTRACE
444 	struct iovec *ktriov = NULL;
445 #endif
446 
447 	/* note: can't use iovlen until iovcnt is validated */
448 	iovlen = iovcnt * sizeof(struct iovec);
449 	if ((u_int)iovcnt > UIO_SMALLIOV) {
450 		if ((u_int)iovcnt > IOV_MAX)
451 			return (EINVAL);
452 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
453 	} else if ((u_int)iovcnt > 0) {
454 		iov = aiov;
455 		needfree = NULL;
456 	} else {
457 		error = EINVAL;
458 		goto out;
459 	}
460 
461 	auio.uio_iov = iov;
462 	auio.uio_iovcnt = iovcnt;
463 	auio.uio_rw = UIO_WRITE;
464 	auio.uio_segflg = UIO_USERSPACE;
465 	auio.uio_procp = p;
466 	error = copyin(iovp, iov, iovlen);
467 	if (error)
468 		goto done;
469 	auio.uio_resid = 0;
470 	for (i = 0; i < iovcnt; i++) {
471 		auio.uio_resid += iov->iov_len;
472 		/*
473 		 * Writes return ssize_t because -1 is returned on error.
474 		 * Therefore we must restrict the length to SSIZE_MAX to
475 		 * avoid garbage return values.
476 		 */
477 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
478 			error = EINVAL;
479 			goto done;
480 		}
481 		iov++;
482 	}
483 #ifdef KTRACE
484 	/*
485 	 * if tracing, save a copy of iovec
486 	 */
487 	if (KTRPOINT(p, KTR_GENIO))  {
488 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
489 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
490 	}
491 #endif
492 	cnt = auio.uio_resid;
493 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
494 	if (error) {
495 		if (auio.uio_resid != cnt && (error == ERESTART ||
496 		    error == EINTR || error == EWOULDBLOCK))
497 			error = 0;
498 		if (error == EPIPE)
499 			psignal(p, SIGPIPE);
500 	}
501 	cnt -= auio.uio_resid;
502 #ifdef KTRACE
503 	if (ktriov != NULL) {
504 		if (error == 0)
505 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
506 			    error);
507 		free(ktriov, M_TEMP);
508 	}
509 #endif
510 	*retval = cnt;
511  done:
512 	if (needfree)
513 		free(needfree, M_IOV);
514  out:
515 #if notyet
516 	FILE_UNUSE(fp, p);
517 #endif
518 	return (error);
519 }
520 
521 /*
522  * Ioctl system call
523  */
524 /* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	register int error;
	register u_int size;
	caddr_t data, memp;		/* argument buffer; memp set if heap */
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];	/* on-stack buffer for small arguments */

	/* The descriptor must be in range, open, and open for read or write. */
	fdp = p->p_fd;
	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/*
	 * The close-on-exec commands only touch the descriptor table,
	 * so handle them without calling the file's ioctl routine.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	memp = NULL;
	/* Large arguments need a heap buffer; small ones use the stack. */
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				return (error);
			}
		} else
			/* No sized argument: pass the user pointer itself. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror non-blocking mode in f_flag, then tell the object. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Mirror async-I/O mode in f_flag, then tell the object. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets store the pgid and signal credentials. */
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		/*
		 * Other objects get TIOCSPGRP: a non-positive argument
		 * names a process group directly, a positive one is a
		 * pid whose group is used (after checking it exists).
		 */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		/* Translate the TIOCGPGRP result back to FIOGETOWN form. */
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}
662 
/*
 * selwait is the shared tsleep() channel used by every select/poll
 * sleeper; nselcoll counts collisions (see selrecord()/selwakeup()).
 */
int	selwait, nselcoll;
664 
665 /*
666  * Select system call.
667  */
int
sys_select(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	fd_set bits[6], *pibits[3], *pobits[3];
	struct timeval atv;
	int s, ncoll, error = 0, timo;
	u_int ni;			/* bytes per fd_set for nd descriptors */

	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nd) = p->p_fd->fd_nfiles;
	}
	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
	/*
	 * Six sets are used: the in/ou/ex inputs and their result
	 * copies.  When nd exceeds FD_SETSIZE the on-stack fd_sets
	 * are too small, so carve one heap buffer into six regions.
	 */
	if (SCARG(uap, nd) > FD_SETSIZE) {
		caddr_t mbits;

		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
		bzero(mbits, ni * 6);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		bzero((caddr_t)bits, sizeof(bits));
		pibits[0] = &bits[0];
		pibits[1] = &bits[1];
		pibits[2] = &bits[2];
		pobits[0] = &bits[3];
		pobits[1] = &bits[4];
		pobits[2] = &bits[5];
	}

	/* Copy in each supplied input set; a NULL pointer means empty. */
#define	getbits(name, x) \
	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
	    (caddr_t)pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the interval into an absolute expiry time. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout given to tsleep() below */
retry:
	/*
	 * Snapshot the collision counter before scanning; if a
	 * selwakeup() collision happens meanwhile, nselcoll changes
	 * and we rescan instead of sleeping.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the result sets out; report the first copyout failure. */
#define	putbits(name, x) \
	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
	    (caddr_t)SCARG(uap, name), ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	if (pibits[0] != &bits[0])
		free(pibits[0], M_TEMP);
	return (error);
}
783 
784 int
785 selscan(p, ibits, obits, nfd, retval)
786 	struct proc *p;
787 	fd_set *ibits, *obits;
788 	int nfd;
789 	register_t *retval;
790 {
791 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
792 	register struct filedesc *fdp = p->p_fd;
793 	register int msk, i, j, fd;
794 	register fd_mask bits;
795 	struct file *fp;
796 	int ni, n = 0;
797 	static int flag[3] = { FREAD, FWRITE, 0 };
798 
799 	/*
800 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
801 	 * up to the next byte) otherwise the fd_set's are normal sized.
802 	 */
803 	ni = sizeof(fd_set);
804 	if (nfd > FD_SETSIZE)
805 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
806 
807 	for (msk = 0; msk < 3; msk++) {
808 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
809 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
810 
811 		for (i = 0; i < nfd; i += NFDBITS) {
812 			bits = pibits->fds_bits[i/NFDBITS];
813 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
814 				bits &= ~(1 << j);
815 				fp = fdp->fd_ofiles[fd];
816 				if (fp == NULL)
817 					return (EBADF);
818 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
819 					FD_SET(fd, pobits);
820 					n++;
821 				}
822 			}
823 		}
824 	}
825 	*retval = n;
826 	return (0);
827 }
828 
/*
 * Select routine for devices that are always ready for I/O:
 * unconditionally report the descriptor as selectable.
 */
/*ARGSUSED*/
int
seltrue(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{

	return (1);
}
839 
840 /*
841  * Record a select request.
842  */
843 void
844 selrecord(selector, sip)
845 	struct proc *selector;
846 	struct selinfo *sip;
847 {
848 	struct proc *p;
849 	pid_t mypid;
850 
851 	mypid = selector->p_pid;
852 	if (sip->si_selpid == mypid)
853 		return;
854 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
855 	    p->p_wchan == (caddr_t)&selwait)
856 		sip->si_flags |= SI_COLL;
857 	else
858 		sip->si_selpid = mypid;
859 }
860 
861 /*
862  * Do a wakeup when a selectable event occurs.
863  */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	/* Nothing recorded; nobody to wake. */
	if (sip->si_selpid == 0)
		return;
	/*
	 * A collision means several processes selected on this object;
	 * only one pid is recorded, so wake all select sleepers and
	 * bump nselcoll to make in-progress scans retry.
	 */
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			/* Asleep on the select channel: wake it directly. */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			/* Still scanning: clearing P_SELECT forces a rescan. */
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}
892 
893 void
894 pollscan(p, pl, nfd, retval)
895 	struct proc *p;
896 	struct pollfd *pl;
897 	int nfd;
898 	register_t *retval;
899 {
900 	register struct filedesc *fdp = p->p_fd;
901 	register int msk, i;
902 	struct file *fp;
903 	int x, n = 0;
904 	static int flag[3] = { FREAD, FWRITE, 0 };
905 	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };
906 
907 	/*
908 	 * XXX: We need to implement the rest of the flags.
909 	 */
910 	for (i = 0; i < nfd; i++) {
911 		/* Check the file descriptor. */
912 		if (pl[i].fd < 0)
913 			continue;
914 		if (pl[i].fd >= fdp->fd_nfiles) {
915 			pl[i].revents = POLLNVAL;
916 			n++;
917 			continue;
918 		}
919 
920 		fp = fdp->fd_ofiles[pl[i].fd];
921 		if (fp == NULL) {
922 			pl[i].revents = POLLNVAL;
923 			n++;
924 			continue;
925 		}
926 		for (x = msk = 0; msk < 3; msk++) {
927 			if (pl[i].events & pflag[msk]) {
928 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
929 					pl[i].revents |= pflag[msk] &
930 					    pl[i].events;
931 					x++;
932 				}
933 			}
934 		}
935 		if (x)
936 			n++;
937 	}
938 	*retval = n;
939 }
940 
941 /*
942  * We are using the same mechanism as select only we encode/decode args
943  * differently.
944  */
int
sys_poll(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_poll_args *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);	/* milliseconds; -1 = forever */
	struct timeval atv;
	int timo, ncoll, i, s, error, error2;
	extern int nselcoll, selwait;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
	    maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * SCARG(uap, nfds);

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	/* Results accumulate into revents; start from a clean slate. */
	for (i = 0; i < SCARG(uap, nfds); i++)
		pl[i].revents = 0;

	if (msec != -1) {
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the interval into an absolute expiry time. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout given to tsleep() below */

retry:
	/*
	 * Snapshot the collision counter before scanning; if a
	 * selwakeup() collision happens meanwhile, nselcoll changes
	 * and we rescan instead of sleeping.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, SCARG(uap, nfds), retval);
	if (*retval)
		goto done;
	if (msec != -1) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy revents results back out even when returning an error. */
	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
		error = error2;
bad:
	if (pl != pfds)
		free((char *) pl, M_TEMP);
	return (error);
}
1029 
1030