xref: /openbsd-src/sys/kern/sys_generic.c (revision a27b30d8d79dbf3705f2f98bd6b19742bd06ae08)
1 /*	$OpenBSD: sys_generic.c,v 1.28 2000/11/10 18:15:47 art Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
67 int seltrue __P((dev_t, int, struct proc *));
68 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(p, v, retval)
76 	struct proc *p;
77 	void *v;
78 	register_t *retval;
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	int fd = SCARG(uap, fd);
86 	struct file *fp;
87 	struct filedesc *fdp = p->p_fd;
88 
89 	if ((u_int)fd >= fdp->fd_nfiles ||
90 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
91 #if notyet
92 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
93 #endif
94 	    (fp->f_flag & FREAD) == 0)
95 		return (EBADF);
96 
97 #if notyet
98 	FILE_USE(fp);
99 #endif
100 	/* dofileread() will unuse the descriptor for us */
101 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
102 	    &fp->f_offset, retval));
103 }
104 
/*
 * Common code for read(2): perform a single-segment read through the
 * file's fo_read op, with optional ktrace logging of the transfer.
 * On success *retval holds the number of bytes read; the return value
 * is an errno (0 on success).
 */
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/*
		 * If the transfer was interrupted after some data moved,
		 * report the partial count rather than the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}
166 
167 /*
168  * Scatter read system call.
169  */
170 int
171 sys_readv(p, v, retval)
172 	struct proc *p;
173 	void *v;
174 	register_t *retval;
175 {
176 	struct sys_readv_args /* {
177 		syscallarg(int) fd;
178 		syscallarg(const struct iovec *) iovp;
179 		syscallarg(int) iovcnt;
180 	} */ *uap = v;
181 	int fd = SCARG(uap, fd);
182 	struct file *fp;
183 	struct filedesc *fdp = p->p_fd;
184 
185 	if ((u_int)fd >= fdp->fd_nfiles ||
186 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
187 #if notyet
188 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
189 #endif
190 	    (fp->f_flag & FREAD) == 0)
191 		return (EBADF);
192 
193 #if notyet
194 	FILE_USE(fp);
195 #endif
196 	/* dofilereadv() will unuse the descriptor for us */
197 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
198 	    &fp->f_offset, retval));
199 }
200 
/*
 * Common code for readv(2): copy in and validate the user's iovec
 * array, then perform the scatter read through the file's fo_read op.
 * On success *retval holds the total number of bytes read; the return
 * value is an errno (0 on success).
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;			/* non-NULL if iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];	/* avoids malloc for small counts */
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt == 0 (negative counts appear huge as u_int above) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* total the segment lengths, checking for ssize_t overflow */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/* report a partial transfer instead of an interruption */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}
293 
294 /*
295  * Write system call
296  */
297 int
298 sys_write(p, v, retval)
299 	struct proc *p;
300 	void *v;
301 	register_t *retval;
302 {
303 	struct sys_write_args /* {
304 		syscallarg(int) fd;
305 		syscallarg(const void *) buf;
306 		syscallarg(size_t) nbyte;
307 	} */ *uap = v;
308 	int fd = SCARG(uap, fd);
309 	struct file *fp;
310 	struct filedesc *fdp = p->p_fd;
311 
312 	if ((u_int)fd >= fdp->fd_nfiles ||
313 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
314 #if notyet
315 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
316 #endif
317 	    (fp->f_flag & FWRITE) == 0)
318 		return (EBADF);
319 
320 #if notyet
321 	FILE_USE(fp);
322 #endif
323 	/* dofilewrite() will unuse the descriptor for us */
324 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
325 	    &fp->f_offset, retval));
326 }
327 
/*
 * Common code for write(2): perform a single-segment write through the
 * file's fo_write op, delivering SIGPIPE on EPIPE, with optional ktrace
 * logging.  On success *retval holds the number of bytes written; the
 * return value is an errno (0 on success).
 */
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/* report a partial transfer instead of an interruption */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* a write on a broken pipe also signals the writer */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
#if notyet
	FILE_UNUSE(fp, p);
#endif
	return (error);
}
392 
393 /*
394  * Gather write system call
395  */
396 int
397 sys_writev(p, v, retval)
398 	struct proc *p;
399 	void *v;
400 	register_t *retval;
401 {
402 	struct sys_writev_args /* {
403 		syscallarg(int) fd;
404 		syscallarg(const struct iovec *) iovp;
405 		syscallarg(int) iovcnt;
406 	} */ *uap = v;
407 	int fd = SCARG(uap, fd);
408 	struct file *fp;
409 	struct filedesc *fdp = p->p_fd;
410 
411 	if ((u_int)fd >= fdp->fd_nfiles ||
412 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
413 #if notyet
414 	    (fp->f_iflags & FIF_WANTCLOSE) != 0 ||
415 #endif
416 	    (fp->f_flag & FWRITE) == 0)
417 		return (EBADF);
418 
419 #if notyet
420 	FILE_USE(fp);
421 #endif
422 	/* dofilewritev() will unuse the descriptor for us */
423 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
424 	    &fp->f_offset, retval));
425 }
426 
427 int
428 dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
429 	struct proc *p;
430 	int fd;
431 	struct file *fp;
432 	const struct iovec *iovp;
433 	int iovcnt;
434 	off_t *offset;
435 	register_t *retval;
436 {
437 	struct uio auio;
438 	struct iovec *iov;
439 	struct iovec *needfree;
440 	struct iovec aiov[UIO_SMALLIOV];
441 	long i, cnt, error = 0;
442 	u_int iovlen;
443 #ifdef KTRACE
444 	struct iovec *ktriov = NULL;
445 #endif
446 
447 	/* note: can't use iovlen until iovcnt is validated */
448 	iovlen = iovcnt * sizeof(struct iovec);
449 	if ((u_int)iovcnt > UIO_SMALLIOV) {
450 		if ((u_int)iovcnt > IOV_MAX)
451 			return (EINVAL);
452 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
453 	} else if ((u_int)iovcnt > 0) {
454 		iov = aiov;
455 		needfree = NULL;
456 	} else {
457 		error = EINVAL;
458 		goto out;
459 	}
460 
461 	auio.uio_iov = iov;
462 	auio.uio_iovcnt = iovcnt;
463 	auio.uio_rw = UIO_WRITE;
464 	auio.uio_segflg = UIO_USERSPACE;
465 	auio.uio_procp = p;
466 	error = copyin(iovp, iov, iovlen);
467 	if (error)
468 		goto done;
469 	auio.uio_resid = 0;
470 	for (i = 0; i < iovcnt; i++) {
471 		auio.uio_resid += iov->iov_len;
472 		/*
473 		 * Writes return ssize_t because -1 is returned on error.
474 		 * Therefore we must restrict the length to SSIZE_MAX to
475 		 * avoid garbage return values.
476 		 */
477 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
478 			error = EINVAL;
479 			goto done;
480 		}
481 		iov++;
482 	}
483 #ifdef KTRACE
484 	/*
485 	 * if tracing, save a copy of iovec
486 	 */
487 	if (KTRPOINT(p, KTR_GENIO))  {
488 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
489 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
490 	}
491 #endif
492 	cnt = auio.uio_resid;
493 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
494 	if (error) {
495 		if (auio.uio_resid != cnt && (error == ERESTART ||
496 		    error == EINTR || error == EWOULDBLOCK))
497 			error = 0;
498 		if (error == EPIPE)
499 			psignal(p, SIGPIPE);
500 	}
501 	cnt -= auio.uio_resid;
502 #ifdef KTRACE
503 	if (ktriov != NULL) {
504 		if (error == 0)
505 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
506 			    error);
507 		free(ktriov, M_TEMP);
508 	}
509 #endif
510 	*retval = cnt;
511  done:
512 	if (needfree)
513 		free(needfree, M_IOV);
514  out:
515 #if notyet
516 	FILE_UNUSE(fp, p);
517 #endif
518 	return (error);
519 }
520 
/*
 * Ioctl system call
 *
 * Decodes the argument size and direction bits encoded in the command
 * word, stages the argument in a kernel buffer (on the stack when it
 * fits, otherwise malloc'd), handles the generic FIO* commands inline,
 * and forwards everything else to the file's fo_ioctl op.
 */
/* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	register struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	register int error;
	register u_int size;
	caddr_t data, memp;
	int tmp;
	/* arguments up to this size are staged in the on-stack buffer */
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	/* the descriptor must be open and have some access mode */
	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/* close-on-exec lives in the descriptor table, not the file */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	memp = NULL;
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				return (error);
			}
		} else
			/* zero size: the user "pointer" is the argument */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* mirror the mode in f_flag, then notify the object */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* mirror the mode in f_flag, then notify the object */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* sockets store the pgid and signal creds directly */
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		/*
		 * Otherwise translate FIOSETOWN's convention (pid > 0,
		 * negated pgid <= 0) into the pgrp id TIOCSPGRP wants.
		 */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		/* FIOGETOWN reports a tty pgrp as a negated value */
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}
662 
/*
 * Global select state: every selecting process sleeps on &selwait;
 * nselcoll counts select collisions (two processes selecting on the
 * same object), which force in-progress selects to rescan.
 */
int	selwait, nselcoll;
664 
665 /*
666  * Select system call.
667  */
668 int
669 sys_select(p, v, retval)
670 	register struct proc *p;
671 	void *v;
672 	register_t *retval;
673 {
674 	register struct sys_select_args /* {
675 		syscallarg(int) nd;
676 		syscallarg(fd_set *) in;
677 		syscallarg(fd_set *) ou;
678 		syscallarg(fd_set *) ex;
679 		syscallarg(struct timeval *) tv;
680 	} */ *uap = v;
681 	fd_set bits[6], *pibits[3], *pobits[3];
682 	struct timeval atv;
683 	int s, ncoll, error = 0, timo;
684 	u_int ni;
685 
686 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
687 		/* forgiving; slightly wrong */
688 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
689 	}
690 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
691 	if (SCARG(uap, nd) > FD_SETSIZE) {
692 		caddr_t mbits;
693 
694 		if ((mbits = malloc(ni * 6, M_TEMP, M_WAITOK)) == NULL) {
695 			error = EINVAL;
696 			goto cleanup;
697 		}
698 		bzero(mbits, ni * 6);
699 		pibits[0] = (fd_set *)&mbits[ni * 0];
700 		pibits[1] = (fd_set *)&mbits[ni * 1];
701 		pibits[2] = (fd_set *)&mbits[ni * 2];
702 		pobits[0] = (fd_set *)&mbits[ni * 3];
703 		pobits[1] = (fd_set *)&mbits[ni * 4];
704 		pobits[2] = (fd_set *)&mbits[ni * 5];
705 	} else {
706 		bzero((caddr_t)bits, sizeof(bits));
707 		pibits[0] = &bits[0];
708 		pibits[1] = &bits[1];
709 		pibits[2] = &bits[2];
710 		pobits[0] = &bits[3];
711 		pobits[1] = &bits[4];
712 		pobits[2] = &bits[5];
713 	}
714 
715 #define	getbits(name, x) \
716 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
717 	    (caddr_t)pibits[x], ni))) \
718 		goto done;
719 	getbits(in, 0);
720 	getbits(ou, 1);
721 	getbits(ex, 2);
722 #undef	getbits
723 
724 	if (SCARG(uap, tv)) {
725 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
726 			sizeof (atv));
727 		if (error)
728 			goto done;
729 		if (itimerfix(&atv)) {
730 			error = EINVAL;
731 			goto done;
732 		}
733 		s = splclock();
734 		timeradd(&atv, &time, &atv);
735 		splx(s);
736 	} else
737 		timo = 0;
738 retry:
739 	ncoll = nselcoll;
740 	p->p_flag |= P_SELECT;
741 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
742 	if (error || *retval)
743 		goto done;
744 	if (SCARG(uap, tv)) {
745 		/*
746 		 * We have to recalculate the timeout on every retry.
747 		 */
748 		timo = hzto(&atv);
749 		if (timo <= 0)
750 			goto done;
751 	}
752 	s = splhigh();
753 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
754 		splx(s);
755 		goto retry;
756 	}
757 	p->p_flag &= ~P_SELECT;
758 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
759 	splx(s);
760 	if (error == 0)
761 		goto retry;
762 done:
763 	p->p_flag &= ~P_SELECT;
764 	/* select is not restarted after signals... */
765 	if (error == ERESTART)
766 		error = EINTR;
767 	if (error == EWOULDBLOCK)
768 		error = 0;
769 #define	putbits(name, x) \
770 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
771 	    (caddr_t)SCARG(uap, name), ni))) \
772 		error = error2;
773 	if (error == 0) {
774 		int error2;
775 
776 		putbits(in, 0);
777 		putbits(ou, 1);
778 		putbits(ex, 2);
779 #undef putbits
780 	}
781 
782 cleanup:
783 	if (pibits[0] != &bits[0])
784 		free(pibits[0], M_TEMP);
785 	return (error);
786 }
787 
788 int
789 selscan(p, ibits, obits, nfd, retval)
790 	struct proc *p;
791 	fd_set *ibits, *obits;
792 	int nfd;
793 	register_t *retval;
794 {
795 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
796 	register struct filedesc *fdp = p->p_fd;
797 	register int msk, i, j, fd;
798 	register fd_mask bits;
799 	struct file *fp;
800 	int ni, n = 0;
801 	static int flag[3] = { FREAD, FWRITE, 0 };
802 
803 	/*
804 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
805 	 * up to the next byte) otherwise the fd_set's are normal sized.
806 	 */
807 	ni = sizeof(fd_set);
808 	if (nfd > FD_SETSIZE)
809 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
810 
811 	for (msk = 0; msk < 3; msk++) {
812 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
813 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
814 
815 		for (i = 0; i < nfd; i += NFDBITS) {
816 			bits = pibits->fds_bits[i/NFDBITS];
817 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
818 				bits &= ~(1 << j);
819 				fp = fdp->fd_ofiles[fd];
820 				if (fp == NULL)
821 					return (EBADF);
822 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
823 					FD_SET(fd, pobits);
824 					n++;
825 				}
826 			}
827 		}
828 	}
829 	*retval = n;
830 	return (0);
831 }
832 
833 /*ARGSUSED*/
834 int
835 seltrue(dev, flag, p)
836 	dev_t dev;
837 	int flag;
838 	struct proc *p;
839 {
840 
841 	return (1);
842 }
843 
/*
 * Record a select request.
 *
 * A selinfo remembers at most one selecting process (si_selpid).  If a
 * second process selects on the same object while the recorded one is
 * still asleep on selwait, only SI_COLL is set; selwakeup() then wakes
 * everybody sleeping on selwait so all selectors rescan.
 */
void
selrecord(selector, sip)
	struct proc *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->p_pid;
	if (sip->si_selpid == mypid)
		return;		/* already recorded for this process */
	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;	/* collision: another live selector */
	else
		sip->si_selpid = mypid;
}
864 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Wakes the single recorded selector directly; on a collision
 * (SI_COLL) every process sleeping on selwait is woken and nselcoll
 * is bumped so in-progress selects know to rescan.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	if (sip->si_selpid == 0)
		return;		/* nobody is selecting on this object */
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();	/* keep interrupts out while poking p */
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;	/* force its select to rescan */
		splx(s);
	}
}
896 
/*
 * Poll every entry in the pollfd array once, using the same fo_select
 * ops as select().  Only POLLIN/POLLRDNORM, POLLOUT and POLLERR are
 * checked; *retval gets the number of array entries with a non-zero
 * revents (including POLLNVAL entries).
 */
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	int nfd;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int msk, i;
	struct file *fp;
	int x, n = 0;
	/* select-style access flags and the poll events they answer */
	static int flag[3] = { FREAD, FWRITE, 0 };
	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };

	/*
	 * XXX: We need to implement the rest of the flags.
	 */
	for (i = 0; i < nfd; i++) {
		/* Check the file descriptor. */
		if (pl[i].fd < 0)
			continue;	/* negative fd: entry ignored, per poll(2) */
		if (pl[i].fd >= fdp->fd_nfiles) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}

		fp = fdp->fd_ofiles[pl[i].fd];
		if (fp == NULL) {
			pl[i].revents = POLLNVAL;
			n++;
			continue;
		}
		for (x = msk = 0; msk < 3; msk++) {
			if (pl[i].events & pflag[msk]) {
				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
					pl[i].revents |= pflag[msk] &
					    pl[i].events;
					x++;
				}
			}
		}
		if (x)
			n++;	/* count ready entries, not individual events */
	}
	*retval = n;
}
944 
/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 *
 * Copies in the pollfd array, scans it via pollscan(), and sleeps on
 * selwait -- rescanning after wakeups or collisions -- until an entry
 * is ready, the timeout expires, or a signal arrives.  The (possibly
 * updated) array is copied back out on every exit path after copyin.
 */
int
sys_poll(p, v, retval)
	register struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_poll_args *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);	/* milliseconds; -1 = no timeout */
	struct timeval atv;
	int timo, ncoll, i, s, error, error2;
	extern int nselcoll, selwait;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
	    maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * SCARG(uap, nfds);

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	for (i = 0; i < SCARG(uap, nfds); i++)
		pl[i].revents = 0;

	if (msec != -1) {
		/* convert the relative timeout into an absolute deadline */
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: tsleep() forever */

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, SCARG(uap, nfds), retval);
	if (*retval)
		goto done;	/* error is still 0 here (set by copyin above) */
	if (msec != -1) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;	/* deadline already passed */
	}
	s = splhigh();
	/* a collision or a lost P_SELECT flag means state changed; rescan */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;	/* woken by selwakeup(); look again */

done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* a timeout is not an error */
	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
		error = error2;
bad:
	if (pl != pfds)
		free((char *) pl, M_TEMP);
	return (error);
}
1033 
1034