/*	$OpenBSD: sys_generic.c,v 1.49 2004/06/24 19:35:24 tholo Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

int selscan(struct proc *, fd_set *, fd_set *, int, register_t *);
int seltrue(dev_t, int, struct proc *);
void pollscan(struct proc *, struct pollfd *, u_int, register_t *);

/*
 * Read system call.
 */
/* ARGSUSED */
int
sys_read(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FREAD) == 0)
		return (EBADF);

	FREF(fp);

	/* dofileread() will FRELE the descriptor for us */
	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, retval));
}

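/*
 * Common code for read(2): build a single-element uio for the user
 * buffer, perform the I/O at *offset and return the byte count via
 * *retval.  The caller has FREF'd fp; it is FRELE'd here on every path.
 */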
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}

/*
 * Scatter read system call.
 */
int
sys_readv(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FREAD) == 0)
		return (EBADF);

	FREF(fp);

	/* dofilereadv() will FRELE the descriptor for us */
	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, retval));
}

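/*
 * Common code for readv(2): copy in the iovec array, validate the
 * total length against SSIZE_MAX, then perform the I/O at *offset.
 * The caller has FREF'd fp; it is FRELE'd here on every path.
 */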
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FWRITE) == 0)
		return (EBADF);

	FREF(fp);

	/* dofilewrite() will FRELE the descriptor for us */
	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, retval));
}

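/*
 * Common code for write(2): build a single-element uio for the user
 * buffer, perform the I/O at *offset and return the byte count via
 * *retval.  The caller has FREF'd fp; it is FRELE'd here on every path.
 */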
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (void *)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}

/*
 * Gather write system call
 */
int
sys_writev(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);
	if ((fp->f_flag & FWRITE) == 0)
		return (EBADF);

	FREF(fp);

	/* dofilewritev() will FRELE the descriptor for us */
	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, retval));
}

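/*
 * Common code for writev(2): copy in the iovec array, validate the
 * total length, perform the I/O and post SIGPIPE on EPIPE.  The caller
 * has FREF'd fp; it is FRELE'd here on every path.
 */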
int
dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
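	/*
	 * Illustrative example (not from this file): a command defined,
	 * as in the BSD <sys/filio.h>, like
	 *	#define FIONREAD _IOR('f', 127, int)
	 * has IOC_OUT set in its high bits and carries sizeof(int) in its
	 * size field, so IOCPARM_LEN() yields sizeof(int) and that many
	 * bytes are copied back out to the user below.
	 */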
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	FREF(fp);
	memp = NULL;
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
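		/*
		 * Non-socket case follows the F_SETOWN convention: an
		 * argument <= 0 is a negated process group id, a positive
		 * one is a pid whose process group is looked up, since
		 * TIOCSPGRP below expects a process group id.
		 */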
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
out:
	FRELE(fp);
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}

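/*
 * selwait is the wait channel select(2) and poll(2) sleep on.  nselcoll
 * counts select collisions (more than one process selecting on the same
 * descriptor); the retry loops below recheck it so a wakeup that arrives
 * while they are scanning is not lost.
 */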
int	selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	fd_set bits[6], *pibits[3], *pobits[3];
	struct timeval atv, rtv, ttv;
	int s, ncoll, error = 0, timo;
	u_int nd, ni;

	nd = SCARG(uap, nd);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (nd > FD_SETSIZE) {
		caddr_t mbits;

		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
		bzero(mbits, ni * 6);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		bzero(bits, sizeof(bits));
		pibits[0] = &bits[0];
		pibits[1] = &bits[1];
		pibits[2] = &bits[2];
		pobits[0] = &bits[3];
		pobits[1] = &bits[4];
		pobits[2] = &bits[5];
	}

#define	getbits(name, x) \
	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
	    pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timeradd(&atv, &rtv, &atv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, pibits[0], pobits[0], nd, retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		getmicrouptime(&rtv);
		if (timercmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timersub(&ttv, &rtv, &ttv);
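		/*
		 * Convert the remaining time to ticks; the 24 hour clamp
		 * presumably keeps the tick count from overflowing an int.
		 */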
		timo = ttv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
	    SCARG(uap, name), ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	if (pibits[0] != &bits[0])
		free(pibits[0], M_TEMP);
	return (error);
}

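/*
 * Scan the three input fd_sets, polling every descriptor whose bit is
 * set; mark ready descriptors in the matching output set and return
 * the number of ready descriptors via *retval.
 */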
int
selscan(p, ibits, obits, nfd, retval)
	struct proc *p;
	fd_set *ibits, *obits;
	int nfd;
	register_t *retval;
{
	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
	register struct filedesc *fdp = p->p_fd;
	register int msk, i, j, fd;
	register fd_mask bits;
	struct file *fp;
	int ni, n = 0;
	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };

	/*
	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
	 * up to the next byte) otherwise the fd_set's are normal sized.
	 */
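	/*
	 * Worked example, assuming a 32-bit fd_mask (NFDBITS == 32):
	 * nfd == 200 gives ni = howmany(200, 32) * sizeof(fd_mask)
	 * = 7 * 4 = 28 bytes per set.
	 */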
	ni = sizeof(fd_set);
	if (nfd > FD_SETSIZE)
		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);

	for (msk = 0; msk < 3; msk++) {
		fd_set *pibits = (fd_set *)&cibits[msk*ni];
		fd_set *pobits = (fd_set *)&cobits[msk*ni];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits->fds_bits[i/NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FREF(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
					FD_SET(fd, pobits);
					n++;
				}
				FRELE(fp);
			}
		}
	}
	*retval = n;
	return (0);
}

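/*
 * Generic poll routine for devices that are always ready: report
 * whichever of the normal read/write events were requested.
 */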
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct proc *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->p_pid;
	if (sip->si_selpid == mypid)
		return;
	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_selpid = mypid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		SCHED_LOCK(s);
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;
		SCHED_UNLOCK(s);
	}
}

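/*
 * Poll every entry in the pollfd array, storing the result in revents
 * and counting entries that reported an event (POLLNVAL included).
 */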
void
pollscan(p, pl, nfd, retval)
	struct proc *p;
	struct pollfd *pl;
	u_int nfd;
	register_t *retval;
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, pl++) {
		/* Check the file descriptor. */
		if (pl->fd < 0) {
			pl->revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
			pl->revents = POLLNVAL;
			n++;
			continue;
		}
		FREF(fp);
		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
		FRELE(fp);
		if (pl->revents != 0)
			n++;
	}
	*retval = n;
}

/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);
	struct timeval atv, rtv, ttv;
	int timo, ncoll, i, s, error;
	extern int nselcoll, selwait;
	u_int nfds = SCARG(uap, nfds);

	/* Standards say no more than {OPEN_MAX}; this is possibly better. */
	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * nfds;

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	for (i = 0; i < nfds; i++)
		pl[i].revents = 0;

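	/*
	 * Convert the millisecond timeout to a timeval; for example
	 * msec == 1500 yields atv = { 1, 500000 }.
	 */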
	if (msec != INFTIM) {
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timeradd(&atv, &rtv, &atv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (msec != INFTIM) {
		getmicrouptime(&rtv);
		if (timercmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timersub(&ttv, &rtv, &ttv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 *       ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = copyout(pl, SCARG(uap, fds), sz);
		break;
	}
bad:
	if (pl != pfds)
		free(pl, M_TEMP);
	return (error);
}