xref: /openbsd-src/sys/kern/sys_generic.c (revision b725ae7711052a2233e31a66fefb8a752c388d7a)
1 /*	$OpenBSD: sys_generic.c,v 1.47 2003/12/10 23:10:08 millert Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 
59 #include <sys/mount.h>
60 #include <sys/syscallargs.h>
61 
62 #include <uvm/uvm_extern.h>
63 
64 int selscan(struct proc *, fd_set *, fd_set *, int, register_t *);
65 int seltrue(dev_t, int, struct proc *);
66 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
67 
68 /*
69  * Read system call.
70  */
71 /* ARGSUSED */
72 int
73 sys_read(p, v, retval)
74 	struct proc *p;
75 	void *v;
76 	register_t *retval;
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int) fd;
80 		syscallarg(void *) buf;
81 		syscallarg(size_t) nbyte;
82 	} */ *uap = v;
83 	int fd = SCARG(uap, fd);
84 	struct file *fp;
85 	struct filedesc *fdp = p->p_fd;
86 
87 	if ((fp = fd_getfile(fdp, fd)) == NULL)
88 		return (EBADF);
89 	if ((fp->f_flag & FREAD) == 0)
90 		return (EBADF);
91 
92 	FREF(fp);
93 
94 	/* dofileread() will FRELE the descriptor for us */
95 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
96 	    &fp->f_offset, retval));
97 }
98 
/*
 * Common code for read(2): build a single-element uio describing the
 * user buffer and pass it to the file's fo_read method, returning the
 * number of bytes transferred through *retval.
 *
 * The caller has already checked FREAD and done FREF(fp); this
 * function FRELEs fp on every return path.
 */
int
dofileread(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/*
		 * A transfer interrupted after moving some data is
		 * reported as a short success rather than an error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
158 
159 /*
160  * Scatter read system call.
161  */
162 int
163 sys_readv(p, v, retval)
164 	struct proc *p;
165 	void *v;
166 	register_t *retval;
167 {
168 	struct sys_readv_args /* {
169 		syscallarg(int) fd;
170 		syscallarg(const struct iovec *) iovp;
171 		syscallarg(int) iovcnt;
172 	} */ *uap = v;
173 	int fd = SCARG(uap, fd);
174 	struct file *fp;
175 	struct filedesc *fdp = p->p_fd;
176 
177 	if ((fp = fd_getfile(fdp, fd)) == NULL)
178 		return (EBADF);
179 	if ((fp->f_flag & FREAD) == 0)
180 		return (EBADF);
181 
182 	FREF(fp);
183 
184 	/* dofilereadv() will FRELE the descriptor for us */
185 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
186 	    &fp->f_offset, retval));
187 }
188 
/*
 * Common code for readv(2): copy in the user's iovec array (on the
 * stack for up to UIO_SMALLIOV entries, malloc'd otherwise), validate
 * the total length, and perform the scattered read via the file's
 * fo_read method.
 *
 * The caller has already checked FREAD and done FREF(fp); this
 * function FRELEs fp on every return path.
 */
int
dofilereadv(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt was zero or negative (wrapped by the cast) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Total the segment lengths, rejecting ssize_t overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		/*
		 * A transfer interrupted after moving some data is
		 * reported as a short success rather than an error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}
279 
280 /*
281  * Write system call
282  */
283 int
284 sys_write(p, v, retval)
285 	struct proc *p;
286 	void *v;
287 	register_t *retval;
288 {
289 	struct sys_write_args /* {
290 		syscallarg(int) fd;
291 		syscallarg(const void *) buf;
292 		syscallarg(size_t) nbyte;
293 	} */ *uap = v;
294 	int fd = SCARG(uap, fd);
295 	struct file *fp;
296 	struct filedesc *fdp = p->p_fd;
297 
298 	if ((fp = fd_getfile(fdp, fd)) == NULL)
299 		return (EBADF);
300 	if ((fp->f_flag & FWRITE) == 0)
301 		return (EBADF);
302 
303 	FREF(fp);
304 
305 	/* dofilewrite() will FRELE the descriptor for us */
306 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
307 	    &fp->f_offset, retval));
308 }
309 
/*
 * Common code for write(2): build a single-element uio describing the
 * user buffer and pass it to the file's fo_write method, returning
 * the number of bytes transferred through *retval.
 *
 * The caller has already checked FWRITE and done FREF(fp); this
 * function FRELEs fp on every return path.
 */
int
dofilewrite(p, fd, fp, buf, nbyte, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const void *buf;
	size_t nbyte;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	aiov.iov_base = (void *)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/*
		 * A transfer interrupted after moving some data is
		 * reported as a short success; a broken pipe also
		 * delivers SIGPIPE to the writer.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FRELE(fp);
	return (error);
}
372 
373 /*
374  * Gather write system call
375  */
376 int
377 sys_writev(p, v, retval)
378 	struct proc *p;
379 	void *v;
380 	register_t *retval;
381 {
382 	struct sys_writev_args /* {
383 		syscallarg(int) fd;
384 		syscallarg(const struct iovec *) iovp;
385 		syscallarg(int) iovcnt;
386 	} */ *uap = v;
387 	int fd = SCARG(uap, fd);
388 	struct file *fp;
389 	struct filedesc *fdp = p->p_fd;
390 
391 	if ((fp = fd_getfile(fdp, fd)) == NULL)
392 		return (EBADF);
393 	if ((fp->f_flag & FWRITE) == 0)
394 		return (EBADF);
395 
396 	FREF(fp);
397 
398 	/* dofilewritev() will FRELE the descriptor for us */
399 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
400 	    &fp->f_offset, retval));
401 }
402 
/*
 * Common code for writev(2): copy in the user's iovec array (on the
 * stack for up to UIO_SMALLIOV entries, malloc'd otherwise), validate
 * the total length, and perform the gathered write via the file's
 * fo_write method.
 *
 * The caller has already checked FWRITE and done FREF(fp); this
 * function FRELEs fp on every return path.
 */
int
dofilewritev(p, fd, fp, iovp, iovcnt, offset, retval)
	struct proc *p;
	int fd;
	struct file *fp;
	const struct iovec *iovp;
	int iovcnt;
	off_t *offset;
	register_t *retval;
{
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
	} else if ((u_int)iovcnt > 0) {
		iov = aiov;
		needfree = NULL;
	} else {
		/* iovcnt was zero or negative (wrapped by the cast) */
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Total the segment lengths, rejecting ssize_t overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		/*
		 * A transfer interrupted after moving some data is
		 * reported as a short success; a broken pipe also
		 * delivers SIGPIPE to the writer.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt,
			    error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FRELE(fp);
	return (error);
}
496 
/*
 * Ioctl system call: validate the descriptor, marshal the argument
 * data described by the command word's size/direction bits, handle
 * the generic FIO* commands here, and forward everything else to the
 * file's fo_ioctl method, copying results back out as requested.
 */
/* ARGSUSED */
int
sys_ioctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/*
	 * FIONCLEX/FIOCLEX only touch descriptor-table flags, so they
	 * are handled here without involving the file itself (and
	 * before FREF, since we return immediately).
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	FREF(fp);
	memp = NULL;
	/* Large argument blocks go to the heap, small ones on the stack. */
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				goto out;
			}
		} else
			/* No in-band data: pass the user pointer itself. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Set or clear non-blocking I/O on the file. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Set or clear async I/O notification. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets track the SIGIO recipient themselves. */
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_cred->p_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		/*
		 * For other files, translate the FIOSETOWN argument
		 * (positive pid, or negated pgid) into the process
		 * group that TIOCSPGRP expects.
		 */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		/* Map the TIOCGPGRP result back to FIOGETOWN conventions. */
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
out:
	FRELE(fp);
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}
638 
/* Sleep channel for select/poll waiters and the collision counter. */
int	selwait, nselcoll;
640 
/*
 * Select system call: poll up to nd descriptors for the requested
 * read/write/exceptional conditions, optionally with a timeout.
 * Six fd_sets are used (three in, three out); they live on the stack
 * unless nd exceeds FD_SETSIZE, in which case one malloc'd buffer is
 * sliced into six equal pieces.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	fd_set bits[6], *pibits[3], *pobits[3];
	struct timeval atv;
	int s, ncoll, error = 0, timo;
	u_int nd, ni;

	nd = SCARG(uap, nd);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (nd > FD_SETSIZE) {
		/* Too big for the on-stack sets: one allocation, six slices. */
		caddr_t mbits;

		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
		bzero(mbits, ni * 6);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		bzero(bits, sizeof(bits));
		pibits[0] = &bits[0];
		pibits[1] = &bits[1];
		pibits[2] = &bits[2];
		pobits[0] = &bits[3];
		pobits[1] = &bits[4];
		pobits[2] = &bits[5];
	}

	/* Copy in each user set that was supplied (NULL means "none"). */
#define	getbits(name, x) \
	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
	    pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the relative timeout to an absolute deadline. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until an event */
retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, pibits[0], pobits[0], nd, retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * Recheck for a collision (or a cleared P_SELECT) at splhigh
	 * before sleeping, so a wakeup that raced with the scan above
	 * is not lost.
	 */
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the result sets back out (timeouts report empty sets). */
#define	putbits(name, x) \
	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
	    SCARG(uap, name), ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	if (pibits[0] != &bits[0])
		free(pibits[0], M_TEMP);
	return (error);
}
756 
757 int
758 selscan(p, ibits, obits, nfd, retval)
759 	struct proc *p;
760 	fd_set *ibits, *obits;
761 	int nfd;
762 	register_t *retval;
763 {
764 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
765 	register struct filedesc *fdp = p->p_fd;
766 	register int msk, i, j, fd;
767 	register fd_mask bits;
768 	struct file *fp;
769 	int ni, n = 0;
770 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
771 
772 	/*
773 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
774 	 * up to the next byte) otherwise the fd_set's are normal sized.
775 	 */
776 	ni = sizeof(fd_set);
777 	if (nfd > FD_SETSIZE)
778 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
779 
780 	for (msk = 0; msk < 3; msk++) {
781 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
782 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
783 
784 		for (i = 0; i < nfd; i += NFDBITS) {
785 			bits = pibits->fds_bits[i/NFDBITS];
786 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
787 				bits &= ~(1 << j);
788 				if ((fp = fd_getfile(fdp, fd)) == NULL)
789 					return (EBADF);
790 				FREF(fp);
791 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
792 					FD_SET(fd, pobits);
793 					n++;
794 				}
795 				FRELE(fp);
796 			}
797 		}
798 	}
799 	*retval = n;
800 	return (0);
801 }
802 
803 /*ARGSUSED*/
804 int
805 seltrue(dev, events, p)
806 	dev_t dev;
807 	int events;
808 	struct proc *p;
809 {
810 
811 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
812 }
813 
814 /*
815  * Record a select request.
816  */
817 void
818 selrecord(selector, sip)
819 	struct proc *selector;
820 	struct selinfo *sip;
821 {
822 	struct proc *p;
823 	pid_t mypid;
824 
825 	mypid = selector->p_pid;
826 	if (sip->si_selpid == mypid)
827 		return;
828 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
829 	    p->p_wchan == (caddr_t)&selwait)
830 		sip->si_flags |= SI_COLL;
831 	else
832 		sip->si_selpid = mypid;
833 }
834 
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	register struct proc *p;
	int s;

	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/*
		 * Multiple processes collided on this selinfo: wake
		 * every select sleeper and let them all rescan.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			/* Asleep in select/poll: make it runnable again. */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			/* Between scan and sleep: force a rescan instead. */
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}
866 
867 void
868 pollscan(p, pl, nfd, retval)
869 	struct proc *p;
870 	struct pollfd *pl;
871 	u_int nfd;
872 	register_t *retval;
873 {
874 	struct filedesc *fdp = p->p_fd;
875 	struct file *fp;
876 	u_int i;
877 	int n = 0;
878 
879 	for (i = 0; i < nfd; i++, pl++) {
880 		/* Check the file descriptor. */
881 		if (pl->fd < 0) {
882 			pl->revents = 0;
883 			continue;
884 		}
885 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
886 			pl->revents = POLLNVAL;
887 			n++;
888 			continue;
889 		}
890 		FREF(fp);
891 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
892 		FRELE(fp);
893 		if (pl->revents != 0)
894 			n++;
895 	}
896 	*retval = n;
897 }
898 
/*
 * Poll system call.  We are using the same sleep/retry mechanism as
 * select, only we encode/decode args differently (an array of pollfd
 * structures instead of fd_set bit vectors).
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	int msec = SCARG(uap, timeout);
	struct timeval atv;
	int timo, ncoll, i, s, error;
	extern int nselcoll, selwait;
	u_int nfds = SCARG(uap, nfds);

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
		return (EINVAL);

	sz = sizeof(struct pollfd) * nfds;

	/* optimize for the default case, of a small nfds value */
	if (sz > sizeof(pfds))
		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);

	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
		goto bad;

	for (i = 0; i < nfds; i++)
		pl[i].revents = 0;

	if (msec != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = msec / 1000;
		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;

		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until an event */

retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (msec != INFTIM) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * Recheck for a collision (or a cleared P_SELECT) at splhigh
	 * before sleeping, so a wakeup that raced with the scan above
	 * is not lost.
	 */
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 *       ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = copyout(pl, SCARG(uap, fds), sz);
		break;
	}
bad:
	if (pl != pfds)
		free(pl, M_TEMP);
	return (error);
}
994