xref: /openbsd-src/sys/kern/sys_generic.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: sys_generic.c,v 1.60 2009/03/24 13:49:38 kurt Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 int seltrue(dev_t, int, struct proc *);
67 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
68 int pollout(struct pollfd *, struct pollfd *, u_int);
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(struct proc *p, void *v, register_t *retval)
76 {
77 	struct sys_read_args /* {
78 		syscallarg(int) fd;
79 		syscallarg(void *) buf;
80 		syscallarg(size_t) nbyte;
81 	} */ *uap = v;
82 	int fd = SCARG(uap, fd);
83 	struct file *fp;
84 	struct filedesc *fdp = p->p_fd;
85 
86 	if ((fp = fd_getfile(fdp, fd)) == NULL)
87 		return (EBADF);
88 	if ((fp->f_flag & FREAD) == 0)
89 		return (EBADF);
90 
91 	FREF(fp);
92 
93 	/* dofileread() will FRELE the descriptor for us */
94 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
95 	    &fp->f_offset, retval));
96 }
97 
98 int
99 dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
100     off_t *offset, register_t *retval)
101 {
102 	struct uio auio;
103 	struct iovec aiov;
104 	long cnt, error = 0;
105 #ifdef KTRACE
106 	struct iovec ktriov;
107 #endif
108 
109 	aiov.iov_base = buf;
110 	aiov.iov_len = nbyte;
111 	auio.uio_iov = &aiov;
112 	auio.uio_iovcnt = 1;
113 	auio.uio_resid = nbyte;
114 	auio.uio_rw = UIO_READ;
115 	auio.uio_segflg = UIO_USERSPACE;
116 	auio.uio_procp = p;
117 
118 	/*
119 	 * Reads return ssize_t because -1 is returned on error.  Therefore
120 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
121 	 * values.
122 	 */
123 	if (auio.uio_resid > SSIZE_MAX) {
124 		error = EINVAL;
125 		goto out;
126 	}
127 
128 #ifdef KTRACE
129 	/*
130 	 * if tracing, save a copy of iovec
131 	 */
132 	if (KTRPOINT(p, KTR_GENIO))
133 		ktriov = aiov;
134 #endif
135 	cnt = auio.uio_resid;
136 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
137 	if (error)
138 		if (auio.uio_resid != cnt && (error == ERESTART ||
139 		    error == EINTR || error == EWOULDBLOCK))
140 			error = 0;
141 	cnt -= auio.uio_resid;
142 
143 	fp->f_rxfer++;
144 	fp->f_rbytes += cnt;
145 #ifdef KTRACE
146 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
147 		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
148 #endif
149 	*retval = cnt;
150  out:
151 	FRELE(fp);
152 	return (error);
153 }
154 
155 /*
156  * Scatter read system call.
157  */
158 int
159 sys_readv(struct proc *p, void *v, register_t *retval)
160 {
161 	struct sys_readv_args /* {
162 		syscallarg(int) fd;
163 		syscallarg(const struct iovec *) iovp;
164 		syscallarg(int) iovcnt;
165 	} */ *uap = v;
166 	int fd = SCARG(uap, fd);
167 	struct file *fp;
168 	struct filedesc *fdp = p->p_fd;
169 
170 	if ((fp = fd_getfile(fdp, fd)) == NULL)
171 		return (EBADF);
172 	if ((fp->f_flag & FREAD) == 0)
173 		return (EBADF);
174 
175 	FREF(fp);
176 
177 	/* dofilereadv() will FRELE the descriptor for us */
178 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
179 	    &fp->f_offset, retval));
180 }
181 
182 int
183 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
184     int iovcnt, off_t *offset, register_t *retval)
185 {
186 	struct uio auio;
187 	struct iovec *iov;
188 	struct iovec *needfree;
189 	struct iovec aiov[UIO_SMALLIOV];
190 	long i, cnt, error = 0;
191 	u_int iovlen;
192 #ifdef KTRACE
193 	struct iovec *ktriov = NULL;
194 #endif
195 
196 	/* note: can't use iovlen until iovcnt is validated */
197 	iovlen = iovcnt * sizeof(struct iovec);
198 	if ((u_int)iovcnt > UIO_SMALLIOV) {
199 		if ((u_int)iovcnt > IOV_MAX) {
200 			error = EINVAL;
201 			goto out;
202 		}
203 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
204 	} else if ((u_int)iovcnt > 0) {
205 		iov = aiov;
206 		needfree = NULL;
207 	} else {
208 		error = EINVAL;
209 		goto out;
210 	}
211 
212 	auio.uio_iov = iov;
213 	auio.uio_iovcnt = iovcnt;
214 	auio.uio_rw = UIO_READ;
215 	auio.uio_segflg = UIO_USERSPACE;
216 	auio.uio_procp = p;
217 	error = copyin(iovp, iov, iovlen);
218 	if (error)
219 		goto done;
220 	auio.uio_resid = 0;
221 	for (i = 0; i < iovcnt; i++) {
222 		auio.uio_resid += iov->iov_len;
223 		/*
224 		 * Reads return ssize_t because -1 is returned on error.
225 		 * Therefore we must restrict the length to SSIZE_MAX to
226 		 * avoid garbage return values.
227 		 */
228 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
229 			error = EINVAL;
230 			goto done;
231 		}
232 		iov++;
233 	}
234 #ifdef KTRACE
235 	/*
236 	 * if tracing, save a copy of iovec
237 	 */
238 	if (KTRPOINT(p, KTR_GENIO))  {
239 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
240 		bcopy(auio.uio_iov, ktriov, iovlen);
241 	}
242 #endif
243 	cnt = auio.uio_resid;
244 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
245 	if (error)
246 		if (auio.uio_resid != cnt && (error == ERESTART ||
247 		    error == EINTR || error == EWOULDBLOCK))
248 			error = 0;
249 	cnt -= auio.uio_resid;
250 
251 	fp->f_rxfer++;
252 	fp->f_rbytes += cnt;
253 #ifdef KTRACE
254 	if (ktriov != NULL) {
255 		if (error == 0)
256 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
257 			    error);
258 		free(ktriov, M_TEMP);
259 	}
260 #endif
261 	*retval = cnt;
262  done:
263 	if (needfree)
264 		free(needfree, M_IOV);
265  out:
266 	FRELE(fp);
267 	return (error);
268 }
269 
270 /*
271  * Write system call
272  */
273 int
274 sys_write(struct proc *p, void *v, register_t *retval)
275 {
276 	struct sys_write_args /* {
277 		syscallarg(int) fd;
278 		syscallarg(const void *) buf;
279 		syscallarg(size_t) nbyte;
280 	} */ *uap = v;
281 	int fd = SCARG(uap, fd);
282 	struct file *fp;
283 	struct filedesc *fdp = p->p_fd;
284 
285 	if ((fp = fd_getfile(fdp, fd)) == NULL)
286 		return (EBADF);
287 	if ((fp->f_flag & FWRITE) == 0)
288 		return (EBADF);
289 
290 	FREF(fp);
291 
292 	/* dofilewrite() will FRELE the descriptor for us */
293 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
294 	    &fp->f_offset, retval));
295 }
296 
297 int
298 dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
299     size_t nbyte, off_t *offset, register_t *retval)
300 {
301 	struct uio auio;
302 	struct iovec aiov;
303 	long cnt, error = 0;
304 #ifdef KTRACE
305 	struct iovec ktriov;
306 #endif
307 
308 	aiov.iov_base = (void *)buf;		/* XXX kills const */
309 	aiov.iov_len = nbyte;
310 	auio.uio_iov = &aiov;
311 	auio.uio_iovcnt = 1;
312 	auio.uio_resid = nbyte;
313 	auio.uio_rw = UIO_WRITE;
314 	auio.uio_segflg = UIO_USERSPACE;
315 	auio.uio_procp = p;
316 
317 	/*
318 	 * Writes return ssize_t because -1 is returned on error.  Therefore
319 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
320 	 * values.
321 	 */
322 	if (auio.uio_resid > SSIZE_MAX) {
323 		error = EINVAL;
324 		goto out;
325 	}
326 
327 #ifdef KTRACE
328 	/*
329 	 * if tracing, save a copy of iovec
330 	 */
331 	if (KTRPOINT(p, KTR_GENIO))
332 		ktriov = aiov;
333 #endif
334 	cnt = auio.uio_resid;
335 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
336 	if (error) {
337 		if (auio.uio_resid != cnt && (error == ERESTART ||
338 		    error == EINTR || error == EWOULDBLOCK))
339 			error = 0;
340 		if (error == EPIPE)
341 			ptsignal(p, SIGPIPE, STHREAD);
342 	}
343 	cnt -= auio.uio_resid;
344 
345 	fp->f_wxfer++;
346 	fp->f_wbytes += cnt;
347 #ifdef KTRACE
348 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
349 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
350 #endif
351 	*retval = cnt;
352  out:
353 	FRELE(fp);
354 	return (error);
355 }
356 
357 /*
358  * Gather write system call
359  */
360 int
361 sys_writev(struct proc *p, void *v, register_t *retval)
362 {
363 	struct sys_writev_args /* {
364 		syscallarg(int) fd;
365 		syscallarg(const struct iovec *) iovp;
366 		syscallarg(int) iovcnt;
367 	} */ *uap = v;
368 	int fd = SCARG(uap, fd);
369 	struct file *fp;
370 	struct filedesc *fdp = p->p_fd;
371 
372 	if ((fp = fd_getfile(fdp, fd)) == NULL)
373 		return (EBADF);
374 	if ((fp->f_flag & FWRITE) == 0)
375 		return (EBADF);
376 
377 	FREF(fp);
378 
379 	/* dofilewritev() will FRELE the descriptor for us */
380 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
381 	    &fp->f_offset, retval));
382 }
383 
384 int
385 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
386     int iovcnt, off_t *offset, register_t *retval)
387 {
388 	struct uio auio;
389 	struct iovec *iov;
390 	struct iovec *needfree;
391 	struct iovec aiov[UIO_SMALLIOV];
392 	long i, cnt, error = 0;
393 	u_int iovlen;
394 #ifdef KTRACE
395 	struct iovec *ktriov = NULL;
396 #endif
397 
398 	/* note: can't use iovlen until iovcnt is validated */
399 	iovlen = iovcnt * sizeof(struct iovec);
400 	if ((u_int)iovcnt > UIO_SMALLIOV) {
401 		if ((u_int)iovcnt > IOV_MAX) {
402 			error = EINVAL;
403 			goto out;
404 		}
405 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
406 	} else if ((u_int)iovcnt > 0) {
407 		iov = aiov;
408 		needfree = NULL;
409 	} else {
410 		error = EINVAL;
411 		goto out;
412 	}
413 
414 	auio.uio_iov = iov;
415 	auio.uio_iovcnt = iovcnt;
416 	auio.uio_rw = UIO_WRITE;
417 	auio.uio_segflg = UIO_USERSPACE;
418 	auio.uio_procp = p;
419 	error = copyin(iovp, iov, iovlen);
420 	if (error)
421 		goto done;
422 	auio.uio_resid = 0;
423 	for (i = 0; i < iovcnt; i++) {
424 		auio.uio_resid += iov->iov_len;
425 		/*
426 		 * Writes return ssize_t because -1 is returned on error.
427 		 * Therefore we must restrict the length to SSIZE_MAX to
428 		 * avoid garbage return values.
429 		 */
430 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
431 			error = EINVAL;
432 			goto done;
433 		}
434 		iov++;
435 	}
436 #ifdef KTRACE
437 	/*
438 	 * if tracing, save a copy of iovec
439 	 */
440 	if (KTRPOINT(p, KTR_GENIO))  {
441 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
442 		bcopy(auio.uio_iov, ktriov, iovlen);
443 	}
444 #endif
445 	cnt = auio.uio_resid;
446 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
447 	if (error) {
448 		if (auio.uio_resid != cnt && (error == ERESTART ||
449 		    error == EINTR || error == EWOULDBLOCK))
450 			error = 0;
451 		if (error == EPIPE)
452 			ptsignal(p, SIGPIPE, STHREAD);
453 	}
454 	cnt -= auio.uio_resid;
455 
456 	fp->f_wxfer++;
457 	fp->f_wbytes += cnt;
458 #ifdef KTRACE
459 	if (ktriov != NULL) {
460 		if (error == 0)
461 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
462 		free(ktriov, M_TEMP);
463 	}
464 #endif
465 	*retval = cnt;
466  done:
467 	if (needfree)
468 		free(needfree, M_IOV);
469  out:
470 	FRELE(fp);
471 	return (error);
472 }
473 
474 /*
475  * Ioctl system call
476  */
477 /* ARGSUSED */
478 int
479 sys_ioctl(struct proc *p, void *v, register_t *retval)
480 {
481 	struct sys_ioctl_args /* {
482 		syscallarg(int) fd;
483 		syscallarg(u_long) com;
484 		syscallarg(void *) data;
485 	} */ *uap = v;
486 	struct file *fp;
487 	struct filedesc *fdp;
488 	u_long com;
489 	int error;
490 	u_int size;
491 	caddr_t data, memp;
492 	int tmp;
493 #define STK_PARAMS	128
494 	char stkbuf[STK_PARAMS];
495 
496 	fdp = p->p_fd;
497 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
498 		return (EBADF);
499 
500 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
501 		return (EBADF);
502 
503 	switch (com = SCARG(uap, com)) {
504 	case FIONCLEX:
505 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
506 		return (0);
507 	case FIOCLEX:
508 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
509 		return (0);
510 	}
511 
512 	/*
513 	 * Interpret high order word to find amount of data to be
514 	 * copied to/from the user's address space.
515 	 */
516 	size = IOCPARM_LEN(com);
517 	if (size > IOCPARM_MAX)
518 		return (ENOTTY);
519 	FREF(fp);
520 	memp = NULL;
521 	if (size > sizeof (stkbuf)) {
522 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
523 		data = memp;
524 	} else
525 		data = stkbuf;
526 	if (com&IOC_IN) {
527 		if (size) {
528 			error = copyin(SCARG(uap, data), data, (u_int)size);
529 			if (error) {
530 				goto out;
531 			}
532 		} else
533 			*(caddr_t *)data = SCARG(uap, data);
534 	} else if ((com&IOC_OUT) && size)
535 		/*
536 		 * Zero the buffer so the user always
537 		 * gets back something deterministic.
538 		 */
539 		bzero(data, size);
540 	else if (com&IOC_VOID)
541 		*(caddr_t *)data = SCARG(uap, data);
542 
543 	switch (com) {
544 
545 	case FIONBIO:
546 		if ((tmp = *(int *)data) != 0)
547 			fp->f_flag |= FNONBLOCK;
548 		else
549 			fp->f_flag &= ~FNONBLOCK;
550 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
551 		break;
552 
553 	case FIOASYNC:
554 		if ((tmp = *(int *)data) != 0)
555 			fp->f_flag |= FASYNC;
556 		else
557 			fp->f_flag &= ~FASYNC;
558 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
559 		break;
560 
561 	case FIOSETOWN:
562 		tmp = *(int *)data;
563 		if (fp->f_type == DTYPE_SOCKET) {
564 			struct socket *so = (struct socket *)fp->f_data;
565 
566 			so->so_pgid = tmp;
567 			so->so_siguid = p->p_cred->p_ruid;
568 			so->so_sigeuid = p->p_ucred->cr_uid;
569 			error = 0;
570 			break;
571 		}
572 		if (tmp <= 0) {
573 			tmp = -tmp;
574 		} else {
575 			struct proc *p1 = pfind(tmp);
576 			if (p1 == 0) {
577 				error = ESRCH;
578 				break;
579 			}
580 			tmp = p1->p_pgrp->pg_id;
581 		}
582 		error = (*fp->f_ops->fo_ioctl)
583 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
584 		break;
585 
586 	case FIOGETOWN:
587 		if (fp->f_type == DTYPE_SOCKET) {
588 			error = 0;
589 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
590 			break;
591 		}
592 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
593 		*(int *)data = -*(int *)data;
594 		break;
595 
596 	default:
597 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
598 		break;
599 	}
600 	/*
601 	 * Copy any data to user, size was
602 	 * already set and checked above.
603 	 */
604 	if (error == 0 && (com&IOC_OUT) && size)
605 		error = copyout(data, SCARG(uap, data), (u_int)size);
606 out:
607 	FRELE(fp);
608 	if (memp)
609 		free(memp, M_IOCTLOPS);
610 	return (error);
611 }
612 
int	selwait;	/* its address is the sleep channel for select/poll */
int	nselcoll;	/* bumped by selwakeup() on each collision wakeup */
614 
615 /*
616  * Select system call.
617  */
618 int
619 sys_select(struct proc *p, void *v, register_t *retval)
620 {
621 	struct sys_select_args /* {
622 		syscallarg(int) nd;
623 		syscallarg(fd_set *) in;
624 		syscallarg(fd_set *) ou;
625 		syscallarg(fd_set *) ex;
626 		syscallarg(struct timeval *) tv;
627 	} */ *uap = v;
628 	fd_mask bits[6];
629 	fd_set *pibits[3], *pobits[3];
630 	struct timeval atv, rtv, ttv;
631 	int s, ncoll, error = 0, timo;
632 	u_int nd, ni;
633 
634 	nd = SCARG(uap, nd);
635 	if (nd > p->p_fd->fd_nfiles) {
636 		/* forgiving; slightly wrong */
637 		nd = p->p_fd->fd_nfiles;
638 	}
639 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
640 	if (nd > sizeof(bits[0])) {
641 		caddr_t mbits;
642 
643 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK|M_ZERO);
644 		pibits[0] = (fd_set *)&mbits[ni * 0];
645 		pibits[1] = (fd_set *)&mbits[ni * 1];
646 		pibits[2] = (fd_set *)&mbits[ni * 2];
647 		pobits[0] = (fd_set *)&mbits[ni * 3];
648 		pobits[1] = (fd_set *)&mbits[ni * 4];
649 		pobits[2] = (fd_set *)&mbits[ni * 5];
650 	} else {
651 		bzero(bits, sizeof(bits));
652 		pibits[0] = (fd_set *)&bits[0];
653 		pibits[1] = (fd_set *)&bits[1];
654 		pibits[2] = (fd_set *)&bits[2];
655 		pobits[0] = (fd_set *)&bits[3];
656 		pobits[1] = (fd_set *)&bits[4];
657 		pobits[2] = (fd_set *)&bits[5];
658 	}
659 
660 #define	getbits(name, x) \
661 	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
662 	    pibits[x], ni))) \
663 		goto done;
664 	getbits(in, 0);
665 	getbits(ou, 1);
666 	getbits(ex, 2);
667 #undef	getbits
668 
669 	if (SCARG(uap, tv)) {
670 		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
671 		if (error)
672 			goto done;
673 		if (itimerfix(&atv)) {
674 			error = EINVAL;
675 			goto done;
676 		}
677 		getmicrouptime(&rtv);
678 		timeradd(&atv, &rtv, &atv);
679 	} else {
680 		atv.tv_sec = 0;
681 		atv.tv_usec = 0;
682 	}
683 	timo = 0;
684 
685 retry:
686 	ncoll = nselcoll;
687 	atomic_setbits_int(&p->p_flag, P_SELECT);
688 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
689 	if (error || *retval)
690 		goto done;
691 	if (SCARG(uap, tv)) {
692 		getmicrouptime(&rtv);
693 		if (timercmp(&rtv, &atv, >=))
694 			goto done;
695 		ttv = atv;
696 		timersub(&ttv, &rtv, &ttv);
697 		timo = ttv.tv_sec > 24 * 60 * 60 ?
698 			24 * 60 * 60 * hz : tvtohz(&ttv);
699 	}
700 	s = splhigh();
701 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
702 		splx(s);
703 		goto retry;
704 	}
705 	atomic_clearbits_int(&p->p_flag, P_SELECT);
706 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
707 	splx(s);
708 	if (error == 0)
709 		goto retry;
710 done:
711 	atomic_clearbits_int(&p->p_flag, P_SELECT);
712 	/* select is not restarted after signals... */
713 	if (error == ERESTART)
714 		error = EINTR;
715 	if (error == EWOULDBLOCK)
716 		error = 0;
717 #define	putbits(name, x) \
718 	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
719 	    SCARG(uap, name), ni))) \
720 		error = error2;
721 	if (error == 0) {
722 		int error2;
723 
724 		putbits(in, 0);
725 		putbits(ou, 1);
726 		putbits(ex, 2);
727 #undef putbits
728 	}
729 
730 	if (pibits[0] != (fd_set *)&bits[0])
731 		free(pibits[0], M_TEMP);
732 	return (error);
733 }
734 
735 int
736 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
737     register_t *retval)
738 {
739 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
740 	struct filedesc *fdp = p->p_fd;
741 	int msk, i, j, fd;
742 	fd_mask bits;
743 	struct file *fp;
744 	int n = 0;
745 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
746 
747 	for (msk = 0; msk < 3; msk++) {
748 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
749 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
750 
751 		for (i = 0; i < nfd; i += NFDBITS) {
752 			bits = pibits->fds_bits[i/NFDBITS];
753 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
754 				bits &= ~(1 << j);
755 				if ((fp = fd_getfile(fdp, fd)) == NULL)
756 					return (EBADF);
757 				FREF(fp);
758 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
759 					FD_SET(fd, pobits);
760 					n++;
761 				}
762 				FRELE(fp);
763 			}
764 		}
765 	}
766 	*retval = n;
767 	return (0);
768 }
769 
770 /*ARGSUSED*/
771 int
772 seltrue(dev_t dev, int events, struct proc *p)
773 {
774 
775 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
776 }
777 
778 /*
779  * Record a select request.
780  */
781 void
782 selrecord(struct proc *selector, struct selinfo *sip)
783 {
784 	struct proc *p;
785 	pid_t mypid;
786 
787 	mypid = selector->p_pid;
788 	if (sip->si_selpid == mypid)
789 		return;
790 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
791 	    p->p_wchan == (caddr_t)&selwait)
792 		sip->si_flags |= SI_COLL;
793 	else
794 		sip->si_selpid = mypid;
795 }
796 
797 /*
798  * Do a wakeup when a selectable event occurs.
799  */
800 void
801 selwakeup(struct selinfo *sip)
802 {
803 	struct proc *p;
804 	int s;
805 
806 	if (sip->si_selpid == 0)
807 		return;
808 	if (sip->si_flags & SI_COLL) {
809 		nselcoll++;
810 		sip->si_flags &= ~SI_COLL;
811 		wakeup(&selwait);
812 	}
813 	p = pfind(sip->si_selpid);
814 	sip->si_selpid = 0;
815 	if (p != NULL) {
816 		SCHED_LOCK(s);
817 		if (p->p_wchan == (caddr_t)&selwait) {
818 			if (p->p_stat == SSLEEP)
819 				setrunnable(p);
820 			else
821 				unsleep(p);
822 		} else if (p->p_flag & P_SELECT)
823 			atomic_clearbits_int(&p->p_flag, P_SELECT);
824 		SCHED_UNLOCK(s);
825 	}
826 }
827 
828 void
829 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
830 {
831 	struct filedesc *fdp = p->p_fd;
832 	struct file *fp;
833 	u_int i;
834 	int n = 0;
835 
836 	for (i = 0; i < nfd; i++, pl++) {
837 		/* Check the file descriptor. */
838 		if (pl->fd < 0) {
839 			pl->revents = 0;
840 			continue;
841 		}
842 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
843 			pl->revents = POLLNVAL;
844 			n++;
845 			continue;
846 		}
847 		FREF(fp);
848 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
849 		FRELE(fp);
850 		if (pl->revents != 0)
851 			n++;
852 	}
853 	*retval = n;
854 }
855 
856 /*
857  * Only copyout the revents field.
858  */
859 int
860 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
861 {
862 	int error = 0;
863 	u_int i = 0;
864 
865 	while (!error && i++ < nfds) {
866 		error = copyout(&pl->revents, &upl->revents,
867 		    sizeof(upl->revents));
868 		pl++;
869 		upl++;
870 	}
871 
872 	return (error);
873 }
874 
875 /*
876  * We are using the same mechanism as select only we encode/decode args
877  * differently.
878  */
879 int
880 sys_poll(struct proc *p, void *v, register_t *retval)
881 {
882 	struct sys_poll_args /* {
883 		syscallarg(struct pollfd *) fds;
884 		syscallarg(u_int) nfds;
885 		syscallarg(int) timeout;
886 	} */ *uap = v;
887 	size_t sz;
888 	struct pollfd pfds[4], *pl = pfds;
889 	int msec = SCARG(uap, timeout);
890 	struct timeval atv, rtv, ttv;
891 	int timo, ncoll, i, s, error;
892 	extern int nselcoll, selwait;
893 	u_int nfds = SCARG(uap, nfds);
894 
895 	/* Standards say no more than MAX_OPEN; this is possibly better. */
896 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
897 		return (EINVAL);
898 
899 	sz = sizeof(struct pollfd) * nfds;
900 
901 	/* optimize for the default case, of a small nfds value */
902 	if (sz > sizeof(pfds))
903 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
904 
905 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
906 		goto bad;
907 
908 	for (i = 0; i < nfds; i++)
909 		pl[i].revents = 0;
910 
911 	if (msec != INFTIM) {
912 		atv.tv_sec = msec / 1000;
913 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
914 
915 		if (itimerfix(&atv)) {
916 			error = EINVAL;
917 			goto done;
918 		}
919 		getmicrouptime(&rtv);
920 		timeradd(&atv, &rtv, &atv);
921 	} else {
922 		atv.tv_sec = 0;
923 		atv.tv_usec = 0;
924 	}
925 	timo = 0;
926 
927 retry:
928 	ncoll = nselcoll;
929 	atomic_setbits_int(&p->p_flag, P_SELECT);
930 	pollscan(p, pl, nfds, retval);
931 	if (*retval)
932 		goto done;
933 	if (msec != INFTIM) {
934 		getmicrouptime(&rtv);
935 		if (timercmp(&rtv, &atv, >=))
936 			goto done;
937 		ttv = atv;
938 		timersub(&ttv, &rtv, &ttv);
939 		timo = ttv.tv_sec > 24 * 60 * 60 ?
940 			24 * 60 * 60 * hz : tvtohz(&ttv);
941 	}
942 	s = splhigh();
943 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
944 		splx(s);
945 		goto retry;
946 	}
947 	atomic_clearbits_int(&p->p_flag, P_SELECT);
948 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
949 	splx(s);
950 	if (error == 0)
951 		goto retry;
952 
953 done:
954 	atomic_clearbits_int(&p->p_flag, P_SELECT);
955 	/*
956 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
957 	 *       ignored (since the whole point is to see what would block).
958 	 */
959 	switch (error) {
960 	case ERESTART:
961 		error = pollout(pl, SCARG(uap, fds), nfds);
962 		if (error == 0)
963 			error = EINTR;
964 		break;
965 	case EWOULDBLOCK:
966 	case 0:
967 		error = pollout(pl, SCARG(uap, fds), nfds);
968 		break;
969 	}
970 bad:
971 	if (pl != pfds)
972 		free(pl, M_TEMP);
973 	return (error);
974 }
975