xref: /openbsd-src/sys/kern/sys_generic.c (revision daf88648c0e349d5c02e1504293082072c981640)
1 /*	$OpenBSD: sys_generic.c,v 1.54 2006/04/15 20:02:19 miod Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 int seltrue(dev_t, int, struct proc *);
67 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
68 
69 /*
70  * Read system call.
71  */
72 /* ARGSUSED */
73 int
74 sys_read(struct proc *p, void *v, register_t *retval)
75 {
76 	struct sys_read_args /* {
77 		syscallarg(int) fd;
78 		syscallarg(void *) buf;
79 		syscallarg(size_t) nbyte;
80 	} */ *uap = v;
81 	int fd = SCARG(uap, fd);
82 	struct file *fp;
83 	struct filedesc *fdp = p->p_fd;
84 
85 	if ((fp = fd_getfile(fdp, fd)) == NULL)
86 		return (EBADF);
87 	if ((fp->f_flag & FREAD) == 0)
88 		return (EBADF);
89 
90 	FREF(fp);
91 
92 	/* dofileread() will FRELE the descriptor for us */
93 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
94 	    &fp->f_offset, retval));
95 }
96 
97 int
98 dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
99     off_t *offset, register_t *retval)
100 {
101 	struct uio auio;
102 	struct iovec aiov;
103 	long cnt, error = 0;
104 #ifdef KTRACE
105 	struct iovec ktriov;
106 #endif
107 
108 	aiov.iov_base = buf;
109 	aiov.iov_len = nbyte;
110 	auio.uio_iov = &aiov;
111 	auio.uio_iovcnt = 1;
112 	auio.uio_resid = nbyte;
113 	auio.uio_rw = UIO_READ;
114 	auio.uio_segflg = UIO_USERSPACE;
115 	auio.uio_procp = p;
116 
117 	/*
118 	 * Reads return ssize_t because -1 is returned on error.  Therefore
119 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
120 	 * values.
121 	 */
122 	if (auio.uio_resid > SSIZE_MAX) {
123 		error = EINVAL;
124 		goto out;
125 	}
126 
127 #ifdef KTRACE
128 	/*
129 	 * if tracing, save a copy of iovec
130 	 */
131 	if (KTRPOINT(p, KTR_GENIO))
132 		ktriov = aiov;
133 #endif
134 	cnt = auio.uio_resid;
135 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
136 	if (error)
137 		if (auio.uio_resid != cnt && (error == ERESTART ||
138 		    error == EINTR || error == EWOULDBLOCK))
139 			error = 0;
140 	cnt -= auio.uio_resid;
141 
142 	fp->f_rxfer++;
143 	fp->f_rbytes += cnt;
144 #ifdef KTRACE
145 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
146 		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
147 #endif
148 	*retval = cnt;
149  out:
150 	FRELE(fp);
151 	return (error);
152 }
153 
154 /*
155  * Scatter read system call.
156  */
157 int
158 sys_readv(struct proc *p, void *v, register_t *retval)
159 {
160 	struct sys_readv_args /* {
161 		syscallarg(int) fd;
162 		syscallarg(const struct iovec *) iovp;
163 		syscallarg(int) iovcnt;
164 	} */ *uap = v;
165 	int fd = SCARG(uap, fd);
166 	struct file *fp;
167 	struct filedesc *fdp = p->p_fd;
168 
169 	if ((fp = fd_getfile(fdp, fd)) == NULL)
170 		return (EBADF);
171 	if ((fp->f_flag & FREAD) == 0)
172 		return (EBADF);
173 
174 	FREF(fp);
175 
176 	/* dofilereadv() will FRELE the descriptor for us */
177 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
178 	    &fp->f_offset, retval));
179 }
180 
181 int
182 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
183     int iovcnt, off_t *offset, register_t *retval)
184 {
185 	struct uio auio;
186 	struct iovec *iov;
187 	struct iovec *needfree;
188 	struct iovec aiov[UIO_SMALLIOV];
189 	long i, cnt, error = 0;
190 	u_int iovlen;
191 #ifdef KTRACE
192 	struct iovec *ktriov = NULL;
193 #endif
194 
195 	/* note: can't use iovlen until iovcnt is validated */
196 	iovlen = iovcnt * sizeof(struct iovec);
197 	if ((u_int)iovcnt > UIO_SMALLIOV) {
198 		if ((u_int)iovcnt > IOV_MAX) {
199 			error = EINVAL;
200 			goto out;
201 		}
202 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
203 	} else if ((u_int)iovcnt > 0) {
204 		iov = aiov;
205 		needfree = NULL;
206 	} else {
207 		error = EINVAL;
208 		goto out;
209 	}
210 
211 	auio.uio_iov = iov;
212 	auio.uio_iovcnt = iovcnt;
213 	auio.uio_rw = UIO_READ;
214 	auio.uio_segflg = UIO_USERSPACE;
215 	auio.uio_procp = p;
216 	error = copyin(iovp, iov, iovlen);
217 	if (error)
218 		goto done;
219 	auio.uio_resid = 0;
220 	for (i = 0; i < iovcnt; i++) {
221 		auio.uio_resid += iov->iov_len;
222 		/*
223 		 * Reads return ssize_t because -1 is returned on error.
224 		 * Therefore we must restrict the length to SSIZE_MAX to
225 		 * avoid garbage return values.
226 		 */
227 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
228 			error = EINVAL;
229 			goto done;
230 		}
231 		iov++;
232 	}
233 #ifdef KTRACE
234 	/*
235 	 * if tracing, save a copy of iovec
236 	 */
237 	if (KTRPOINT(p, KTR_GENIO))  {
238 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
239 		bcopy(auio.uio_iov, ktriov, iovlen);
240 	}
241 #endif
242 	cnt = auio.uio_resid;
243 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
244 	if (error)
245 		if (auio.uio_resid != cnt && (error == ERESTART ||
246 		    error == EINTR || error == EWOULDBLOCK))
247 			error = 0;
248 	cnt -= auio.uio_resid;
249 
250 	fp->f_rxfer++;
251 	fp->f_rbytes += cnt;
252 #ifdef KTRACE
253 	if (ktriov != NULL) {
254 		if (error == 0)
255 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
256 			    error);
257 		free(ktriov, M_TEMP);
258 	}
259 #endif
260 	*retval = cnt;
261  done:
262 	if (needfree)
263 		free(needfree, M_IOV);
264  out:
265 	FRELE(fp);
266 	return (error);
267 }
268 
269 /*
270  * Write system call
271  */
272 int
273 sys_write(struct proc *p, void *v, register_t *retval)
274 {
275 	struct sys_write_args /* {
276 		syscallarg(int) fd;
277 		syscallarg(const void *) buf;
278 		syscallarg(size_t) nbyte;
279 	} */ *uap = v;
280 	int fd = SCARG(uap, fd);
281 	struct file *fp;
282 	struct filedesc *fdp = p->p_fd;
283 
284 	if ((fp = fd_getfile(fdp, fd)) == NULL)
285 		return (EBADF);
286 	if ((fp->f_flag & FWRITE) == 0)
287 		return (EBADF);
288 
289 	FREF(fp);
290 
291 	/* dofilewrite() will FRELE the descriptor for us */
292 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
293 	    &fp->f_offset, retval));
294 }
295 
296 int
297 dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
298     size_t nbyte, off_t *offset, register_t *retval)
299 {
300 	struct uio auio;
301 	struct iovec aiov;
302 	long cnt, error = 0;
303 #ifdef KTRACE
304 	struct iovec ktriov;
305 #endif
306 
307 	aiov.iov_base = (void *)buf;		/* XXX kills const */
308 	aiov.iov_len = nbyte;
309 	auio.uio_iov = &aiov;
310 	auio.uio_iovcnt = 1;
311 	auio.uio_resid = nbyte;
312 	auio.uio_rw = UIO_WRITE;
313 	auio.uio_segflg = UIO_USERSPACE;
314 	auio.uio_procp = p;
315 
316 	/*
317 	 * Writes return ssize_t because -1 is returned on error.  Therefore
318 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
319 	 * values.
320 	 */
321 	if (auio.uio_resid > SSIZE_MAX) {
322 		error = EINVAL;
323 		goto out;
324 	}
325 
326 #ifdef KTRACE
327 	/*
328 	 * if tracing, save a copy of iovec
329 	 */
330 	if (KTRPOINT(p, KTR_GENIO))
331 		ktriov = aiov;
332 #endif
333 	cnt = auio.uio_resid;
334 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
335 	if (error) {
336 		if (auio.uio_resid != cnt && (error == ERESTART ||
337 		    error == EINTR || error == EWOULDBLOCK))
338 			error = 0;
339 		if (error == EPIPE)
340 			psignal(p, SIGPIPE);
341 	}
342 	cnt -= auio.uio_resid;
343 
344 	fp->f_wxfer++;
345 	fp->f_wbytes += cnt;
346 #ifdef KTRACE
347 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
348 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
349 #endif
350 	*retval = cnt;
351  out:
352 	FRELE(fp);
353 	return (error);
354 }
355 
356 /*
357  * Gather write system call
358  */
359 int
360 sys_writev(struct proc *p, void *v, register_t *retval)
361 {
362 	struct sys_writev_args /* {
363 		syscallarg(int) fd;
364 		syscallarg(const struct iovec *) iovp;
365 		syscallarg(int) iovcnt;
366 	} */ *uap = v;
367 	int fd = SCARG(uap, fd);
368 	struct file *fp;
369 	struct filedesc *fdp = p->p_fd;
370 
371 	if ((fp = fd_getfile(fdp, fd)) == NULL)
372 		return (EBADF);
373 	if ((fp->f_flag & FWRITE) == 0)
374 		return (EBADF);
375 
376 	FREF(fp);
377 
378 	/* dofilewritev() will FRELE the descriptor for us */
379 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
380 	    &fp->f_offset, retval));
381 }
382 
383 int
384 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
385     int iovcnt, off_t *offset, register_t *retval)
386 {
387 	struct uio auio;
388 	struct iovec *iov;
389 	struct iovec *needfree;
390 	struct iovec aiov[UIO_SMALLIOV];
391 	long i, cnt, error = 0;
392 	u_int iovlen;
393 #ifdef KTRACE
394 	struct iovec *ktriov = NULL;
395 #endif
396 
397 	/* note: can't use iovlen until iovcnt is validated */
398 	iovlen = iovcnt * sizeof(struct iovec);
399 	if ((u_int)iovcnt > UIO_SMALLIOV) {
400 		if ((u_int)iovcnt > IOV_MAX) {
401 			error = EINVAL;
402 			goto out;
403 		}
404 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
405 	} else if ((u_int)iovcnt > 0) {
406 		iov = aiov;
407 		needfree = NULL;
408 	} else {
409 		error = EINVAL;
410 		goto out;
411 	}
412 
413 	auio.uio_iov = iov;
414 	auio.uio_iovcnt = iovcnt;
415 	auio.uio_rw = UIO_WRITE;
416 	auio.uio_segflg = UIO_USERSPACE;
417 	auio.uio_procp = p;
418 	error = copyin(iovp, iov, iovlen);
419 	if (error)
420 		goto done;
421 	auio.uio_resid = 0;
422 	for (i = 0; i < iovcnt; i++) {
423 		auio.uio_resid += iov->iov_len;
424 		/*
425 		 * Writes return ssize_t because -1 is returned on error.
426 		 * Therefore we must restrict the length to SSIZE_MAX to
427 		 * avoid garbage return values.
428 		 */
429 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
430 			error = EINVAL;
431 			goto done;
432 		}
433 		iov++;
434 	}
435 #ifdef KTRACE
436 	/*
437 	 * if tracing, save a copy of iovec
438 	 */
439 	if (KTRPOINT(p, KTR_GENIO))  {
440 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
441 		bcopy(auio.uio_iov, ktriov, iovlen);
442 	}
443 #endif
444 	cnt = auio.uio_resid;
445 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
446 	if (error) {
447 		if (auio.uio_resid != cnt && (error == ERESTART ||
448 		    error == EINTR || error == EWOULDBLOCK))
449 			error = 0;
450 		if (error == EPIPE)
451 			psignal(p, SIGPIPE);
452 	}
453 	cnt -= auio.uio_resid;
454 
455 	fp->f_wxfer++;
456 	fp->f_wbytes += cnt;
457 #ifdef KTRACE
458 	if (ktriov != NULL) {
459 		if (error == 0)
460 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
461 		free(ktriov, M_TEMP);
462 	}
463 #endif
464 	*retval = cnt;
465  done:
466 	if (needfree)
467 		free(needfree, M_IOV);
468  out:
469 	FRELE(fp);
470 	return (error);
471 }
472 
473 /*
474  * Ioctl system call
475  */
476 /* ARGSUSED */
477 int
478 sys_ioctl(struct proc *p, void *v, register_t *retval)
479 {
480 	struct sys_ioctl_args /* {
481 		syscallarg(int) fd;
482 		syscallarg(u_long) com;
483 		syscallarg(void *) data;
484 	} */ *uap = v;
485 	struct file *fp;
486 	struct filedesc *fdp;
487 	u_long com;
488 	int error;
489 	u_int size;
490 	caddr_t data, memp;
491 	int tmp;
492 #define STK_PARAMS	128
493 	char stkbuf[STK_PARAMS];
494 
495 	fdp = p->p_fd;
496 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
497 		return (EBADF);
498 
499 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
500 		return (EBADF);
501 
502 	switch (com = SCARG(uap, com)) {
503 	case FIONCLEX:
504 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
505 		return (0);
506 	case FIOCLEX:
507 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
508 		return (0);
509 	}
510 
511 	/*
512 	 * Interpret high order word to find amount of data to be
513 	 * copied to/from the user's address space.
514 	 */
515 	size = IOCPARM_LEN(com);
516 	if (size > IOCPARM_MAX)
517 		return (ENOTTY);
518 	FREF(fp);
519 	memp = NULL;
520 	if (size > sizeof (stkbuf)) {
521 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
522 		data = memp;
523 	} else
524 		data = stkbuf;
525 	if (com&IOC_IN) {
526 		if (size) {
527 			error = copyin(SCARG(uap, data), data, (u_int)size);
528 			if (error) {
529 				goto out;
530 			}
531 		} else
532 			*(caddr_t *)data = SCARG(uap, data);
533 	} else if ((com&IOC_OUT) && size)
534 		/*
535 		 * Zero the buffer so the user always
536 		 * gets back something deterministic.
537 		 */
538 		bzero(data, size);
539 	else if (com&IOC_VOID)
540 		*(caddr_t *)data = SCARG(uap, data);
541 
542 	switch (com) {
543 
544 	case FIONBIO:
545 		if ((tmp = *(int *)data) != 0)
546 			fp->f_flag |= FNONBLOCK;
547 		else
548 			fp->f_flag &= ~FNONBLOCK;
549 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
550 		break;
551 
552 	case FIOASYNC:
553 		if ((tmp = *(int *)data) != 0)
554 			fp->f_flag |= FASYNC;
555 		else
556 			fp->f_flag &= ~FASYNC;
557 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
558 		break;
559 
560 	case FIOSETOWN:
561 		tmp = *(int *)data;
562 		if (fp->f_type == DTYPE_SOCKET) {
563 			struct socket *so = (struct socket *)fp->f_data;
564 
565 			so->so_pgid = tmp;
566 			so->so_siguid = p->p_cred->p_ruid;
567 			so->so_sigeuid = p->p_ucred->cr_uid;
568 			error = 0;
569 			break;
570 		}
571 		if (tmp <= 0) {
572 			tmp = -tmp;
573 		} else {
574 			struct proc *p1 = pfind(tmp);
575 			if (p1 == 0) {
576 				error = ESRCH;
577 				break;
578 			}
579 			tmp = p1->p_pgrp->pg_id;
580 		}
581 		error = (*fp->f_ops->fo_ioctl)
582 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
583 		break;
584 
585 	case FIOGETOWN:
586 		if (fp->f_type == DTYPE_SOCKET) {
587 			error = 0;
588 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
589 			break;
590 		}
591 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
592 		*(int *)data = -*(int *)data;
593 		break;
594 
595 	default:
596 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
597 		break;
598 	}
599 	/*
600 	 * Copy any data to user, size was
601 	 * already set and checked above.
602 	 */
603 	if (error == 0 && (com&IOC_OUT) && size)
604 		error = copyout(data, SCARG(uap, data), (u_int)size);
605 out:
606 	FRELE(fp);
607 	if (memp)
608 		free(memp, M_IOCTLOPS);
609 	return (error);
610 }
611 
/* Wait channel that select/poll callers sleep on (only its address is used). */
int	selwait;
/* Collision counter, bumped by selwakeup() when a selinfo has SI_COLL set. */
int	nselcoll;
613 
614 /*
615  * Select system call.
616  */
617 int
618 sys_select(struct proc *p, void *v, register_t *retval)
619 {
620 	struct sys_select_args /* {
621 		syscallarg(int) nd;
622 		syscallarg(fd_set *) in;
623 		syscallarg(fd_set *) ou;
624 		syscallarg(fd_set *) ex;
625 		syscallarg(struct timeval *) tv;
626 	} */ *uap = v;
627 	fd_mask bits[6];
628 	fd_set *pibits[3], *pobits[3];
629 	struct timeval atv, rtv, ttv;
630 	int s, ncoll, error = 0, timo;
631 	u_int nd, ni;
632 
633 	nd = SCARG(uap, nd);
634 	if (nd > p->p_fd->fd_nfiles) {
635 		/* forgiving; slightly wrong */
636 		nd = p->p_fd->fd_nfiles;
637 	}
638 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
639 	if (nd > sizeof(bits[0])) {
640 		caddr_t mbits;
641 
642 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
643 		bzero(mbits, ni * 6);
644 		pibits[0] = (fd_set *)&mbits[ni * 0];
645 		pibits[1] = (fd_set *)&mbits[ni * 1];
646 		pibits[2] = (fd_set *)&mbits[ni * 2];
647 		pobits[0] = (fd_set *)&mbits[ni * 3];
648 		pobits[1] = (fd_set *)&mbits[ni * 4];
649 		pobits[2] = (fd_set *)&mbits[ni * 5];
650 	} else {
651 		bzero(bits, sizeof(bits));
652 		pibits[0] = (fd_set *)&bits[0];
653 		pibits[1] = (fd_set *)&bits[1];
654 		pibits[2] = (fd_set *)&bits[2];
655 		pobits[0] = (fd_set *)&bits[3];
656 		pobits[1] = (fd_set *)&bits[4];
657 		pobits[2] = (fd_set *)&bits[5];
658 	}
659 
660 #define	getbits(name, x) \
661 	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
662 	    pibits[x], ni))) \
663 		goto done;
664 	getbits(in, 0);
665 	getbits(ou, 1);
666 	getbits(ex, 2);
667 #undef	getbits
668 
669 	if (SCARG(uap, tv)) {
670 		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
671 		if (error)
672 			goto done;
673 		if (itimerfix(&atv)) {
674 			error = EINVAL;
675 			goto done;
676 		}
677 		getmicrouptime(&rtv);
678 		timeradd(&atv, &rtv, &atv);
679 	} else {
680 		atv.tv_sec = 0;
681 		atv.tv_usec = 0;
682 	}
683 	timo = 0;
684 
685 retry:
686 	ncoll = nselcoll;
687 	p->p_flag |= P_SELECT;
688 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
689 	if (error || *retval)
690 		goto done;
691 	if (SCARG(uap, tv)) {
692 		getmicrouptime(&rtv);
693 		if (timercmp(&rtv, &atv, >=))
694 			goto done;
695 		ttv = atv;
696 		timersub(&ttv, &rtv, &ttv);
697 		timo = ttv.tv_sec > 24 * 60 * 60 ?
698 			24 * 60 * 60 * hz : tvtohz(&ttv);
699 	}
700 	s = splhigh();
701 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
702 		splx(s);
703 		goto retry;
704 	}
705 	p->p_flag &= ~P_SELECT;
706 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
707 	splx(s);
708 	if (error == 0)
709 		goto retry;
710 done:
711 	p->p_flag &= ~P_SELECT;
712 	/* select is not restarted after signals... */
713 	if (error == ERESTART)
714 		error = EINTR;
715 	if (error == EWOULDBLOCK)
716 		error = 0;
717 #define	putbits(name, x) \
718 	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
719 	    SCARG(uap, name), ni))) \
720 		error = error2;
721 	if (error == 0) {
722 		int error2;
723 
724 		putbits(in, 0);
725 		putbits(ou, 1);
726 		putbits(ex, 2);
727 #undef putbits
728 	}
729 
730 	if (pibits[0] != (fd_set *)&bits[0])
731 		free(pibits[0], M_TEMP);
732 	return (error);
733 }
734 
735 int
736 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
737     register_t *retval)
738 {
739 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
740 	struct filedesc *fdp = p->p_fd;
741 	int msk, i, j, fd;
742 	fd_mask bits;
743 	struct file *fp;
744 	int n = 0;
745 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
746 
747 	for (msk = 0; msk < 3; msk++) {
748 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
749 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
750 
751 		for (i = 0; i < nfd; i += NFDBITS) {
752 			bits = pibits->fds_bits[i/NFDBITS];
753 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
754 				bits &= ~(1 << j);
755 				if ((fp = fd_getfile(fdp, fd)) == NULL)
756 					return (EBADF);
757 				FREF(fp);
758 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
759 					FD_SET(fd, pobits);
760 					n++;
761 				}
762 				FRELE(fp);
763 			}
764 		}
765 	}
766 	*retval = n;
767 	return (0);
768 }
769 
/*
 * Poll routine that reports normal read and write readiness
 * unconditionally: returns the subset of the requested events that are
 * among POLLIN, POLLOUT, POLLRDNORM and POLLWRNORM.  dev and p are unused.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
777 
778 /*
779  * Record a select request.
780  */
781 void
782 selrecord(struct proc *selector, struct selinfo *sip)
783 {
784 	struct proc *p;
785 	pid_t mypid;
786 
787 	mypid = selector->p_pid;
788 	if (sip->si_selpid == mypid)
789 		return;
790 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
791 	    p->p_wchan == (caddr_t)&selwait)
792 		sip->si_flags |= SI_COLL;
793 	else
794 		sip->si_selpid = mypid;
795 }
796 
797 /*
798  * Do a wakeup when a selectable event occurs.
799  */
800 void
801 selwakeup(struct selinfo *sip)
802 {
803 	struct proc *p;
804 	int s;
805 
806 	if (sip->si_selpid == 0)
807 		return;
808 	if (sip->si_flags & SI_COLL) {
809 		nselcoll++;
810 		sip->si_flags &= ~SI_COLL;
811 		wakeup(&selwait);
812 	}
813 	p = pfind(sip->si_selpid);
814 	sip->si_selpid = 0;
815 	if (p != NULL) {
816 		SCHED_LOCK(s);
817 		if (p->p_wchan == (caddr_t)&selwait) {
818 			if (p->p_stat == SSLEEP)
819 				setrunnable(p);
820 			else
821 				unsleep(p);
822 		} else if (p->p_flag & P_SELECT)
823 			p->p_flag &= ~P_SELECT;
824 		SCHED_UNLOCK(s);
825 	}
826 }
827 
828 void
829 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
830 {
831 	struct filedesc *fdp = p->p_fd;
832 	struct file *fp;
833 	u_int i;
834 	int n = 0;
835 
836 	for (i = 0; i < nfd; i++, pl++) {
837 		/* Check the file descriptor. */
838 		if (pl->fd < 0) {
839 			pl->revents = 0;
840 			continue;
841 		}
842 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
843 			pl->revents = POLLNVAL;
844 			n++;
845 			continue;
846 		}
847 		FREF(fp);
848 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
849 		FRELE(fp);
850 		if (pl->revents != 0)
851 			n++;
852 	}
853 	*retval = n;
854 }
855 
856 /*
857  * We are using the same mechanism as select only we encode/decode args
858  * differently.
859  */
860 int
861 sys_poll(struct proc *p, void *v, register_t *retval)
862 {
863 	struct sys_poll_args /* {
864 		syscallarg(struct pollfd *) fds;
865 		syscallarg(u_int) nfds;
866 		syscallarg(int) timeout;
867 	} */ *uap = v;
868 	size_t sz;
869 	struct pollfd pfds[4], *pl = pfds;
870 	int msec = SCARG(uap, timeout);
871 	struct timeval atv, rtv, ttv;
872 	int timo, ncoll, i, s, error;
873 	extern int nselcoll, selwait;
874 	u_int nfds = SCARG(uap, nfds);
875 
876 	/* Standards say no more than MAX_OPEN; this is possibly better. */
877 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
878 		return (EINVAL);
879 
880 	sz = sizeof(struct pollfd) * nfds;
881 
882 	/* optimize for the default case, of a small nfds value */
883 	if (sz > sizeof(pfds))
884 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
885 
886 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
887 		goto bad;
888 
889 	for (i = 0; i < nfds; i++)
890 		pl[i].revents = 0;
891 
892 	if (msec != INFTIM) {
893 		atv.tv_sec = msec / 1000;
894 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
895 
896 		if (itimerfix(&atv)) {
897 			error = EINVAL;
898 			goto done;
899 		}
900 		getmicrouptime(&rtv);
901 		timeradd(&atv, &rtv, &atv);
902 	} else {
903 		atv.tv_sec = 0;
904 		atv.tv_usec = 0;
905 	}
906 	timo = 0;
907 
908 retry:
909 	ncoll = nselcoll;
910 	p->p_flag |= P_SELECT;
911 	pollscan(p, pl, nfds, retval);
912 	if (*retval)
913 		goto done;
914 	if (msec != INFTIM) {
915 		getmicrouptime(&rtv);
916 		if (timercmp(&rtv, &atv, >=))
917 			goto done;
918 		ttv = atv;
919 		timersub(&ttv, &rtv, &ttv);
920 		timo = ttv.tv_sec > 24 * 60 * 60 ?
921 			24 * 60 * 60 * hz : tvtohz(&ttv);
922 	}
923 	s = splhigh();
924 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
925 		splx(s);
926 		goto retry;
927 	}
928 	p->p_flag &= ~P_SELECT;
929 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
930 	splx(s);
931 	if (error == 0)
932 		goto retry;
933 
934 done:
935 	p->p_flag &= ~P_SELECT;
936 	/*
937 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
938 	 *       ignored (since the whole point is to see what would block).
939 	 */
940 	switch (error) {
941 	case ERESTART:
942 		error = copyout(pl, SCARG(uap, fds), sz);
943 		if (error == 0)
944 			error = EINTR;
945 		break;
946 	case EWOULDBLOCK:
947 	case 0:
948 		error = copyout(pl, SCARG(uap, fds), sz);
949 		break;
950 	}
951 bad:
952 	if (pl != pfds)
953 		free(pl, M_TEMP);
954 	return (error);
955 }
956