1 /*	$OpenBSD: sys_generic.c,v 1.56 2007/03/24 16:01:22 art Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 int seltrue(dev_t, int, struct proc *);
67 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
68 
69 void sel_clean_proclist(struct proc *);
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct proc *p, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int) fd;
80 		syscallarg(void *) buf;
81 		syscallarg(size_t) nbyte;
82 	} */ *uap = v;
83 	int fd = SCARG(uap, fd);
84 	struct file *fp;
85 	struct filedesc *fdp = p->p_fd;
86 
87 	if ((fp = fd_getfile(fdp, fd)) == NULL)
88 		return (EBADF);
89 	if ((fp->f_flag & FREAD) == 0)
90 		return (EBADF);
91 
92 	FREF(fp);
93 
94 	/* dofileread() will FRELE the descriptor for us */
95 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
96 	    &fp->f_offset, retval));
97 }
98 
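/*
 * Common backend for read(2): wrap the user buffer in a single-segment
 * uio, hand it to the file's fo_read method and account for any short
 * transfer.  The caller must have FREF'd fp; it is FRELE'd here on
 * every return path.
 */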
99 int
100 dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
101     off_t *offset, register_t *retval)
102 {
103 	struct uio auio;
104 	struct iovec aiov;
105 	long cnt, error = 0;
106 #ifdef KTRACE
107 	struct iovec ktriov;
108 #endif
109 
110 	aiov.iov_base = buf;
111 	aiov.iov_len = nbyte;
112 	auio.uio_iov = &aiov;
113 	auio.uio_iovcnt = 1;
114 	auio.uio_resid = nbyte;
115 	auio.uio_rw = UIO_READ;
116 	auio.uio_segflg = UIO_USERSPACE;
117 	auio.uio_procp = p;
118 
119 	/*
120 	 * Reads return ssize_t because -1 is returned on error.  Therefore
121 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
122 	 * values.
123 	 */
124 	if (auio.uio_resid > SSIZE_MAX) {
125 		error = EINVAL;
126 		goto out;
127 	}
128 
129 #ifdef KTRACE
130 	/*
131 	 * if tracing, save a copy of iovec
132 	 */
133 	if (KTRPOINT(p, KTR_GENIO))
134 		ktriov = aiov;
135 #endif
136 	cnt = auio.uio_resid;
137 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
138 	if (error)
139 		if (auio.uio_resid != cnt && (error == ERESTART ||
140 		    error == EINTR || error == EWOULDBLOCK))
141 			error = 0;
142 	cnt -= auio.uio_resid;
143 
144 	fp->f_rxfer++;
145 	fp->f_rbytes += cnt;
146 #ifdef KTRACE
147 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
148 		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
149 #endif
150 	*retval = cnt;
151  out:
152 	FRELE(fp);
153 	return (error);
154 }
155 
156 /*
157  * Scatter read system call.
158  */
159 int
160 sys_readv(struct proc *p, void *v, register_t *retval)
161 {
162 	struct sys_readv_args /* {
163 		syscallarg(int) fd;
164 		syscallarg(const struct iovec *) iovp;
165 		syscallarg(int) iovcnt;
166 	} */ *uap = v;
167 	int fd = SCARG(uap, fd);
168 	struct file *fp;
169 	struct filedesc *fdp = p->p_fd;
170 
171 	if ((fp = fd_getfile(fdp, fd)) == NULL)
172 		return (EBADF);
173 	if ((fp->f_flag & FREAD) == 0)
174 		return (EBADF);
175 
176 	FREF(fp);
177 
178 	/* dofilereadv() will FRELE the descriptor for us */
179 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
180 	    &fp->f_offset, retval));
181 }
182 
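/*
 * Common backend for readv(2): copy the iovec array in from userspace
 * (a small on-stack array is used for up to UIO_SMALLIOV entries,
 * otherwise one is malloc'd), check the total length against SSIZE_MAX
 * and pass the resulting uio to fo_read.  fp is FRELE'd before return.
 */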
183 int
184 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
185     int iovcnt, off_t *offset, register_t *retval)
186 {
187 	struct uio auio;
188 	struct iovec *iov;
189 	struct iovec *needfree;
190 	struct iovec aiov[UIO_SMALLIOV];
191 	long i, cnt, error = 0;
192 	u_int iovlen;
193 #ifdef KTRACE
194 	struct iovec *ktriov = NULL;
195 #endif
196 
197 	/* note: can't use iovlen until iovcnt is validated */
198 	iovlen = iovcnt * sizeof(struct iovec);
199 	if ((u_int)iovcnt > UIO_SMALLIOV) {
200 		if ((u_int)iovcnt > IOV_MAX) {
201 			error = EINVAL;
202 			goto out;
203 		}
204 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
205 	} else if ((u_int)iovcnt > 0) {
206 		iov = aiov;
207 		needfree = NULL;
208 	} else {
209 		error = EINVAL;
210 		goto out;
211 	}
212 
213 	auio.uio_iov = iov;
214 	auio.uio_iovcnt = iovcnt;
215 	auio.uio_rw = UIO_READ;
216 	auio.uio_segflg = UIO_USERSPACE;
217 	auio.uio_procp = p;
218 	error = copyin(iovp, iov, iovlen);
219 	if (error)
220 		goto done;
221 	auio.uio_resid = 0;
222 	for (i = 0; i < iovcnt; i++) {
223 		auio.uio_resid += iov->iov_len;
224 		/*
225 		 * Reads return ssize_t because -1 is returned on error.
226 		 * Therefore we must restrict the length to SSIZE_MAX to
227 		 * avoid garbage return values.
228 		 */
229 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
230 			error = EINVAL;
231 			goto done;
232 		}
233 		iov++;
234 	}
235 #ifdef KTRACE
236 	/*
237 	 * if tracing, save a copy of iovec
238 	 */
239 	if (KTRPOINT(p, KTR_GENIO))  {
240 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
241 		bcopy(auio.uio_iov, ktriov, iovlen);
242 	}
243 #endif
244 	cnt = auio.uio_resid;
245 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
246 	if (error)
247 		if (auio.uio_resid != cnt && (error == ERESTART ||
248 		    error == EINTR || error == EWOULDBLOCK))
249 			error = 0;
250 	cnt -= auio.uio_resid;
251 
252 	fp->f_rxfer++;
253 	fp->f_rbytes += cnt;
254 #ifdef KTRACE
255 	if (ktriov != NULL) {
256 		if (error == 0)
257 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
258 			    error);
259 		free(ktriov, M_TEMP);
260 	}
261 #endif
262 	*retval = cnt;
263  done:
264 	if (needfree)
265 		free(needfree, M_IOV);
266  out:
267 	FRELE(fp);
268 	return (error);
269 }
270 
271 /*
272  * Write system call
273  */
274 int
275 sys_write(struct proc *p, void *v, register_t *retval)
276 {
277 	struct sys_write_args /* {
278 		syscallarg(int) fd;
279 		syscallarg(const void *) buf;
280 		syscallarg(size_t) nbyte;
281 	} */ *uap = v;
282 	int fd = SCARG(uap, fd);
283 	struct file *fp;
284 	struct filedesc *fdp = p->p_fd;
285 
286 	if ((fp = fd_getfile(fdp, fd)) == NULL)
287 		return (EBADF);
288 	if ((fp->f_flag & FWRITE) == 0)
289 		return (EBADF);
290 
291 	FREF(fp);
292 
293 	/* dofilewrite() will FRELE the descriptor for us */
294 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
295 	    &fp->f_offset, retval));
296 }
297 
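/*
 * Common backend for write(2): the write-side counterpart of
 * dofileread().  A write that fails with EPIPE additionally raises
 * SIGPIPE on the calling process.
 */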
298 int
299 dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
300     size_t nbyte, off_t *offset, register_t *retval)
301 {
302 	struct uio auio;
303 	struct iovec aiov;
304 	long cnt, error = 0;
305 #ifdef KTRACE
306 	struct iovec ktriov;
307 #endif
308 
309 	aiov.iov_base = (void *)buf;		/* XXX kills const */
310 	aiov.iov_len = nbyte;
311 	auio.uio_iov = &aiov;
312 	auio.uio_iovcnt = 1;
313 	auio.uio_resid = nbyte;
314 	auio.uio_rw = UIO_WRITE;
315 	auio.uio_segflg = UIO_USERSPACE;
316 	auio.uio_procp = p;
317 
318 	/*
319 	 * Writes return ssize_t because -1 is returned on error.  Therefore
320 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
321 	 * values.
322 	 */
323 	if (auio.uio_resid > SSIZE_MAX) {
324 		error = EINVAL;
325 		goto out;
326 	}
327 
328 #ifdef KTRACE
329 	/*
330 	 * if tracing, save a copy of iovec
331 	 */
332 	if (KTRPOINT(p, KTR_GENIO))
333 		ktriov = aiov;
334 #endif
335 	cnt = auio.uio_resid;
336 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
337 	if (error) {
338 		if (auio.uio_resid != cnt && (error == ERESTART ||
339 		    error == EINTR || error == EWOULDBLOCK))
340 			error = 0;
341 		if (error == EPIPE)
342 			psignal(p, SIGPIPE);
343 	}
344 	cnt -= auio.uio_resid;
345 
346 	fp->f_wxfer++;
347 	fp->f_wbytes += cnt;
348 #ifdef KTRACE
349 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
350 		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
351 #endif
352 	*retval = cnt;
353  out:
354 	FRELE(fp);
355 	return (error);
356 }
357 
358 /*
359  * Gather write system call
360  */
361 int
362 sys_writev(struct proc *p, void *v, register_t *retval)
363 {
364 	struct sys_writev_args /* {
365 		syscallarg(int) fd;
366 		syscallarg(const struct iovec *) iovp;
367 		syscallarg(int) iovcnt;
368 	} */ *uap = v;
369 	int fd = SCARG(uap, fd);
370 	struct file *fp;
371 	struct filedesc *fdp = p->p_fd;
372 
373 	if ((fp = fd_getfile(fdp, fd)) == NULL)
374 		return (EBADF);
375 	if ((fp->f_flag & FWRITE) == 0)
376 		return (EBADF);
377 
378 	FREF(fp);
379 
380 	/* dofilewritev() will FRELE the descriptor for us */
381 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
382 	    &fp->f_offset, retval));
383 }
384 
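/*
 * Common backend for writev(2): like dofilereadv(), but for the write
 * direction, including the EPIPE/SIGPIPE handling of dofilewrite().
 */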
385 int
386 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
387     int iovcnt, off_t *offset, register_t *retval)
388 {
389 	struct uio auio;
390 	struct iovec *iov;
391 	struct iovec *needfree;
392 	struct iovec aiov[UIO_SMALLIOV];
393 	long i, cnt, error = 0;
394 	u_int iovlen;
395 #ifdef KTRACE
396 	struct iovec *ktriov = NULL;
397 #endif
398 
399 	/* note: can't use iovlen until iovcnt is validated */
400 	iovlen = iovcnt * sizeof(struct iovec);
401 	if ((u_int)iovcnt > UIO_SMALLIOV) {
402 		if ((u_int)iovcnt > IOV_MAX) {
403 			error = EINVAL;
404 			goto out;
405 		}
406 		iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
407 	} else if ((u_int)iovcnt > 0) {
408 		iov = aiov;
409 		needfree = NULL;
410 	} else {
411 		error = EINVAL;
412 		goto out;
413 	}
414 
415 	auio.uio_iov = iov;
416 	auio.uio_iovcnt = iovcnt;
417 	auio.uio_rw = UIO_WRITE;
418 	auio.uio_segflg = UIO_USERSPACE;
419 	auio.uio_procp = p;
420 	error = copyin(iovp, iov, iovlen);
421 	if (error)
422 		goto done;
423 	auio.uio_resid = 0;
424 	for (i = 0; i < iovcnt; i++) {
425 		auio.uio_resid += iov->iov_len;
426 		/*
427 		 * Writes return ssize_t because -1 is returned on error.
428 		 * Therefore we must restrict the length to SSIZE_MAX to
429 		 * avoid garbage return values.
430 		 */
431 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
432 			error = EINVAL;
433 			goto done;
434 		}
435 		iov++;
436 	}
437 #ifdef KTRACE
438 	/*
439 	 * if tracing, save a copy of iovec
440 	 */
441 	if (KTRPOINT(p, KTR_GENIO))  {
442 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
443 		bcopy(auio.uio_iov, ktriov, iovlen);
444 	}
445 #endif
446 	cnt = auio.uio_resid;
447 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
448 	if (error) {
449 		if (auio.uio_resid != cnt && (error == ERESTART ||
450 		    error == EINTR || error == EWOULDBLOCK))
451 			error = 0;
452 		if (error == EPIPE)
453 			psignal(p, SIGPIPE);
454 	}
455 	cnt -= auio.uio_resid;
456 
457 	fp->f_wxfer++;
458 	fp->f_wbytes += cnt;
459 #ifdef KTRACE
460 	if (ktriov != NULL) {
461 		if (error == 0)
462 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
463 		free(ktriov, M_TEMP);
464 	}
465 #endif
466 	*retval = cnt;
467  done:
468 	if (needfree)
469 		free(needfree, M_IOV);
470  out:
471 	FRELE(fp);
472 	return (error);
473 }
474 
475 /*
476  * Ioctl system call
477  */
478 /* ARGSUSED */
479 int
480 sys_ioctl(struct proc *p, void *v, register_t *retval)
481 {
482 	struct sys_ioctl_args /* {
483 		syscallarg(int) fd;
484 		syscallarg(u_long) com;
485 		syscallarg(void *) data;
486 	} */ *uap = v;
487 	struct file *fp;
488 	struct filedesc *fdp;
489 	u_long com;
490 	int error;
491 	u_int size;
492 	caddr_t data, memp;
493 	int tmp;
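	/* ioctl arguments up to STK_PARAMS bytes are kept on the stack. */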
494 #define STK_PARAMS	128
495 	char stkbuf[STK_PARAMS];
496 
497 	fdp = p->p_fd;
498 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
499 		return (EBADF);
500 
501 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
502 		return (EBADF);
503 
504 	switch (com = SCARG(uap, com)) {
505 	case FIONCLEX:
506 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
507 		return (0);
508 	case FIOCLEX:
509 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
510 		return (0);
511 	}
512 
513 	/*
514 	 * Interpret high order word to find amount of data to be
515 	 * copied to/from the user's address space.
516 	 */
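	/*
	 * For example, a command defined as _IOR('t', 3, int) encodes
	 * IOC_OUT and sizeof(int) in its high bits, so IOCPARM_LEN()
	 * yields sizeof(int) here and the result is copied back out
	 * below.
	 */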
517 	size = IOCPARM_LEN(com);
518 	if (size > IOCPARM_MAX)
519 		return (ENOTTY);
520 	FREF(fp);
521 	memp = NULL;
522 	if (size > sizeof (stkbuf)) {
523 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
524 		data = memp;
525 	} else
526 		data = stkbuf;
527 	if (com&IOC_IN) {
528 		if (size) {
529 			error = copyin(SCARG(uap, data), data, (u_int)size);
530 			if (error) {
531 				goto out;
532 			}
533 		} else
534 			*(caddr_t *)data = SCARG(uap, data);
535 	} else if ((com&IOC_OUT) && size)
536 		/*
537 		 * Zero the buffer so the user always
538 		 * gets back something deterministic.
539 		 */
540 		bzero(data, size);
541 	else if (com&IOC_VOID)
542 		*(caddr_t *)data = SCARG(uap, data);
543 
544 	switch (com) {
545 
546 	case FIONBIO:
547 		if ((tmp = *(int *)data) != 0)
548 			fp->f_flag |= FNONBLOCK;
549 		else
550 			fp->f_flag &= ~FNONBLOCK;
551 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
552 		break;
553 
554 	case FIOASYNC:
555 		if ((tmp = *(int *)data) != 0)
556 			fp->f_flag |= FASYNC;
557 		else
558 			fp->f_flag &= ~FASYNC;
559 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
560 		break;
561 
562 	case FIOSETOWN:
563 		tmp = *(int *)data;
564 		if (fp->f_type == DTYPE_SOCKET) {
565 			struct socket *so = (struct socket *)fp->f_data;
566 
567 			so->so_pgid = tmp;
568 			so->so_siguid = p->p_cred->p_ruid;
569 			so->so_sigeuid = p->p_ucred->cr_uid;
570 			error = 0;
571 			break;
572 		}
573 		if (tmp <= 0) {
574 			tmp = -tmp;
575 		} else {
576 			struct proc *p1 = pfind(tmp);
577 			if (p1 == 0) {
578 				error = ESRCH;
579 				break;
580 			}
581 			tmp = p1->p_pgrp->pg_id;
582 		}
583 		error = (*fp->f_ops->fo_ioctl)
584 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
585 		break;
586 
587 	case FIOGETOWN:
588 		if (fp->f_type == DTYPE_SOCKET) {
589 			error = 0;
590 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
591 			break;
592 		}
593 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
594 		*(int *)data = -*(int *)data;
595 		break;
596 
597 	default:
598 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
599 		break;
600 	}
601 	/*
602 	 * Copy any data out to the user; the size was
603 	 * already set and checked above.
604 	 */
605 	if (error == 0 && (com&IOC_OUT) && size)
606 		error = copyout(data, SCARG(uap, data), (u_int)size);
607 out:
608 	FRELE(fp);
609 	if (memp)
610 		free(memp, M_IOCTLOPS);
611 	return (error);
612 }
613 
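/*
 * Every process blocked in select(2) or poll(2) sleeps on the single
 * selwait channel.  nselcoll counts collisions (more than one process
 * selecting on the same selinfo); the retry loops below compare it
 * against a snapshot to detect wakeups that raced with the scan.
 */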
614 int	selwait, nselcoll;
615 
616 /*
617  * Select system call.
618  */
619 int
620 sys_select(struct proc *p, void *v, register_t *retval)
621 {
622 	struct sys_select_args /* {
623 		syscallarg(int) nd;
624 		syscallarg(fd_set *) in;
625 		syscallarg(fd_set *) ou;
626 		syscallarg(fd_set *) ex;
627 		syscallarg(struct timeval *) tv;
628 	} */ *uap = v;
629 	fd_mask bits[6];
630 	fd_set *pibits[3], *pobits[3];
631 	struct timeval atv, rtv, ttv;
632 	int s, ncoll, error = 0, timo;
633 	u_int nd, ni;
634 
635 	nd = SCARG(uap, nd);
636 	if (nd > p->p_fd->fd_nfiles) {
637 		/* forgiving, but slightly wrong: clamp instead of EINVAL */
638 		nd = p->p_fd->fd_nfiles;
639 	}
640 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
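	/*
	 * Six bitmaps are needed: read/write/except sets for both input
	 * and output.  Small sets fit in bits[] on the stack; larger ones
	 * are allocated as a single contiguous block.
	 */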
641 	if (nd > sizeof(bits[0])) {
642 		caddr_t mbits;
643 
644 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK);
645 		bzero(mbits, ni * 6);
646 		pibits[0] = (fd_set *)&mbits[ni * 0];
647 		pibits[1] = (fd_set *)&mbits[ni * 1];
648 		pibits[2] = (fd_set *)&mbits[ni * 2];
649 		pobits[0] = (fd_set *)&mbits[ni * 3];
650 		pobits[1] = (fd_set *)&mbits[ni * 4];
651 		pobits[2] = (fd_set *)&mbits[ni * 5];
652 	} else {
653 		bzero(bits, sizeof(bits));
654 		pibits[0] = (fd_set *)&bits[0];
655 		pibits[1] = (fd_set *)&bits[1];
656 		pibits[2] = (fd_set *)&bits[2];
657 		pobits[0] = (fd_set *)&bits[3];
658 		pobits[1] = (fd_set *)&bits[4];
659 		pobits[2] = (fd_set *)&bits[5];
660 	}
661 
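	/* Copy each supplied fd_set from userspace into the input bitmaps. */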
662 #define	getbits(name, x) \
663 	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
664 	    pibits[x], ni))) \
665 		goto done;
666 	getbits(in, 0);
667 	getbits(ou, 1);
668 	getbits(ex, 2);
669 #undef	getbits
670 
671 	if (SCARG(uap, tv)) {
672 		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
673 		if (error)
674 			goto done;
675 		if (itimerfix(&atv)) {
676 			error = EINVAL;
677 			goto done;
678 		}
679 		getmicrouptime(&rtv);
680 		timeradd(&atv, &rtv, &atv);
681 	} else {
682 		atv.tv_sec = 0;
683 		atv.tv_usec = 0;
684 	}
685 	timo = 0;
686 
687 retry:
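	/*
	 * Scan the descriptors once; if nothing is ready and time remains,
	 * sleep on selwait and rescan.  The P_SELECT flag and the nselcoll
	 * snapshot detect wakeups that arrive between the scan and the
	 * tsleep() below.
	 */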
688 	ncoll = nselcoll;
689 	atomic_setbits_int(&p->p_flag, P_SELECT);
690 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
691 	if (error || *retval)
692 		goto done;
693 	if (SCARG(uap, tv)) {
694 		getmicrouptime(&rtv);
695 		if (timercmp(&rtv, &atv, >=))
696 			goto done;
697 		ttv = atv;
698 		timersub(&ttv, &rtv, &ttv);
699 		timo = ttv.tv_sec > 24 * 60 * 60 ?
700 			24 * 60 * 60 * hz : tvtohz(&ttv);
701 	}
702 	s = splhigh();
703 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
704 		splx(s);
705 		goto retry;
706 	}
707 	atomic_clearbits_int(&p->p_flag, P_SELECT);
708 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
709 	splx(s);
710 	if (error == 0)
711 		goto retry;
712 done:
713 	sel_clean_proclist(p);
714 	atomic_clearbits_int(&p->p_flag, P_SELECT);
715 	/* select is not restarted after signals... */
716 	if (error == ERESTART)
717 		error = EINTR;
718 	if (error == EWOULDBLOCK)
719 		error = 0;
720 #define	putbits(name, x) \
721 	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
722 	    SCARG(uap, name), ni))) \
723 		error = error2;
724 	if (error == 0) {
725 		int error2;
726 
727 		putbits(in, 0);
728 		putbits(ou, 1);
729 		putbits(ex, 2);
730 #undef putbits
731 	}
732 
733 	if (pibits[0] != (fd_set *)&bits[0])
734 		free(pibits[0], M_TEMP);
735 	return (error);
736 }
737 
738 int
739 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
740     register_t *retval)
741 {
742 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
743 	struct filedesc *fdp = p->p_fd;
744 	int msk, i, j, fd;
745 	fd_mask bits;
746 	struct file *fp;
747 	int n = 0;
748 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
749 
750 	for (msk = 0; msk < 3; msk++) {
751 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
752 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
753 
754 		for (i = 0; i < nfd; i += NFDBITS) {
755 			bits = pibits->fds_bits[i/NFDBITS];
756 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
757 				bits &= ~(1 << j);
758 				if ((fp = fd_getfile(fdp, fd)) == NULL)
759 					return (EBADF);
760 				FREF(fp);
761 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
762 					FD_SET(fd, pobits);
763 					n++;
764 				}
765 				FRELE(fp);
766 			}
767 		}
768 	}
769 	*retval = n;
770 	return (0);
771 }
772 
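/*
 * Generic poll routine for devices that are always ready: report the
 * requested read/write events as immediately true.
 */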
773 /*ARGSUSED*/
774 int
775 seltrue(dev_t dev, int events, struct proc *p)
776 {
777 
778 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
779 }
780 
781 /*
782  * Record a select request.
783  */
784 void
785 selrecord(struct proc *selector, struct selinfo *sip)
786 {
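	/*
	 * Only one process can be recorded per selinfo; any additional
	 * selector just flags a collision so that selwakeup() wakes
	 * everyone sleeping on selwait.
	 */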
787 	if (sip->si_selproc == NULL) {
788 		sip->si_selproc = selector;
789 		TAILQ_INSERT_TAIL(&selector->p_selects, sip, si_list);
790 	} else if (sip->si_selproc != selector) {
791 		sip->si_flags |= SI_COLL;
792 	}
793 }
794 
795 /*
796  * Do a wakeup when a selectable event occurs.
797  */
798 void
799 selwakeup(struct selinfo *sip)
800 {
801 	struct proc *p;
802 	int s;
803 
804 	if (sip->si_flags & SI_COLL) {
805 		nselcoll++;
806 		sip->si_flags &= ~SI_COLL;
807 		wakeup(&selwait);
808 	}
809 
810 	/*
811 	 * Check si_selproc once without the lock so the common case
812 	 * avoids SCHED_LOCK, then re-check it under the lock before
813 	 * waking the recorded process.
814 	 */
815 	if (sip->si_selproc == NULL)
816 		return;
817 
818 	SCHED_LOCK(s);
819 	if ((p = sip->si_selproc) != NULL) {
820 		if (p->p_wchan != NULL) {
821 			if (p->p_stat == SSLEEP)
822 				setrunnable(p);
823 			else
824 				unsleep(p);
825 		} else {
826 			atomic_clearbits_int(&p->p_flag, P_SELECT);
827 		}
828 	}
829 	SCHED_UNLOCK(s);
830 }
831 
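/*
 * Detach the process from every selinfo it registered through
 * selrecord(); called once a select(2) or poll(2) invocation is done.
 */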
832 void
833 sel_clean_proclist(struct proc *p)
834 {
835 	struct selinfo *sip;
836 
837 	while ((sip = TAILQ_FIRST(&p->p_selects)) != NULL) {
838 		sip->si_selproc = NULL;
839 		TAILQ_REMOVE(&p->p_selects, sip, si_list);
840 	}
841 }
842 
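/*
 * Poll every descriptor in the array once.  Negative fds are skipped,
 * unknown fds report POLLNVAL, and *retval is set to the number of
 * entries with non-zero revents.
 */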
843 void
844 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
845 {
846 	struct filedesc *fdp = p->p_fd;
847 	struct file *fp;
848 	u_int i;
849 	int n = 0;
850 
851 	for (i = 0; i < nfd; i++, pl++) {
852 		/* Check the file descriptor. */
853 		if (pl->fd < 0) {
854 			pl->revents = 0;
855 			continue;
856 		}
857 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
858 			pl->revents = POLLNVAL;
859 			n++;
860 			continue;
861 		}
862 		FREF(fp);
863 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
864 		FRELE(fp);
865 		if (pl->revents != 0)
866 			n++;
867 	}
868 	*retval = n;
869 }
870 
871 /*
872  * We use the same mechanism as select(2); only the way the arguments
873  * are encoded and decoded differs.
874  */
875 int
876 sys_poll(struct proc *p, void *v, register_t *retval)
877 {
878 	struct sys_poll_args /* {
879 		syscallarg(struct pollfd *) fds;
880 		syscallarg(u_int) nfds;
881 		syscallarg(int) timeout;
882 	} */ *uap = v;
883 	size_t sz;
884 	struct pollfd pfds[4], *pl = pfds;
885 	int msec = SCARG(uap, timeout);
886 	struct timeval atv, rtv, ttv;
887 	int timo, ncoll, i, s, error;
888 	extern int nselcoll, selwait;
889 	u_int nfds = SCARG(uap, nfds);
890 
891 	/* Standards say no more than OPEN_MAX; this is possibly better. */
892 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
893 		return (EINVAL);
894 
895 	sz = sizeof(struct pollfd) * nfds;
896 
897 	/* Optimize for the default case of a small nfds value. */
898 	if (sz > sizeof(pfds))
899 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
900 
901 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
902 		goto bad;
903 
904 	for (i = 0; i < nfds; i++)
905 		pl[i].revents = 0;
906 
907 	if (msec != INFTIM) {
908 		atv.tv_sec = msec / 1000;
909 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
910 
911 		if (itimerfix(&atv)) {
912 			error = EINVAL;
913 			goto done;
914 		}
915 		getmicrouptime(&rtv);
916 		timeradd(&atv, &rtv, &atv);
917 	} else {
918 		atv.tv_sec = 0;
919 		atv.tv_usec = 0;
920 	}
921 	timo = 0;
922 
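	/*
	 * Same scan/sleep/retry protocol as sys_select() above.
	 */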
923 retry:
924 	ncoll = nselcoll;
925 	atomic_setbits_int(&p->p_flag, P_SELECT);
926 	pollscan(p, pl, nfds, retval);
927 	if (*retval)
928 		goto done;
929 	if (msec != INFTIM) {
930 		getmicrouptime(&rtv);
931 		if (timercmp(&rtv, &atv, >=))
932 			goto done;
933 		ttv = atv;
934 		timersub(&ttv, &rtv, &ttv);
935 		timo = ttv.tv_sec > 24 * 60 * 60 ?
936 			24 * 60 * 60 * hz : tvtohz(&ttv);
937 	}
938 	s = splhigh();
939 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
940 		splx(s);
941 		goto retry;
942 	}
943 	atomic_clearbits_int(&p->p_flag, P_SELECT);
944 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
945 	splx(s);
946 	if (error == 0)
947 		goto retry;
948 
949 done:
950 	sel_clean_proclist(p);
951 	atomic_clearbits_int(&p->p_flag, P_SELECT);
952 	/*
953 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
954 	 *       ignored (since the whole point is to see what would block).
955 	 */
956 	switch (error) {
957 	case ERESTART:
958 		error = copyout(pl, SCARG(uap, fds), sz);
959 		if (error == 0)
960 			error = EINTR;
961 		break;
962 	case EWOULDBLOCK:
963 	case 0:
964 		error = copyout(pl, SCARG(uap, fds), sz);
965 		break;
966 	}
967 bad:
968 	if (pl != pfds)
969 		free(pl, M_TEMP);
970 	return (error);
971 }
972