xref: /openbsd-src/sys/kern/sys_generic.c (revision 43003dfe3ad45d1698bed8a37f2b0f5b14f20d4f)
1 /*	$OpenBSD: sys_generic.c,v 1.66 2009/06/08 23:18:42 deraadt Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 int seltrue(dev_t, int, struct proc *);
67 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
68 int pollout(struct pollfd *, struct pollfd *, u_int);
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(struct proc *p, void *v, register_t *retval)
76 {
77 	struct sys_read_args /* {
78 		syscallarg(int) fd;
79 		syscallarg(void *) buf;
80 		syscallarg(size_t) nbyte;
81 	} */ *uap = v;
82 	struct iovec iov;
83 	int fd = SCARG(uap, fd);
84 	struct file *fp;
85 	struct filedesc *fdp = p->p_fd;
86 
87 	if ((fp = fd_getfile(fdp, fd)) == NULL)
88 		return (EBADF);
89 	if ((fp->f_flag & FREAD) == 0)
90 		return (EBADF);
91 
92 	iov.iov_base = SCARG(uap, buf);
93 	iov.iov_len = SCARG(uap, nbyte);
94 
95 	FREF(fp);
96 
97 	/* dofilereadv() will FRELE the descriptor for us */
98 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
99 }
100 
101 /*
102  * Scatter read system call.
103  */
104 int
105 sys_readv(struct proc *p, void *v, register_t *retval)
106 {
107 	struct sys_readv_args /* {
108 		syscallarg(int) fd;
109 		syscallarg(const struct iovec *) iovp;
110 		syscallarg(int) iovcnt;
111 	} */ *uap = v;
112 	int fd = SCARG(uap, fd);
113 	struct file *fp;
114 	struct filedesc *fdp = p->p_fd;
115 
116 	if ((fp = fd_getfile(fdp, fd)) == NULL)
117 		return (EBADF);
118 	if ((fp->f_flag & FREAD) == 0)
119 		return (EBADF);
120 
121 	FREF(fp);
122 
123 	/* dofilereadv() will FRELE the descriptor for us */
124 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
125 	    &fp->f_offset, retval));
126 }
127 
128 int
129 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
130     int iovcnt, int userspace, off_t *offset, register_t *retval)
131 {
132 	struct iovec aiov[UIO_SMALLIOV];
133 	struct uio auio;
134 	struct iovec *iov;
135 	struct iovec *needfree = NULL;
136 	long i, cnt, error = 0;
137 	u_int iovlen;
138 #ifdef KTRACE
139 	struct iovec *ktriov = NULL;
140 #endif
141 
142 	/* note: can't use iovlen until iovcnt is validated */
143 	iovlen = iovcnt * sizeof(struct iovec);
144 
145 	/*
146 	 * If the iovec array exists in userspace, it needs to be copied in;
147 	 * otherwise, it can be used directly.
148 	 */
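	/*
	 * Up to UIO_SMALLIOV entries fit in the on-stack "aiov" buffer;
	 * larger arrays (bounded by IOV_MAX) get a temporary allocation
	 * that is released at "done".
	 */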
149 	if (userspace) {
150 		if ((u_int)iovcnt > UIO_SMALLIOV) {
151 			if ((u_int)iovcnt > IOV_MAX) {
152 				error = EINVAL;
153 				goto out;
154 			}
155 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
156 		} else if ((u_int)iovcnt > 0) {
157 			iov = aiov;
158 			needfree = NULL;
159 		} else {
160 			error = EINVAL;
161 			goto out;
162 		}
163 		if ((error = copyin(iovp, iov, iovlen)))
164 			goto done;
165 	} else {
166 		iov = (struct iovec *)iovp;		/* de-constify */
167 	}
168 
169 	auio.uio_iov = iov;
170 	auio.uio_iovcnt = iovcnt;
171 	auio.uio_rw = UIO_READ;
172 	auio.uio_segflg = UIO_USERSPACE;
173 	auio.uio_procp = p;
174 	auio.uio_resid = 0;
175 	for (i = 0; i < iovcnt; i++) {
176 		auio.uio_resid += iov->iov_len;
177 		/*
178 		 * Reads return ssize_t because -1 is returned on error.
179 		 * Therefore we must restrict the length to SSIZE_MAX to
180 		 * avoid garbage return values.
181 		 */
182 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
183 			error = EINVAL;
184 			goto done;
185 		}
186 		iov++;
187 	}
188 #ifdef KTRACE
189 	/*
190 	 * if tracing, save a copy of the iovec array
191 	 */
192 	if (KTRPOINT(p, KTR_GENIO))  {
193 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
194 		bcopy(auio.uio_iov, ktriov, iovlen);
195 	}
196 #endif
197 	cnt = auio.uio_resid;
198 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
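	/*
	 * A transfer that is interrupted or would block after some bytes
	 * have already moved still succeeds partially: the error is
	 * dropped and the byte count below reflects what was read.
	 */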
199 	if (error)
200 		if (auio.uio_resid != cnt && (error == ERESTART ||
201 		    error == EINTR || error == EWOULDBLOCK))
202 			error = 0;
203 	cnt -= auio.uio_resid;
204 
205 	fp->f_rxfer++;
206 	fp->f_rbytes += cnt;
207 #ifdef KTRACE
208 	if (ktriov != NULL) {
209 		if (error == 0)
210 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
211 			    error);
212 		free(ktriov, M_TEMP);
213 	}
214 #endif
215 	*retval = cnt;
216  done:
217 	if (needfree)
218 		free(needfree, M_IOV);
219  out:
220 	FRELE(fp);
221 	return (error);
222 }
223 
224 /*
225  * Write system call
226  */
227 int
228 sys_write(struct proc *p, void *v, register_t *retval)
229 {
230 	struct sys_write_args /* {
231 		syscallarg(int) fd;
232 		syscallarg(const void *) buf;
233 		syscallarg(size_t) nbyte;
234 	} */ *uap = v;
235 	struct iovec iov;
236 	int fd = SCARG(uap, fd);
237 	struct file *fp;
238 	struct filedesc *fdp = p->p_fd;
239 
240 	if ((fp = fd_getfile(fdp, fd)) == NULL)
241 		return (EBADF);
242 	if ((fp->f_flag & FWRITE) == 0)
243 		return (EBADF);
244 
245 	iov.iov_base = (void *)SCARG(uap, buf);
246 	iov.iov_len = SCARG(uap, nbyte);
247 
248 	FREF(fp);
249 
250 	/* dofilewritev() will FRELE the descriptor for us */
251 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
252 }
253 
254 /*
255  * Gather write system call
256  */
257 int
258 sys_writev(struct proc *p, void *v, register_t *retval)
259 {
260 	struct sys_writev_args /* {
261 		syscallarg(int) fd;
262 		syscallarg(const struct iovec *) iovp;
263 		syscallarg(int) iovcnt;
264 	} */ *uap = v;
265 	int fd = SCARG(uap, fd);
266 	struct file *fp;
267 	struct filedesc *fdp = p->p_fd;
268 
269 	if ((fp = fd_getfile(fdp, fd)) == NULL)
270 		return (EBADF);
271 	if ((fp->f_flag & FWRITE) == 0)
272 		return (EBADF);
273 
274 	FREF(fp);
275 
276 	/* dofilewritev() will FRELE the descriptor for us */
277 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
278 	    &fp->f_offset, retval));
279 }
280 
281 int
282 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
283     int iovcnt, int userspace, off_t *offset, register_t *retval)
284 {
285 	struct iovec aiov[UIO_SMALLIOV];
286 	struct uio auio;
287 	struct iovec *iov;
288 	struct iovec *needfree = NULL;
289 	long i, cnt, error = 0;
290 	u_int iovlen;
291 #ifdef KTRACE
292 	struct iovec *ktriov = NULL;
293 #endif
294 
295 	/* note: can't use iovlen until iovcnt is validated */
296 	iovlen = iovcnt * sizeof(struct iovec);
297 
298 	/*
299 	 * If the iovec array exists in userspace, it needs to be copied in;
300 	 * otherwise, it can be used directly.
301 	 */
302 	if (userspace) {
303 		if ((u_int)iovcnt > UIO_SMALLIOV) {
304 			if ((u_int)iovcnt > IOV_MAX) {
305 				error = EINVAL;
306 				goto out;
307 			}
308 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
309 		} else if ((u_int)iovcnt > 0) {
310 			iov = aiov;
311 			needfree = NULL;
312 		} else {
313 			error = EINVAL;
314 			goto out;
315 		}
316 		if ((error = copyin(iovp, iov, iovlen)))
317 			goto done;
318 	} else {
319 		iov = (struct iovec *)iovp;		/* de-constify */
320 	}
321 
322 	auio.uio_iov = iov;
323 	auio.uio_iovcnt = iovcnt;
324 	auio.uio_rw = UIO_WRITE;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	auio.uio_procp = p;
327 	auio.uio_resid = 0;
328 	for (i = 0; i < iovcnt; i++) {
329 		auio.uio_resid += iov->iov_len;
330 		/*
331 		 * Writes return ssize_t because -1 is returned on error.
332 		 * Therefore we must restrict the length to SSIZE_MAX to
333 		 * avoid garbage return values.
334 		 */
335 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
336 			error = EINVAL;
337 			goto done;
338 		}
339 		iov++;
340 	}
341 #ifdef KTRACE
342 	/*
343 	 * if tracing, save a copy of the iovec array
344 	 */
345 	if (KTRPOINT(p, KTR_GENIO))  {
346 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
347 		bcopy(auio.uio_iov, ktriov, iovlen);
348 	}
349 #endif
350 	cnt = auio.uio_resid;
351 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
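	/*
	 * As with reads, an interrupted transfer that already moved data
	 * reports the partial byte count.  The file layer only returns
	 * EPIPE for a write with no reader, so SIGPIPE is raised here.
	 */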
352 	if (error) {
353 		if (auio.uio_resid != cnt && (error == ERESTART ||
354 		    error == EINTR || error == EWOULDBLOCK))
355 			error = 0;
356 		if (error == EPIPE)
357 			ptsignal(p, SIGPIPE, STHREAD);
358 	}
359 	cnt -= auio.uio_resid;
360 
361 	fp->f_wxfer++;
362 	fp->f_wbytes += cnt;
363 #ifdef KTRACE
364 	if (ktriov != NULL) {
365 		if (error == 0)
366 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
367 		free(ktriov, M_TEMP);
368 	}
369 #endif
370 	*retval = cnt;
371  done:
372 	if (needfree)
373 		free(needfree, M_IOV);
374  out:
375 	FRELE(fp);
376 	return (error);
377 }
378 
379 /*
380  * Ioctl system call
381  */
382 /* ARGSUSED */
383 int
384 sys_ioctl(struct proc *p, void *v, register_t *retval)
385 {
386 	struct sys_ioctl_args /* {
387 		syscallarg(int) fd;
388 		syscallarg(u_long) com;
389 		syscallarg(void *) data;
390 	} */ *uap = v;
391 	struct file *fp;
392 	struct filedesc *fdp;
393 	u_long com;
394 	int error;
395 	u_int size;
396 	caddr_t data, memp;
397 	int tmp;
398 #define STK_PARAMS	128
399 	char stkbuf[STK_PARAMS];
400 
401 	fdp = p->p_fd;
402 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
403 		return (EBADF);
404 
405 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
406 		return (EBADF);
407 
408 	switch (com = SCARG(uap, com)) {
409 	case FIONCLEX:
410 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
411 		return (0);
412 	case FIOCLEX:
413 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
414 		return (0);
415 	}
416 
417 	/*
418 	 * Interpret high order word to find amount of data to be
419 	 * copied to/from the user's address space.
420 	 */
421 	size = IOCPARM_LEN(com);
422 	if (size > IOCPARM_MAX)
423 		return (ENOTTY);
424 	FREF(fp);
425 	memp = NULL;
426 	if (size > sizeof (stkbuf)) {
427 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
428 		data = memp;
429 	} else
430 		data = stkbuf;
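	/*
	 * The IOC_IN/IOC_OUT/IOC_VOID bits of the command say which way
	 * the argument travels: IOC_IN copies the user buffer in, IOC_OUT
	 * hands the handler a zeroed buffer to fill for the copyout below,
	 * and IOC_VOID (or a zero-sized IOC_IN) passes the raw pointer
	 * through.  FIONBIO, for example, is defined with _IOW() and so
	 * carries IOC_IN and an int-sized argument.
	 */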
431 	if (com&IOC_IN) {
432 		if (size) {
433 			error = copyin(SCARG(uap, data), data, (u_int)size);
434 			if (error) {
435 				goto out;
436 			}
437 		} else
438 			*(caddr_t *)data = SCARG(uap, data);
439 	} else if ((com&IOC_OUT) && size)
440 		/*
441 		 * Zero the buffer so the user always
442 		 * gets back something deterministic.
443 		 */
444 		bzero(data, size);
445 	else if (com&IOC_VOID)
446 		*(caddr_t *)data = SCARG(uap, data);
447 
448 	switch (com) {
449 
450 	case FIONBIO:
451 		if ((tmp = *(int *)data) != 0)
452 			fp->f_flag |= FNONBLOCK;
453 		else
454 			fp->f_flag &= ~FNONBLOCK;
455 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
456 		break;
457 
458 	case FIOASYNC:
459 		if ((tmp = *(int *)data) != 0)
460 			fp->f_flag |= FASYNC;
461 		else
462 			fp->f_flag &= ~FASYNC;
463 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
464 		break;
465 
466 	case FIOSETOWN:
467 		tmp = *(int *)data;
468 		if (fp->f_type == DTYPE_SOCKET) {
469 			struct socket *so = (struct socket *)fp->f_data;
470 
471 			so->so_pgid = tmp;
472 			so->so_siguid = p->p_cred->p_ruid;
473 			so->so_sigeuid = p->p_ucred->cr_uid;
474 			error = 0;
475 			break;
476 		}
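		/*
		 * For non-sockets the argument follows the F_SETOWN style
		 * encoding: a positive value names a process and is mapped
		 * to its process group, a non-positive value is a negated
		 * process group id.  Either way the result is handed to
		 * the TIOCSPGRP ioctl below.
		 */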
477 		if (tmp <= 0) {
478 			tmp = -tmp;
479 		} else {
480 			struct proc *p1 = pfind(tmp);
481 			if (p1 == 0) {
482 				error = ESRCH;
483 				break;
484 			}
485 			tmp = p1->p_pgrp->pg_id;
486 		}
487 		error = (*fp->f_ops->fo_ioctl)
488 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
489 		break;
490 
491 	case FIOGETOWN:
492 		if (fp->f_type == DTYPE_SOCKET) {
493 			error = 0;
494 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
495 			break;
496 		}
497 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
498 		*(int *)data = -*(int *)data;
499 		break;
500 
501 	default:
502 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
503 		break;
504 	}
505 	/*
506 	 * Copy any data to user, size was
507 	 * already set and checked above.
508 	 */
509 	if (error == 0 && (com&IOC_OUT) && size)
510 		error = copyout(data, SCARG(uap, data), (u_int)size);
511 out:
512 	FRELE(fp);
513 	if (memp)
514 		free(memp, M_IOCTLOPS);
515 	return (error);
516 }
517 
518 int	selwait, nselcoll;
519 
520 /*
521  * Select system call.
522  */
523 int
524 sys_select(struct proc *p, void *v, register_t *retval)
525 {
526 	struct sys_select_args /* {
527 		syscallarg(int) nd;
528 		syscallarg(fd_set *) in;
529 		syscallarg(fd_set *) ou;
530 		syscallarg(fd_set *) ex;
531 		syscallarg(struct timeval *) tv;
532 	} */ *uap = v;
533 	fd_mask bits[6];
534 	fd_set *pibits[3], *pobits[3];
535 	struct timeval atv, rtv, ttv;
536 	int s, ncoll, error = 0, timo;
537 	u_int nd, ni;
538 
539 	nd = SCARG(uap, nd);
540 	if (nd > p->p_fd->fd_nfiles) {
541 		/* forgiving; slightly wrong */
542 		nd = p->p_fd->fd_nfiles;
543 	}
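	/*
	 * Six descriptor sets travel together: the three input sets
	 * (read, write, except) followed by their three output
	 * counterparts.  When one fd_mask per set is enough they live in
	 * "bits" on the stack; otherwise a single contiguous temporary
	 * buffer is allocated and carved up below.
	 */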
544 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
545 	if (ni > sizeof(bits[0])) {
546 		caddr_t mbits;
547 
548 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK|M_ZERO);
549 		pibits[0] = (fd_set *)&mbits[ni * 0];
550 		pibits[1] = (fd_set *)&mbits[ni * 1];
551 		pibits[2] = (fd_set *)&mbits[ni * 2];
552 		pobits[0] = (fd_set *)&mbits[ni * 3];
553 		pobits[1] = (fd_set *)&mbits[ni * 4];
554 		pobits[2] = (fd_set *)&mbits[ni * 5];
555 	} else {
556 		bzero(bits, sizeof(bits));
557 		pibits[0] = (fd_set *)&bits[0];
558 		pibits[1] = (fd_set *)&bits[1];
559 		pibits[2] = (fd_set *)&bits[2];
560 		pobits[0] = (fd_set *)&bits[3];
561 		pobits[1] = (fd_set *)&bits[4];
562 		pobits[2] = (fd_set *)&bits[5];
563 	}
564 
565 #define	getbits(name, x) \
566 	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
567 	    pibits[x], ni))) \
568 		goto done;
569 	getbits(in, 0);
570 	getbits(ou, 1);
571 	getbits(ex, 2);
572 #undef	getbits
573 
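	/*
	 * The timeout is turned into an absolute wakeup time so that the
	 * remaining interval can be recomputed on each pass through the
	 * retry loop below.
	 */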
574 	if (SCARG(uap, tv)) {
575 		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
576 		if (error)
577 			goto done;
578 		if (itimerfix(&atv)) {
579 			error = EINVAL;
580 			goto done;
581 		}
582 		getmicrouptime(&rtv);
583 		timeradd(&atv, &rtv, &atv);
584 	} else {
585 		atv.tv_sec = 0;
586 		atv.tv_usec = 0;
587 	}
588 	timo = 0;
589 
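	/*
	 * selscan() can race with selwakeup().  P_SELECT together with the
	 * global nselcoll counter detects a wakeup that slipped in between
	 * the scan and the sleep; when that happens the scan is redone
	 * instead of sleeping.
	 */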
590 retry:
591 	ncoll = nselcoll;
592 	atomic_setbits_int(&p->p_flag, P_SELECT);
593 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
594 	if (error || *retval)
595 		goto done;
596 	if (SCARG(uap, tv)) {
597 		getmicrouptime(&rtv);
598 		if (timercmp(&rtv, &atv, >=))
599 			goto done;
600 		ttv = atv;
601 		timersub(&ttv, &rtv, &ttv);
602 		timo = ttv.tv_sec > 24 * 60 * 60 ?
603 			24 * 60 * 60 * hz : tvtohz(&ttv);
604 	}
605 	s = splhigh();
606 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
607 		splx(s);
608 		goto retry;
609 	}
610 	atomic_clearbits_int(&p->p_flag, P_SELECT);
611 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
612 	splx(s);
613 	if (error == 0)
614 		goto retry;
615 done:
616 	atomic_clearbits_int(&p->p_flag, P_SELECT);
617 	/* select is not restarted after signals... */
618 	if (error == ERESTART)
619 		error = EINTR;
620 	if (error == EWOULDBLOCK)
621 		error = 0;
622 #define	putbits(name, x) \
623 	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
624 	    SCARG(uap, name), ni))) \
625 		error = error2;
626 	if (error == 0) {
627 		int error2;
628 
629 		putbits(in, 0);
630 		putbits(ou, 1);
631 		putbits(ex, 2);
632 #undef putbits
633 	}
634 
635 	if (pibits[0] != (fd_set *)&bits[0])
636 		free(pibits[0], M_TEMP);
637 	return (error);
638 }
639 
640 int
641 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
642     register_t *retval)
643 {
644 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
645 	struct filedesc *fdp = p->p_fd;
646 	int msk, i, j, fd;
647 	fd_mask bits;
648 	struct file *fp;
649 	int n = 0;
650 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
651 
652 	for (msk = 0; msk < 3; msk++) {
653 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
654 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
655 
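		/*
		 * Walk the set one fd_mask word at a time; ffs() jumps
		 * straight to the next requested descriptor, and each one
		 * is polled for the event class of the current set.
		 */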
656 		for (i = 0; i < nfd; i += NFDBITS) {
657 			bits = pibits->fds_bits[i/NFDBITS];
658 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
659 				bits &= ~(1 << j);
660 				if ((fp = fd_getfile(fdp, fd)) == NULL)
661 					return (EBADF);
662 				FREF(fp);
663 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
664 					FD_SET(fd, pobits);
665 					n++;
666 				}
667 				FRELE(fp);
668 			}
669 		}
670 	}
671 	*retval = n;
672 	return (0);
673 }
674 
675 /*ARGSUSED*/
676 int
677 seltrue(dev_t dev, int events, struct proc *p)
678 {
679 
680 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
681 }
682 
683 /*
684  * Record a select request.
685  */
686 void
687 selrecord(struct proc *selector, struct selinfo *sip)
688 {
689 	struct proc *p;
690 	pid_t mypid;
691 
692 	mypid = selector->p_pid;
693 	if (sip->si_selpid == mypid)
694 		return;
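	/*
	 * Only a single selecting pid fits in the selinfo.  If another
	 * process is already recorded and still asleep on selwait, flag a
	 * collision so selwakeup() wakes everyone; otherwise take over
	 * the slot.
	 */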
695 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
696 	    p->p_wchan == (caddr_t)&selwait)
697 		sip->si_flags |= SI_COLL;
698 	else
699 		sip->si_selpid = mypid;
700 }
701 
702 /*
703  * Do a wakeup when a selectable event occurs.
704  */
705 void
706 selwakeup(struct selinfo *sip)
707 {
708 	struct proc *p;
709 	int s;
710 
711 	if (sip->si_selpid == 0)
712 		return;
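	/*
	 * After a collision more than one process may be asleep on the
	 * shared selwait channel, so wake them all and let each one
	 * rescan its descriptors.
	 */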
713 	if (sip->si_flags & SI_COLL) {
714 		nselcoll++;
715 		sip->si_flags &= ~SI_COLL;
716 		wakeup(&selwait);
717 	}
718 	p = pfind(sip->si_selpid);
719 	sip->si_selpid = 0;
720 	if (p != NULL) {
721 		SCHED_LOCK(s);
722 		if (p->p_wchan == (caddr_t)&selwait) {
723 			if (p->p_stat == SSLEEP)
724 				setrunnable(p);
725 			else
726 				unsleep(p);
727 		} else if (p->p_flag & P_SELECT)
728 			atomic_clearbits_int(&p->p_flag, P_SELECT);
729 		SCHED_UNLOCK(s);
730 	}
731 }
732 
733 void
734 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
735 {
736 	struct filedesc *fdp = p->p_fd;
737 	struct file *fp;
738 	u_int i;
739 	int n = 0;
740 
741 	for (i = 0; i < nfd; i++, pl++) {
742 		/* Check the file descriptor. */
743 		if (pl->fd < 0) {
744 			pl->revents = 0;
745 			continue;
746 		}
747 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
748 			pl->revents = POLLNVAL;
749 			n++;
750 			continue;
751 		}
752 		FREF(fp);
753 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
754 		FRELE(fp);
755 		if (pl->revents != 0)
756 			n++;
757 	}
758 	*retval = n;
759 }
760 
761 /*
762  * Only copyout the revents field.
763  */
764 int
765 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
766 {
767 	int error = 0;
768 	u_int i = 0;
769 
770 	while (!error && i++ < nfds) {
771 		error = copyout(&pl->revents, &upl->revents,
772 		    sizeof(upl->revents));
773 		pl++;
774 		upl++;
775 	}
776 
777 	return (error);
778 }
779 
780 /*
781  * We are using the same mechanism as select only we encode/decode args
782  * differently.
783  */
784 int
785 sys_poll(struct proc *p, void *v, register_t *retval)
786 {
787 	struct sys_poll_args /* {
788 		syscallarg(struct pollfd *) fds;
789 		syscallarg(u_int) nfds;
790 		syscallarg(int) timeout;
791 	} */ *uap = v;
792 	size_t sz;
793 	struct pollfd pfds[4], *pl = pfds;
794 	int msec = SCARG(uap, timeout);
795 	struct timeval atv, rtv, ttv;
796 	int timo, ncoll, i, s, error;
797 	extern int nselcoll, selwait;
798 	u_int nfds = SCARG(uap, nfds);
799 
800 	/* Standards say no more than MAX_OPEN; this is possibly better. */
801 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
802 		return (EINVAL);
803 
804 	sz = sizeof(struct pollfd) * nfds;
805 
806 	/* optimize for the default case, of a small nfds value */
807 	if (sz > sizeof(pfds))
808 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
809 
810 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
811 		goto bad;
812 
813 	for (i = 0; i < nfds; i++)
814 		pl[i].revents = 0;
815 
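	/*
	 * As in select, the millisecond timeout becomes an absolute
	 * deadline; INFTIM (-1) means block until an event arrives.
	 */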
816 	if (msec != INFTIM) {
817 		atv.tv_sec = msec / 1000;
818 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
819 
820 		if (itimerfix(&atv)) {
821 			error = EINVAL;
822 			goto done;
823 		}
824 		getmicrouptime(&rtv);
825 		timeradd(&atv, &rtv, &atv);
826 	} else {
827 		atv.tv_sec = 0;
828 		atv.tv_usec = 0;
829 	}
830 	timo = 0;
831 
832 retry:
833 	ncoll = nselcoll;
834 	atomic_setbits_int(&p->p_flag, P_SELECT);
835 	pollscan(p, pl, nfds, retval);
836 	if (*retval)
837 		goto done;
838 	if (msec != INFTIM) {
839 		getmicrouptime(&rtv);
840 		if (timercmp(&rtv, &atv, >=))
841 			goto done;
842 		ttv = atv;
843 		timersub(&ttv, &rtv, &ttv);
844 		timo = ttv.tv_sec > 24 * 60 * 60 ?
845 			24 * 60 * 60 * hz : tvtohz(&ttv);
846 	}
847 	s = splhigh();
848 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
849 		splx(s);
850 		goto retry;
851 	}
852 	atomic_clearbits_int(&p->p_flag, P_SELECT);
853 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
854 	splx(s);
855 	if (error == 0)
856 		goto retry;
857 
858 done:
859 	atomic_clearbits_int(&p->p_flag, P_SELECT);
860 	/*
861 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
862 	 *       ignored (since the whole point is to see what would block).
863 	 */
864 	switch (error) {
865 	case ERESTART:
866 		error = pollout(pl, SCARG(uap, fds), nfds);
867 		if (error == 0)
868 			error = EINTR;
869 		break;
870 	case EWOULDBLOCK:
871 	case 0:
872 		error = pollout(pl, SCARG(uap, fds), nfds);
873 		break;
874 	}
875 bad:
876 	if (pl != pfds)
877 		free(pl, M_TEMP);
878 	return (error);
879 }
880