xref: /openbsd-src/sys/kern/sys_generic.c (revision 48950c12d106c85f315112191a0228d7b83b9510)
1 /*	$OpenBSD: sys_generic.c,v 1.78 2012/07/09 17:51:08 claudio Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
/* Forward declarations for the select/poll helpers defined later in this file. */
int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
int pollout(struct pollfd *, struct pollfd *, u_int);
68 
69 /*
70  * Read system call.
71  */
72 /* ARGSUSED */
73 int
74 sys_read(struct proc *p, void *v, register_t *retval)
75 {
76 	struct sys_read_args /* {
77 		syscallarg(int) fd;
78 		syscallarg(void *) buf;
79 		syscallarg(size_t) nbyte;
80 	} */ *uap = v;
81 	struct iovec iov;
82 	int fd = SCARG(uap, fd);
83 	struct file *fp;
84 	struct filedesc *fdp = p->p_fd;
85 
86 	if ((fp = fd_getfile(fdp, fd)) == NULL)
87 		return (EBADF);
88 	if ((fp->f_flag & FREAD) == 0)
89 		return (EBADF);
90 
91 	iov.iov_base = SCARG(uap, buf);
92 	iov.iov_len = SCARG(uap, nbyte);
93 
94 	FREF(fp);
95 
96 	/* dofilereadv() will FRELE the descriptor for us */
97 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
98 }
99 
100 /*
101  * Scatter read system call.
102  */
103 int
104 sys_readv(struct proc *p, void *v, register_t *retval)
105 {
106 	struct sys_readv_args /* {
107 		syscallarg(int) fd;
108 		syscallarg(const struct iovec *) iovp;
109 		syscallarg(int) iovcnt;
110 	} */ *uap = v;
111 	int fd = SCARG(uap, fd);
112 	struct file *fp;
113 	struct filedesc *fdp = p->p_fd;
114 
115 	if ((fp = fd_getfile(fdp, fd)) == NULL)
116 		return (EBADF);
117 	if ((fp->f_flag & FREAD) == 0)
118 		return (EBADF);
119 
120 	FREF(fp);
121 
122 	/* dofilereadv() will FRELE the descriptor for us */
123 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
124 	    &fp->f_offset, retval));
125 }
126 
127 int
128 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
129     int iovcnt, int userspace, off_t *offset, register_t *retval)
130 {
131 	struct iovec aiov[UIO_SMALLIOV];
132 	struct uio auio;
133 	struct iovec *iov;
134 	struct iovec *needfree = NULL;
135 	long i, cnt, error = 0;
136 	u_int iovlen;
137 #ifdef KTRACE
138 	struct iovec *ktriov = NULL;
139 #endif
140 
141 	/* note: can't use iovlen until iovcnt is validated */
142 	iovlen = iovcnt * sizeof(struct iovec);
143 
144 	/*
145 	 * If the iovec array exists in userspace, it needs to be copied in;
146 	 * otherwise, it can be used directly.
147 	 */
148 	if (userspace) {
149 		if ((u_int)iovcnt > UIO_SMALLIOV) {
150 			if ((u_int)iovcnt > IOV_MAX) {
151 				error = EINVAL;
152 				goto out;
153 			}
154 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
155 		} else if ((u_int)iovcnt > 0) {
156 			iov = aiov;
157 			needfree = NULL;
158 		} else {
159 			error = EINVAL;
160 			goto out;
161 		}
162 		if ((error = copyin(iovp, iov, iovlen)))
163 			goto done;
164 	} else {
165 		iov = (struct iovec *)iovp;		/* de-constify */
166 	}
167 
168 	auio.uio_iov = iov;
169 	auio.uio_iovcnt = iovcnt;
170 	auio.uio_rw = UIO_READ;
171 	auio.uio_segflg = UIO_USERSPACE;
172 	auio.uio_procp = p;
173 	auio.uio_resid = 0;
174 	for (i = 0; i < iovcnt; i++) {
175 		auio.uio_resid += iov->iov_len;
176 		/*
177 		 * Reads return ssize_t because -1 is returned on error.
178 		 * Therefore we must restrict the length to SSIZE_MAX to
179 		 * avoid garbage return values.  Note that the addition is
180 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
181 		 */
182 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
183 			error = EINVAL;
184 			goto done;
185 		}
186 		iov++;
187 	}
188 #ifdef KTRACE
189 	/*
190 	 * if tracing, save a copy of iovec
191 	 */
192 	if (KTRPOINT(p, KTR_GENIO)) {
193 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
194 		bcopy(auio.uio_iov, ktriov, iovlen);
195 	}
196 #endif
197 	cnt = auio.uio_resid;
198 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
199 	if (error)
200 		if (auio.uio_resid != cnt && (error == ERESTART ||
201 		    error == EINTR || error == EWOULDBLOCK))
202 			error = 0;
203 	cnt -= auio.uio_resid;
204 
205 	fp->f_rxfer++;
206 	fp->f_rbytes += cnt;
207 #ifdef KTRACE
208 	if (ktriov != NULL) {
209 		if (error == 0)
210 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
211 			    error);
212 		free(ktriov, M_TEMP);
213 	}
214 #endif
215 	*retval = cnt;
216  done:
217 	if (needfree)
218 		free(needfree, M_IOV);
219  out:
220 	FRELE(fp, p);
221 	return (error);
222 }
223 
224 /*
225  * Write system call
226  */
227 int
228 sys_write(struct proc *p, void *v, register_t *retval)
229 {
230 	struct sys_write_args /* {
231 		syscallarg(int) fd;
232 		syscallarg(const void *) buf;
233 		syscallarg(size_t) nbyte;
234 	} */ *uap = v;
235 	struct iovec iov;
236 	int fd = SCARG(uap, fd);
237 	struct file *fp;
238 	struct filedesc *fdp = p->p_fd;
239 
240 	if ((fp = fd_getfile(fdp, fd)) == NULL)
241 		return (EBADF);
242 	if ((fp->f_flag & FWRITE) == 0)
243 		return (EBADF);
244 
245 	iov.iov_base = (void *)SCARG(uap, buf);
246 	iov.iov_len = SCARG(uap, nbyte);
247 
248 	FREF(fp);
249 
250 	/* dofilewritev() will FRELE the descriptor for us */
251 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
252 }
253 
254 /*
255  * Gather write system call
256  */
257 int
258 sys_writev(struct proc *p, void *v, register_t *retval)
259 {
260 	struct sys_writev_args /* {
261 		syscallarg(int) fd;
262 		syscallarg(const struct iovec *) iovp;
263 		syscallarg(int) iovcnt;
264 	} */ *uap = v;
265 	int fd = SCARG(uap, fd);
266 	struct file *fp;
267 	struct filedesc *fdp = p->p_fd;
268 
269 	if ((fp = fd_getfile(fdp, fd)) == NULL)
270 		return (EBADF);
271 	if ((fp->f_flag & FWRITE) == 0)
272 		return (EBADF);
273 
274 	FREF(fp);
275 
276 	/* dofilewritev() will FRELE the descriptor for us */
277 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
278 	    &fp->f_offset, retval));
279 }
280 
281 int
282 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
283     int iovcnt, int userspace, off_t *offset, register_t *retval)
284 {
285 	struct iovec aiov[UIO_SMALLIOV];
286 	struct uio auio;
287 	struct iovec *iov;
288 	struct iovec *needfree = NULL;
289 	long i, cnt, error = 0;
290 	u_int iovlen;
291 #ifdef KTRACE
292 	struct iovec *ktriov = NULL;
293 #endif
294 
295 	/* note: can't use iovlen until iovcnt is validated */
296 	iovlen = iovcnt * sizeof(struct iovec);
297 
298 	/*
299 	 * If the iovec array exists in userspace, it needs to be copied in;
300 	 * otherwise, it can be used directly.
301 	 */
302 	if (userspace) {
303 		if ((u_int)iovcnt > UIO_SMALLIOV) {
304 			if ((u_int)iovcnt > IOV_MAX) {
305 				error = EINVAL;
306 				goto out;
307 			}
308 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
309 		} else if ((u_int)iovcnt > 0) {
310 			iov = aiov;
311 			needfree = NULL;
312 		} else {
313 			error = EINVAL;
314 			goto out;
315 		}
316 		if ((error = copyin(iovp, iov, iovlen)))
317 			goto done;
318 	} else {
319 		iov = (struct iovec *)iovp;		/* de-constify */
320 	}
321 
322 	auio.uio_iov = iov;
323 	auio.uio_iovcnt = iovcnt;
324 	auio.uio_rw = UIO_WRITE;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	auio.uio_procp = p;
327 	auio.uio_resid = 0;
328 	for (i = 0; i < iovcnt; i++) {
329 		auio.uio_resid += iov->iov_len;
330 		/*
331 		 * Writes return ssize_t because -1 is returned on error.
332 		 * Therefore we must restrict the length to SSIZE_MAX to
333 		 * avoid garbage return values.  Note that the addition is
334 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
335 		 */
336 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
337 			error = EINVAL;
338 			goto done;
339 		}
340 		iov++;
341 	}
342 #ifdef KTRACE
343 	/*
344 	 * if tracing, save a copy of iovec
345 	 */
346 	if (KTRPOINT(p, KTR_GENIO)) {
347 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
348 		bcopy(auio.uio_iov, ktriov, iovlen);
349 	}
350 #endif
351 	cnt = auio.uio_resid;
352 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
353 	if (error) {
354 		if (auio.uio_resid != cnt && (error == ERESTART ||
355 		    error == EINTR || error == EWOULDBLOCK))
356 			error = 0;
357 		if (error == EPIPE)
358 			ptsignal(p, SIGPIPE, STHREAD);
359 	}
360 	cnt -= auio.uio_resid;
361 
362 	fp->f_wxfer++;
363 	fp->f_wbytes += cnt;
364 #ifdef KTRACE
365 	if (ktriov != NULL) {
366 		if (error == 0)
367 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
368 		free(ktriov, M_TEMP);
369 	}
370 #endif
371 	*retval = cnt;
372  done:
373 	if (needfree)
374 		free(needfree, M_IOV);
375  out:
376 	FRELE(fp, p);
377 	return (error);
378 }
379 
380 /*
381  * Ioctl system call
382  */
383 /* ARGSUSED */
384 int
385 sys_ioctl(struct proc *p, void *v, register_t *retval)
386 {
387 	struct sys_ioctl_args /* {
388 		syscallarg(int) fd;
389 		syscallarg(u_long) com;
390 		syscallarg(void *) data;
391 	} */ *uap = v;
392 	struct file *fp;
393 	struct filedesc *fdp;
394 	u_long com;
395 	int error;
396 	u_int size;
397 	caddr_t data, memp;
398 	int tmp;
399 #define STK_PARAMS	128
400 	long long stkbuf[STK_PARAMS / sizeof(long long)];
401 
402 	fdp = p->p_fd;
403 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
404 		return (EBADF);
405 
406 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
407 		return (EBADF);
408 
409 	switch (com = SCARG(uap, com)) {
410 	case FIONCLEX:
411 	case FIOCLEX:
412 		fdplock(fdp);
413 		if (com == FIONCLEX)
414 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
415 		else
416 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
417 		fdpunlock(fdp);
418 		return (0);
419 	}
420 
421 	/*
422 	 * Interpret high order word to find amount of data to be
423 	 * copied to/from the user's address space.
424 	 */
425 	size = IOCPARM_LEN(com);
426 	if (size > IOCPARM_MAX)
427 		return (ENOTTY);
428 	FREF(fp);
429 	memp = NULL;
430 	if (size > sizeof (stkbuf)) {
431 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
432 		data = memp;
433 	} else
434 		data = (caddr_t)stkbuf;
435 	if (com&IOC_IN) {
436 		if (size) {
437 			error = copyin(SCARG(uap, data), data, (u_int)size);
438 			if (error) {
439 				goto out;
440 			}
441 		} else
442 			*(caddr_t *)data = SCARG(uap, data);
443 	} else if ((com&IOC_OUT) && size)
444 		/*
445 		 * Zero the buffer so the user always
446 		 * gets back something deterministic.
447 		 */
448 		bzero(data, size);
449 	else if (com&IOC_VOID)
450 		*(caddr_t *)data = SCARG(uap, data);
451 
452 	switch (com) {
453 
454 	case FIONBIO:
455 		if ((tmp = *(int *)data) != 0)
456 			fp->f_flag |= FNONBLOCK;
457 		else
458 			fp->f_flag &= ~FNONBLOCK;
459 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
460 		break;
461 
462 	case FIOASYNC:
463 		if ((tmp = *(int *)data) != 0)
464 			fp->f_flag |= FASYNC;
465 		else
466 			fp->f_flag &= ~FASYNC;
467 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
468 		break;
469 
470 	case FIOSETOWN:
471 		tmp = *(int *)data;
472 		if (fp->f_type == DTYPE_SOCKET) {
473 			struct socket *so = (struct socket *)fp->f_data;
474 
475 			so->so_pgid = tmp;
476 			so->so_siguid = p->p_cred->p_ruid;
477 			so->so_sigeuid = p->p_ucred->cr_uid;
478 			error = 0;
479 			break;
480 		}
481 		if (tmp <= 0) {
482 			tmp = -tmp;
483 		} else {
484 			struct process *pr = prfind(tmp);
485 			if (pr == NULL) {
486 				error = ESRCH;
487 				break;
488 			}
489 			tmp = pr->ps_pgrp->pg_id;
490 		}
491 		error = (*fp->f_ops->fo_ioctl)
492 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
493 		break;
494 
495 	case FIOGETOWN:
496 		if (fp->f_type == DTYPE_SOCKET) {
497 			error = 0;
498 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
499 			break;
500 		}
501 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
502 		*(int *)data = -*(int *)data;
503 		break;
504 
505 	default:
506 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
507 		break;
508 	}
509 	/*
510 	 * Copy any data to user, size was
511 	 * already set and checked above.
512 	 */
513 	if (error == 0 && (com&IOC_OUT) && size)
514 		error = copyout(data, SCARG(uap, data), (u_int)size);
515 out:
516 	FRELE(fp, p);
517 	if (memp)
518 		free(memp, M_IOCTLOPS);
519 	return (error);
520 }
521 
/*
 * selwait:  only its address is used, as the channel that sys_select() and
 *           sys_poll() tsleep() on and that selwakeup() wakes.
 * nselcoll: incremented by selwakeup() when a collision had been recorded;
 *           sys_select()/sys_poll() snapshot it before scanning and rescan
 *           instead of sleeping if it changed.
 */
int	selwait, nselcoll;
523 
524 /*
525  * Select system call.
526  */
527 int
528 sys_select(struct proc *p, void *v, register_t *retval)
529 {
530 	struct sys_select_args /* {
531 		syscallarg(int) nd;
532 		syscallarg(fd_set *) in;
533 		syscallarg(fd_set *) ou;
534 		syscallarg(fd_set *) ex;
535 		syscallarg(struct timeval *) tv;
536 	} */ *uap = v;
537 	fd_mask bits[6];
538 	fd_set *pibits[3], *pobits[3];
539 	struct timeval atv, rtv, ttv;
540 	int s, ncoll, error = 0, timo;
541 	u_int nd, ni;
542 
543 	nd = SCARG(uap, nd);
544 	if (nd > p->p_fd->fd_nfiles) {
545 		/* forgiving; slightly wrong */
546 		nd = p->p_fd->fd_nfiles;
547 	}
548 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
549 	if (ni > sizeof(bits[0])) {
550 		caddr_t mbits;
551 
552 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK|M_ZERO);
553 		pibits[0] = (fd_set *)&mbits[ni * 0];
554 		pibits[1] = (fd_set *)&mbits[ni * 1];
555 		pibits[2] = (fd_set *)&mbits[ni * 2];
556 		pobits[0] = (fd_set *)&mbits[ni * 3];
557 		pobits[1] = (fd_set *)&mbits[ni * 4];
558 		pobits[2] = (fd_set *)&mbits[ni * 5];
559 	} else {
560 		bzero(bits, sizeof(bits));
561 		pibits[0] = (fd_set *)&bits[0];
562 		pibits[1] = (fd_set *)&bits[1];
563 		pibits[2] = (fd_set *)&bits[2];
564 		pobits[0] = (fd_set *)&bits[3];
565 		pobits[1] = (fd_set *)&bits[4];
566 		pobits[2] = (fd_set *)&bits[5];
567 	}
568 
569 #define	getbits(name, x) \
570 	if (SCARG(uap, name) && (error = copyin(SCARG(uap, name), \
571 	    pibits[x], ni))) \
572 		goto done;
573 	getbits(in, 0);
574 	getbits(ou, 1);
575 	getbits(ex, 2);
576 #undef	getbits
577 #ifdef KTRACE
578 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
579 		if (SCARG(uap, in)) ktrfdset(p, pibits[0], ni);
580 		if (SCARG(uap, ou)) ktrfdset(p, pibits[1], ni);
581 		if (SCARG(uap, ex)) ktrfdset(p, pibits[2], ni);
582 	}
583 #endif
584 
585 	if (SCARG(uap, tv)) {
586 		error = copyin(SCARG(uap, tv), &atv, sizeof (atv));
587 		if (error)
588 			goto done;
589 #ifdef KTRACE
590 		if (KTRPOINT(p, KTR_STRUCT))
591 			ktrreltimeval(p, &atv);
592 #endif
593 		if (itimerfix(&atv)) {
594 			error = EINVAL;
595 			goto done;
596 		}
597 		getmicrouptime(&rtv);
598 		timeradd(&atv, &rtv, &atv);
599 	} else {
600 		atv.tv_sec = 0;
601 		atv.tv_usec = 0;
602 	}
603 	timo = 0;
604 
605 retry:
606 	ncoll = nselcoll;
607 	atomic_setbits_int(&p->p_flag, P_SELECT);
608 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
609 	if (error || *retval)
610 		goto done;
611 	if (SCARG(uap, tv)) {
612 		getmicrouptime(&rtv);
613 		if (timercmp(&rtv, &atv, >=))
614 			goto done;
615 		ttv = atv;
616 		timersub(&ttv, &rtv, &ttv);
617 		timo = ttv.tv_sec > 24 * 60 * 60 ?
618 			24 * 60 * 60 * hz : tvtohz(&ttv);
619 	}
620 	s = splhigh();
621 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
622 		splx(s);
623 		goto retry;
624 	}
625 	atomic_clearbits_int(&p->p_flag, P_SELECT);
626 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
627 	splx(s);
628 	if (error == 0)
629 		goto retry;
630 done:
631 	atomic_clearbits_int(&p->p_flag, P_SELECT);
632 	/* select is not restarted after signals... */
633 	if (error == ERESTART)
634 		error = EINTR;
635 	if (error == EWOULDBLOCK)
636 		error = 0;
637 #define	putbits(name, x) \
638 	if (SCARG(uap, name) && (error2 = copyout(pobits[x], \
639 	    SCARG(uap, name), ni))) \
640 		error = error2;
641 	if (error == 0) {
642 		int error2;
643 
644 		putbits(in, 0);
645 		putbits(ou, 1);
646 		putbits(ex, 2);
647 #undef putbits
648 #ifdef KTRACE
649 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
650 			if (SCARG(uap, in)) ktrfdset(p, pobits[0], ni);
651 			if (SCARG(uap, ou)) ktrfdset(p, pobits[1], ni);
652 			if (SCARG(uap, ex)) ktrfdset(p, pobits[2], ni);
653 		}
654 #endif
655 	}
656 
657 	if (pibits[0] != (fd_set *)&bits[0])
658 		free(pibits[0], M_TEMP);
659 	return (error);
660 }
661 
662 int
663 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
664     register_t *retval)
665 {
666 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
667 	struct filedesc *fdp = p->p_fd;
668 	int msk, i, j, fd;
669 	fd_mask bits;
670 	struct file *fp;
671 	int n = 0;
672 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
673 
674 	for (msk = 0; msk < 3; msk++) {
675 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
676 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
677 
678 		for (i = 0; i < nfd; i += NFDBITS) {
679 			bits = pibits->fds_bits[i/NFDBITS];
680 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
681 				bits &= ~(1 << j);
682 				if ((fp = fd_getfile(fdp, fd)) == NULL)
683 					return (EBADF);
684 				FREF(fp);
685 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
686 					FD_SET(fd, pobits);
687 					n++;
688 				}
689 				FRELE(fp, p);
690 			}
691 		}
692 	}
693 	*retval = n;
694 	return (0);
695 }
696 
/*
 * Poll routine that reports ordinary reads and writes as always possible:
 * returns the subset of `events' made up of POLLIN, POLLOUT, POLLRDNORM
 * and POLLWRNORM.  dev and p are not looked at.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
704 
/*
 * Poll routine that never reports anything as ready: returns 0 whatever
 * the arguments are.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	const int no_events = 0;

	return (no_events);
}
711 
712 /*
713  * Record a select request.
714  */
715 void
716 selrecord(struct proc *selector, struct selinfo *sip)
717 {
718 	struct proc *p;
719 	pid_t mypid;
720 
721 	mypid = selector->p_pid;
722 	if (sip->si_selpid == mypid)
723 		return;
724 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
725 	    p->p_wchan == (caddr_t)&selwait)
726 		sip->si_flags |= SI_COLL;
727 	else
728 		sip->si_selpid = mypid;
729 }
730 
731 /*
732  * Do a wakeup when a selectable event occurs.
733  */
734 void
735 selwakeup(struct selinfo *sip)
736 {
737 	struct proc *p;
738 	int s;
739 
740 	KNOTE(&sip->si_note, 0);
741 	if (sip->si_selpid == 0)
742 		return;
743 	if (sip->si_flags & SI_COLL) {
744 		nselcoll++;
745 		sip->si_flags &= ~SI_COLL;
746 		wakeup(&selwait);
747 	}
748 	p = pfind(sip->si_selpid);
749 	sip->si_selpid = 0;
750 	if (p != NULL) {
751 		SCHED_LOCK(s);
752 		if (p->p_wchan == (caddr_t)&selwait) {
753 			if (p->p_stat == SSLEEP)
754 				setrunnable(p);
755 			else
756 				unsleep(p);
757 		} else if (p->p_flag & P_SELECT)
758 			atomic_clearbits_int(&p->p_flag, P_SELECT);
759 		SCHED_UNLOCK(s);
760 	}
761 }
762 
763 void
764 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
765 {
766 	struct filedesc *fdp = p->p_fd;
767 	struct file *fp;
768 	u_int i;
769 	int n = 0;
770 
771 	for (i = 0; i < nfd; i++, pl++) {
772 		/* Check the file descriptor. */
773 		if (pl->fd < 0) {
774 			pl->revents = 0;
775 			continue;
776 		}
777 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
778 			pl->revents = POLLNVAL;
779 			n++;
780 			continue;
781 		}
782 		FREF(fp);
783 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
784 		FRELE(fp, p);
785 		if (pl->revents != 0)
786 			n++;
787 	}
788 	*retval = n;
789 }
790 
791 /*
792  * Only copyout the revents field.
793  */
794 int
795 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
796 {
797 	int error = 0;
798 	u_int i = 0;
799 
800 	while (!error && i++ < nfds) {
801 		error = copyout(&pl->revents, &upl->revents,
802 		    sizeof(upl->revents));
803 		pl++;
804 		upl++;
805 	}
806 
807 	return (error);
808 }
809 
810 /*
811  * We are using the same mechanism as select only we encode/decode args
812  * differently.
813  */
814 int
815 sys_poll(struct proc *p, void *v, register_t *retval)
816 {
817 	struct sys_poll_args /* {
818 		syscallarg(struct pollfd *) fds;
819 		syscallarg(u_int) nfds;
820 		syscallarg(int) timeout;
821 	} */ *uap = v;
822 	size_t sz;
823 	struct pollfd pfds[4], *pl = pfds;
824 	int msec = SCARG(uap, timeout);
825 	struct timeval atv, rtv, ttv;
826 	int timo, ncoll, i, s, error;
827 	extern int nselcoll, selwait;
828 	u_int nfds = SCARG(uap, nfds);
829 
830 	/* Standards say no more than MAX_OPEN; this is possibly better. */
831 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
832 		return (EINVAL);
833 
834 	sz = sizeof(struct pollfd) * nfds;
835 
836 	/* optimize for the default case, of a small nfds value */
837 	if (sz > sizeof(pfds))
838 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
839 
840 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
841 		goto bad;
842 
843 	for (i = 0; i < nfds; i++)
844 		pl[i].revents = 0;
845 
846 	if (msec != INFTIM) {
847 		atv.tv_sec = msec / 1000;
848 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
849 
850 		if (itimerfix(&atv)) {
851 			error = EINVAL;
852 			goto done;
853 		}
854 		getmicrouptime(&rtv);
855 		timeradd(&atv, &rtv, &atv);
856 	} else {
857 		atv.tv_sec = 0;
858 		atv.tv_usec = 0;
859 	}
860 	timo = 0;
861 
862 retry:
863 	ncoll = nselcoll;
864 	atomic_setbits_int(&p->p_flag, P_SELECT);
865 	pollscan(p, pl, nfds, retval);
866 	if (*retval)
867 		goto done;
868 	if (msec != INFTIM) {
869 		getmicrouptime(&rtv);
870 		if (timercmp(&rtv, &atv, >=))
871 			goto done;
872 		ttv = atv;
873 		timersub(&ttv, &rtv, &ttv);
874 		timo = ttv.tv_sec > 24 * 60 * 60 ?
875 			24 * 60 * 60 * hz : tvtohz(&ttv);
876 	}
877 	s = splhigh();
878 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
879 		splx(s);
880 		goto retry;
881 	}
882 	atomic_clearbits_int(&p->p_flag, P_SELECT);
883 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
884 	splx(s);
885 	if (error == 0)
886 		goto retry;
887 
888 done:
889 	atomic_clearbits_int(&p->p_flag, P_SELECT);
890 	/*
891 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
892 	 *       ignored (since the whole point is to see what would block).
893 	 */
894 	switch (error) {
895 	case ERESTART:
896 		error = pollout(pl, SCARG(uap, fds), nfds);
897 		if (error == 0)
898 			error = EINTR;
899 		break;
900 	case EWOULDBLOCK:
901 	case 0:
902 		error = pollout(pl, SCARG(uap, fds), nfds);
903 		break;
904 	}
905 bad:
906 	if (pl != pfds)
907 		free(pl, M_TEMP);
908 	return (error);
909 }
910