xref: /openbsd-src/sys/kern/sys_generic.c (revision e76a246632ce83ad1a548ed3149e7a96e5199a58)
1 /*	$OpenBSD: sys_generic.c,v 1.21 1999/11/29 19:56:59 deraadt Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/socketvar.h>
53 #include <sys/signalvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <sys/mount.h>
64 #include <sys/syscallargs.h>
65 
66 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
67 int seltrue __P((dev_t, int, struct proc *));
68 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
69 
70 /*
71  * Read system call.
72  */
73 /* ARGSUSED */
74 int
75 sys_read(p, v, retval)
76 	struct proc *p;
77 	void *v;
78 	register_t *retval;
79 {
80 	register struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	register struct file *fp;
86 	register struct filedesc *fdp = p->p_fd;
87 	struct uio auio;
88 	struct iovec aiov;
89 	long cnt, error = 0;
90 #ifdef KTRACE
91 	struct iovec ktriov;
92 #endif
93 
94 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
95 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
96 	    (fp->f_flag & FREAD) == 0)
97 		return (EBADF);
98 	/* Don't allow nbyte to be larger than max return val */
99 	if (SCARG(uap, nbyte) > SSIZE_MAX)
100 		return(EINVAL);
101 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
102 	aiov.iov_len = SCARG(uap, nbyte);
103 	auio.uio_iov = &aiov;
104 	auio.uio_iovcnt = 1;
105 	auio.uio_resid = SCARG(uap, nbyte);
106 	auio.uio_rw = UIO_READ;
107 	auio.uio_segflg = UIO_USERSPACE;
108 	auio.uio_procp = p;
109 #ifdef KTRACE
110 	/*
111 	 * if tracing, save a copy of iovec
112 	 */
113 	if (KTRPOINT(p, KTR_GENIO))
114 		ktriov = aiov;
115 #endif
116 	cnt = SCARG(uap, nbyte);
117 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
118 	if (error)
119 		if (auio.uio_resid != cnt && (error == ERESTART ||
120 		    error == EINTR || error == EWOULDBLOCK))
121 			error = 0;
122 	cnt -= auio.uio_resid;
123 #ifdef KTRACE
124 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
125 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov,
126 		    cnt, error);
127 #endif
128 	*retval = cnt;
129 	return (error);
130 }
131 
132 /*
133  * Scatter read system call.
134  */
135 int
136 sys_readv(p, v, retval)
137 	struct proc *p;
138 	void *v;
139 	register_t *retval;
140 {
141 	register struct sys_readv_args /* {
142 		syscallarg(int) fd;
143 		syscallarg(struct iovec *) iovp;
144 		syscallarg(int) iovcnt;
145 	} */ *uap = v;
146 	register struct file *fp;
147 	register struct filedesc *fdp = p->p_fd;
148 	struct uio auio;
149 	register struct iovec *iov;
150 	struct iovec *needfree;
151 	struct iovec aiov[UIO_SMALLIOV];
152 	long i, cnt, error = 0;
153 	u_int iovlen;
154 #ifdef KTRACE
155 	struct iovec *ktriov = NULL;
156 #endif
157 
158 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
159 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
160 	    (fp->f_flag & FREAD) == 0)
161 		return (EBADF);
162 	if (SCARG(uap, iovcnt) <= 0)
163 		return (EINVAL);
164 	/* note: can't use iovlen until iovcnt is validated */
165 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
166 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
167 		if (SCARG(uap, iovcnt) > IOV_MAX)
168 			return (EINVAL);
169 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
170 		needfree = iov;
171 	} else {
172 		iov = aiov;
173 		needfree = NULL;
174 	}
175 	auio.uio_iov = iov;
176 	auio.uio_iovcnt = SCARG(uap, iovcnt);
177 	auio.uio_rw = UIO_READ;
178 	auio.uio_segflg = UIO_USERSPACE;
179 	auio.uio_procp = p;
180 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
181 	if (error)
182 		goto done;
183 	auio.uio_resid = 0;
184 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
185 		/* Don't allow sum > SSIZE_MAX */
186 		if (iov->iov_len > SSIZE_MAX ||
187 		    (auio.uio_resid += iov->iov_len) > SSIZE_MAX) {
188 			error = EINVAL;
189 			goto done;
190 		}
191 	}
192 #ifdef KTRACE
193 	/*
194 	 * if tracing, save a copy of iovec
195 	 */
196 	if (KTRPOINT(p, KTR_GENIO))  {
197 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
198 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
199 	}
200 #endif
201 	cnt = auio.uio_resid;
202 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
203 	if (error)
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 #ifdef KTRACE
209 	if (ktriov != NULL) {
210 		if (error == 0)
211 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov,
212 			    cnt, error);
213 		FREE(ktriov, M_TEMP);
214 	}
215 #endif
216 	*retval = cnt;
217 done:
218 	if (needfree)
219 		FREE(needfree, M_IOV);
220 	return (error);
221 }
222 
223 /*
224  * Write system call
225  */
226 int
227 sys_write(p, v, retval)
228 	struct proc *p;
229 	void *v;
230 	register_t *retval;
231 {
232 	register struct sys_write_args /* {
233 		syscallarg(int) fd;
234 		syscallarg(void *) buf;
235 		syscallarg(size_t) nbyte;
236 	} */ *uap = v;
237 	register struct file *fp;
238 	register struct filedesc *fdp = p->p_fd;
239 	struct uio auio;
240 	struct iovec aiov;
241 	long cnt, error = 0;
242 #ifdef KTRACE
243 	struct iovec ktriov;
244 #endif
245 
246 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
247 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
248 	    (fp->f_flag & FWRITE) == 0)
249 		return (EBADF);
250 	/* Don't allow nbyte to be larger than max return val */
251 	if (SCARG(uap, nbyte) > SSIZE_MAX)
252 		return(EINVAL);
253 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
254 	aiov.iov_len = SCARG(uap, nbyte);
255 	auio.uio_iov = &aiov;
256 	auio.uio_iovcnt = 1;
257 	auio.uio_resid = SCARG(uap, nbyte);
258 	auio.uio_rw = UIO_WRITE;
259 	auio.uio_segflg = UIO_USERSPACE;
260 	auio.uio_procp = p;
261 #ifdef KTRACE
262 	/*
263 	 * if tracing, save a copy of iovec
264 	 */
265 	if (KTRPOINT(p, KTR_GENIO))
266 		ktriov = aiov;
267 #endif
268 	cnt = SCARG(uap, nbyte);
269 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
270 	if (error) {
271 		if (auio.uio_resid != cnt && (error == ERESTART ||
272 		    error == EINTR || error == EWOULDBLOCK))
273 			error = 0;
274 		if (error == EPIPE)
275 			psignal(p, SIGPIPE);
276 	}
277 	cnt -= auio.uio_resid;
278 #ifdef KTRACE
279 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
280 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
281 		    &ktriov, cnt, error);
282 #endif
283 	*retval = cnt;
284 	return (error);
285 }
286 
287 /*
288  * Gather write system call
289  */
290 int
291 sys_writev(p, v, retval)
292 	struct proc *p;
293 	void *v;
294 	register_t *retval;
295 {
296 	register struct sys_writev_args /* {
297 		syscallarg(int) fd;
298 		syscallarg(struct iovec *) iovp;
299 		syscallarg(int) iovcnt;
300 	} */ *uap = v;
301 	register struct file *fp;
302 	register struct filedesc *fdp = p->p_fd;
303 	struct uio auio;
304 	register struct iovec *iov;
305 	struct iovec *needfree;
306 	struct iovec aiov[UIO_SMALLIOV];
307 	long i, cnt, error = 0;
308 	u_int iovlen;
309 #ifdef KTRACE
310 	struct iovec *ktriov = NULL;
311 #endif
312 
313 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
314 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
315 	    (fp->f_flag & FWRITE) == 0)
316 		return (EBADF);
317 	if (SCARG(uap, iovcnt) <= 0)
318 		return (EINVAL);
319 	/* note: can't use iovlen until iovcnt is validated */
320 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
321 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
322 		if (SCARG(uap, iovcnt) > IOV_MAX)
323 			return (EINVAL);
324 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
325 		needfree = iov;
326 	} else {
327 		iov = aiov;
328 		needfree = NULL;
329 	}
330 	auio.uio_iov = iov;
331 	auio.uio_iovcnt = SCARG(uap, iovcnt);
332 	auio.uio_rw = UIO_WRITE;
333 	auio.uio_segflg = UIO_USERSPACE;
334 	auio.uio_procp = p;
335 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
336 	if (error)
337 		goto done;
338 	auio.uio_resid = 0;
339 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
340 		/* Don't allow sum > SSIZE_MAX */
341 		if (iov->iov_len > SSIZE_MAX ||
342 		    (auio.uio_resid += iov->iov_len) > SSIZE_MAX) {
343 			error = EINVAL;
344 			goto done;
345 		}
346 	}
347 #ifdef KTRACE
348 	/*
349 	 * if tracing, save a copy of iovec
350 	 */
351 	if (KTRPOINT(p, KTR_GENIO))  {
352 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
353 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
354 	}
355 #endif
356 	cnt = auio.uio_resid;
357 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
358 	if (error) {
359 		if (auio.uio_resid != cnt && (error == ERESTART ||
360 		    error == EINTR || error == EWOULDBLOCK))
361 			error = 0;
362 		if (error == EPIPE)
363 			psignal(p, SIGPIPE);
364 	}
365 	cnt -= auio.uio_resid;
366 #ifdef KTRACE
367 	if (ktriov != NULL) {
368 		if (error == 0)
369 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
370 				ktriov, cnt, error);
371 		FREE(ktriov, M_TEMP);
372 	}
373 #endif
374 	*retval = cnt;
375 done:
376 	if (needfree)
377 		FREE(needfree, M_IOV);
378 	return (error);
379 }
380 
381 /*
382  * Ioctl system call
383  */
384 /* ARGSUSED */
385 int
386 sys_ioctl(p, v, retval)
387 	struct proc *p;
388 	void *v;
389 	register_t *retval;
390 {
391 	register struct sys_ioctl_args /* {
392 		syscallarg(int) fd;
393 		syscallarg(u_long) com;
394 		syscallarg(caddr_t) data;
395 	} */ *uap = v;
396 	register struct file *fp;
397 	register struct filedesc *fdp;
398 	register u_long com;
399 	register int error;
400 	register u_int size;
401 	caddr_t data, memp;
402 	int tmp;
403 #define STK_PARAMS	128
404 	char stkbuf[STK_PARAMS];
405 
406 	fdp = p->p_fd;
407 	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
408 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
409 		return (EBADF);
410 
411 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
412 		return (EBADF);
413 
414 	switch (com = SCARG(uap, com)) {
415 	case FIONCLEX:
416 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
417 		return (0);
418 	case FIOCLEX:
419 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
420 		return (0);
421 	}
422 
423 	/*
424 	 * Interpret high order word to find amount of data to be
425 	 * copied to/from the user's address space.
426 	 */
427 	size = IOCPARM_LEN(com);
428 	if (size > IOCPARM_MAX)
429 		return (ENOTTY);
430 	memp = NULL;
431 	if (size > sizeof (stkbuf)) {
432 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
433 		data = memp;
434 	} else
435 		data = stkbuf;
436 	if (com&IOC_IN) {
437 		if (size) {
438 			error = copyin(SCARG(uap, data), data, (u_int)size);
439 			if (error) {
440 				if (memp)
441 					free(memp, M_IOCTLOPS);
442 				return (error);
443 			}
444 		} else
445 			*(caddr_t *)data = SCARG(uap, data);
446 	} else if ((com&IOC_OUT) && size)
447 		/*
448 		 * Zero the buffer so the user always
449 		 * gets back something deterministic.
450 		 */
451 		bzero(data, size);
452 	else if (com&IOC_VOID)
453 		*(caddr_t *)data = SCARG(uap, data);
454 
455 	switch (com) {
456 
457 	case FIONBIO:
458 		if ((tmp = *(int *)data) != 0)
459 			fp->f_flag |= FNONBLOCK;
460 		else
461 			fp->f_flag &= ~FNONBLOCK;
462 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
463 		break;
464 
465 	case FIOASYNC:
466 		if ((tmp = *(int *)data) != 0)
467 			fp->f_flag |= FASYNC;
468 		else
469 			fp->f_flag &= ~FASYNC;
470 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
471 		break;
472 
473 	case FIOSETOWN:
474 		tmp = *(int *)data;
475 		if (fp->f_type == DTYPE_SOCKET) {
476 			struct socket *so = (struct socket *)fp->f_data;
477 
478 			so->so_pgid = tmp;
479 			so->so_siguid = p->p_cred->p_ruid;
480 			so->so_sigeuid = p->p_ucred->cr_uid;
481 			error = 0;
482 			break;
483 		}
484 		if (tmp <= 0) {
485 			tmp = -tmp;
486 		} else {
487 			struct proc *p1 = pfind(tmp);
488 			if (p1 == 0) {
489 				error = ESRCH;
490 				break;
491 			}
492 			tmp = p1->p_pgrp->pg_id;
493 		}
494 		error = (*fp->f_ops->fo_ioctl)
495 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
496 		break;
497 
498 	case FIOGETOWN:
499 		if (fp->f_type == DTYPE_SOCKET) {
500 			error = 0;
501 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
502 			break;
503 		}
504 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
505 		*(int *)data = -*(int *)data;
506 		break;
507 
508 	default:
509 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
510 		/*
511 		 * Copy any data to user, size was
512 		 * already set and checked above.
513 		 */
514 		if (error == 0 && (com&IOC_OUT) && size)
515 			error = copyout(data, SCARG(uap, data), (u_int)size);
516 		break;
517 	}
518 	if (memp)
519 		free(memp, M_IOCTLOPS);
520 	return (error);
521 }
522 
523 int	selwait, nselcoll;
524 
525 /*
526  * Select system call.
527  */
528 int
529 sys_select(p, v, retval)
530 	register struct proc *p;
531 	void *v;
532 	register_t *retval;
533 {
534 	register struct sys_select_args /* {
535 		syscallarg(int) nd;
536 		syscallarg(fd_set *) in;
537 		syscallarg(fd_set *) ou;
538 		syscallarg(fd_set *) ex;
539 		syscallarg(struct timeval *) tv;
540 	} */ *uap = v;
541 	fd_set bits[6], *pibits[3], *pobits[3];
542 	struct timeval atv;
543 	int s, ncoll, error = 0, timo;
544 	u_int ni;
545 
546 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
547 		/* forgiving; slightly wrong */
548 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
549 	}
550 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
551 	if (SCARG(uap, nd) > FD_SETSIZE) {
552 		caddr_t mbits;
553 
554 		if ((mbits = malloc(ni * 6, M_TEMP, M_WAITOK)) == NULL) {
555 			error = EINVAL;
556 			goto cleanup;
557 		}
558 		bzero(mbits, ni * 6);
559 		pibits[0] = (fd_set *)&mbits[ni * 0];
560 		pibits[1] = (fd_set *)&mbits[ni * 1];
561 		pibits[2] = (fd_set *)&mbits[ni * 2];
562 		pobits[0] = (fd_set *)&mbits[ni * 3];
563 		pobits[1] = (fd_set *)&mbits[ni * 4];
564 		pobits[2] = (fd_set *)&mbits[ni * 5];
565 	} else {
566 		bzero((caddr_t)bits, sizeof(bits));
567 		pibits[0] = &bits[0];
568 		pibits[1] = &bits[1];
569 		pibits[2] = &bits[2];
570 		pobits[0] = &bits[3];
571 		pobits[1] = &bits[4];
572 		pobits[2] = &bits[5];
573 	}
574 
575 #define	getbits(name, x) \
576 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
577 	    (caddr_t)pibits[x], ni))) \
578 		goto done;
579 	getbits(in, 0);
580 	getbits(ou, 1);
581 	getbits(ex, 2);
582 #undef	getbits
583 
584 	if (SCARG(uap, tv)) {
585 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
586 			sizeof (atv));
587 		if (error)
588 			goto done;
589 		if (itimerfix(&atv)) {
590 			error = EINVAL;
591 			goto done;
592 		}
593 		s = splclock();
594 		timeradd(&atv, &time, &atv);
595 		timo = hzto(&atv);
596 		/*
597 		 * Avoid inadvertently sleeping forever.
598 		 */
599 		if (timo == 0)
600 			timo = 1;
601 		splx(s);
602 	} else
603 		timo = 0;
604 retry:
605 	ncoll = nselcoll;
606 	p->p_flag |= P_SELECT;
607 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
608 	if (error || *retval)
609 		goto done;
610 	s = splhigh();
611 	/* this should be timercmp(&time, &atv, >=) */
612 	if (SCARG(uap, tv) && (time.tv_sec > atv.tv_sec ||
613 	    (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
614 		splx(s);
615 		goto done;
616 	}
617 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
618 		splx(s);
619 		goto retry;
620 	}
621 	p->p_flag &= ~P_SELECT;
622 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
623 	splx(s);
624 	if (error == 0)
625 		goto retry;
626 done:
627 	p->p_flag &= ~P_SELECT;
628 	/* select is not restarted after signals... */
629 	if (error == ERESTART)
630 		error = EINTR;
631 	if (error == EWOULDBLOCK)
632 		error = 0;
633 #define	putbits(name, x) \
634 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
635 	    (caddr_t)SCARG(uap, name), ni))) \
636 		error = error2;
637 	if (error == 0) {
638 		int error2;
639 
640 		putbits(in, 0);
641 		putbits(ou, 1);
642 		putbits(ex, 2);
643 #undef putbits
644 	}
645 
646 cleanup:
647 	if (pibits[0] != &bits[0])
648 		free(pibits[0], M_TEMP);
649 	return (error);
650 }
651 
652 int
653 selscan(p, ibits, obits, nfd, retval)
654 	struct proc *p;
655 	fd_set *ibits, *obits;
656 	int nfd;
657 	register_t *retval;
658 {
659 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
660 	register struct filedesc *fdp = p->p_fd;
661 	register int msk, i, j, fd;
662 	register fd_mask bits;
663 	struct file *fp;
664 	int ni, n = 0;
665 	static int flag[3] = { FREAD, FWRITE, 0 };
666 
667 	/*
668 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
669 	 * up to the next byte) otherwise the fd_set's are normal sized.
670 	 */
671 	ni = sizeof(fd_set);
672 	if (nfd > FD_SETSIZE)
673 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
674 
675 	for (msk = 0; msk < 3; msk++) {
676 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
677 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
678 
679 		for (i = 0; i < nfd; i += NFDBITS) {
680 			bits = pibits->fds_bits[i/NFDBITS];
681 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
682 				bits &= ~(1 << j);
683 				fp = fdp->fd_ofiles[fd];
684 				if (fp == NULL)
685 					return (EBADF);
686 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
687 					FD_SET(fd, pobits);
688 					n++;
689 				}
690 			}
691 		}
692 	}
693 	*retval = n;
694 	return (0);
695 }
696 
/*
 * Select routine that always reports "ready": every argument is
 * ignored and the result is unconditionally 1.
 */
/*ARGSUSED*/
int
seltrue(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{
	const int always_ready = 1;

	return (always_ready);
}
707 
708 /*
709  * Record a select request.
710  */
711 void
712 selrecord(selector, sip)
713 	struct proc *selector;
714 	struct selinfo *sip;
715 {
716 	struct proc *p;
717 	pid_t mypid;
718 
719 	mypid = selector->p_pid;
720 	if (sip->si_selpid == mypid)
721 		return;
722 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
723 	    p->p_wchan == (caddr_t)&selwait)
724 		sip->si_flags |= SI_COLL;
725 	else
726 		sip->si_selpid = mypid;
727 }
728 
729 /*
730  * Do a wakeup when a selectable event occurs.
731  */
732 void
733 selwakeup(sip)
734 	register struct selinfo *sip;
735 {
736 	register struct proc *p;
737 	int s;
738 
739 	if (sip->si_selpid == 0)
740 		return;
741 	if (sip->si_flags & SI_COLL) {
742 		nselcoll++;
743 		sip->si_flags &= ~SI_COLL;
744 		wakeup((caddr_t)&selwait);
745 	}
746 	p = pfind(sip->si_selpid);
747 	sip->si_selpid = 0;
748 	if (p != NULL) {
749 		s = splhigh();
750 		if (p->p_wchan == (caddr_t)&selwait) {
751 			if (p->p_stat == SSLEEP)
752 				setrunnable(p);
753 			else
754 				unsleep(p);
755 		} else if (p->p_flag & P_SELECT)
756 			p->p_flag &= ~P_SELECT;
757 		splx(s);
758 	}
759 }
760 
761 void
762 pollscan(p, pl, nfd, retval)
763 	struct proc *p;
764 	struct pollfd *pl;
765 	int nfd;
766 	register_t *retval;
767 {
768 	register struct filedesc *fdp = p->p_fd;
769 	register int msk, i;
770 	struct file *fp;
771 	int x, n = 0;
772 	static int flag[3] = { FREAD, FWRITE, 0 };
773 	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };
774 
775 	/*
776 	 * XXX: We need to implement the rest of the flags.
777 	 */
778 	for (i = 0; i < nfd; i++) {
779 		/* Check the file descriptor. */
780 		if (pl[i].fd < 0) {
781 			pl[i].revents = 0;
782 			continue;
783 		}
784 		if (pl[i].fd >= fdp->fd_nfiles) {
785 			pl[i].revents = POLLNVAL;
786 			n++;
787 			continue;
788 		}
789 
790 		fp = fdp->fd_ofiles[pl[i].fd];
791 		if (fp == NULL) {
792 			pl[i].revents = POLLNVAL;
793 			n++;
794 			continue;
795 		}
796 		for (x = msk = 0; msk < 3; msk++) {
797 			if (pl[i].events & pflag[msk]) {
798 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
799 					pl[i].revents |= pflag[msk] &
800 					    pl[i].events;
801 					x++;
802 				}
803 			}
804 		}
805 		if (x)
806 			n++;
807 	}
808 	*retval = n;
809 }
810 
811 /*
812  * We are using the same mechanism as select only we encode/decode args
813  * differently.
814  */
815 int
816 sys_poll(p, v, retval)
817 	register struct proc *p;
818 	void *v;
819 	register_t *retval;
820 {
821 	struct sys_poll_args *uap = v;
822 	size_t sz;
823 	struct pollfd pfds[4], *pl = pfds;
824 	int msec = SCARG(uap, timeout);
825 	struct timeval atv;
826 	int timo, ncoll, i, s, error, error2;
827 	extern int nselcoll, selwait;
828 
829 	/* Standards say no more than MAX_OPEN; this is possibly better. */
830 	if (SCARG(uap, nfds) > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
831 	    maxfiles))
832 		return (EINVAL);
833 
834 	sz = sizeof(struct pollfd) * SCARG(uap, nfds);
835 
836 	/* optimize for the default case, of a small nfds value */
837 	if (sz > sizeof(pfds))
838 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
839 
840 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
841 		goto bad;
842 
843 	for (i = 0; i < SCARG(uap, nfds); i++)
844 		pl[i].revents = 0;
845 
846 	if (msec != -1) {
847 		atv.tv_sec = msec / 1000;
848 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
849 
850 		if (itimerfix(&atv)) {
851 			error = EINVAL;
852 			goto done;
853 		}
854 		s = splclock();
855 		timeradd(&atv, &time, &atv);
856 		timo = hzto(&atv);
857 		/*
858 		 * Avoid inadvertently sleeping forever.
859 		 */
860 		if (timo == 0)
861 			timo = 1;
862 		splx(s);
863 	} else
864 		timo = 0;
865 
866 retry:
867 	ncoll = nselcoll;
868 	p->p_flag |= P_SELECT;
869 	pollscan(p, pl, SCARG(uap, nfds), retval);
870 	if (*retval)
871 		goto done;
872 	s = splhigh();
873 	if (timo && timercmp(&time, &atv, >=)) {
874 		splx(s);
875 		goto done;
876 	}
877 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
878 		splx(s);
879 		goto retry;
880 	}
881 	p->p_flag &= ~P_SELECT;
882 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
883 	splx(s);
884 	if (error == 0)
885 		goto retry;
886 
887 done:
888 	p->p_flag &= ~P_SELECT;
889 	/* poll is not restarted after signals... */
890 	if (error == ERESTART)
891 		error = EINTR;
892 	if (error == EWOULDBLOCK)
893 		error = 0;
894 	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
895 		error = error2;
896 bad:
897 	if (pl != pfds)
898 		free((char *) pl, M_TEMP);
899 	return (error);
900 }
901