xref: /openbsd-src/sys/kern/sys_generic.c (revision 1f681a38ff09a42a10d6fd9d1fe951b7460bf87a)
1 /*	$OpenBSD: sys_generic.c,v 1.14 1998/07/28 19:47:07 millert Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/socketvar.h>
52 #include <sys/signalvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/stat.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #ifdef KTRACE
59 #include <sys/ktrace.h>
60 #endif
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
65 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
66 int seltrue __P((dev_t, int, struct proc *));
67 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
68 
69 /*
70  * Read system call.
71  */
72 /* ARGSUSED */
73 int
74 sys_read(p, v, retval)
75 	struct proc *p;
76 	void *v;
77 	register_t *retval;
78 {
79 	register struct sys_read_args /* {
80 		syscallarg(int) fd;
81 		syscallarg(void *) buf;
82 		syscallarg(size_t) nbyte;
83 	} */ *uap = v;
84 	register struct file *fp;
85 	register struct filedesc *fdp = p->p_fd;
86 	struct uio auio;
87 	struct iovec aiov;
88 	long cnt, error = 0;
89 #ifdef KTRACE
90 	struct iovec ktriov;
91 #endif
92 
93 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
94 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
95 	    (fp->f_flag & FREAD) == 0)
96 		return (EBADF);
97 	/* Don't allow nbyte to be larger than max return val */
98 	if (SCARG(uap, nbyte) > SSIZE_MAX)
99 		return(EINVAL);
100 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
101 	aiov.iov_len = SCARG(uap, nbyte);
102 	auio.uio_iov = &aiov;
103 	auio.uio_iovcnt = 1;
104 	auio.uio_resid = SCARG(uap, nbyte);
105 	auio.uio_rw = UIO_READ;
106 	auio.uio_segflg = UIO_USERSPACE;
107 	auio.uio_procp = p;
108 #ifdef KTRACE
109 	/*
110 	 * if tracing, save a copy of iovec
111 	 */
112 	if (KTRPOINT(p, KTR_GENIO))
113 		ktriov = aiov;
114 #endif
115 	cnt = SCARG(uap, nbyte);
116 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
117 	if (error)
118 		if (auio.uio_resid != cnt && (error == ERESTART ||
119 		    error == EINTR || error == EWOULDBLOCK))
120 			error = 0;
121 	cnt -= auio.uio_resid;
122 #ifdef KTRACE
123 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
124 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov,
125 		    cnt, error);
126 #endif
127 	*retval = cnt;
128 	return (error);
129 }
130 
131 /*
132  * Scatter read system call.
133  */
134 int
135 sys_readv(p, v, retval)
136 	struct proc *p;
137 	void *v;
138 	register_t *retval;
139 {
140 	register struct sys_readv_args /* {
141 		syscallarg(int) fd;
142 		syscallarg(struct iovec *) iovp;
143 		syscallarg(int) iovcnt;
144 	} */ *uap = v;
145 	register struct file *fp;
146 	register struct filedesc *fdp = p->p_fd;
147 	struct uio auio;
148 	register struct iovec *iov;
149 	struct iovec *needfree;
150 	struct iovec aiov[UIO_SMALLIOV];
151 	long i, cnt, error = 0;
152 	u_int iovlen;
153 #ifdef KTRACE
154 	struct iovec *ktriov = NULL;
155 #endif
156 
157 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
158 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
159 	    (fp->f_flag & FREAD) == 0)
160 		return (EBADF);
161 	if (SCARG(uap, iovcnt) <= 0)
162 		return (EINVAL);
163 	/* note: can't use iovlen until iovcnt is validated */
164 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
165 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
166 		if (SCARG(uap, iovcnt) > UIO_MAXIOV)
167 			return (EINVAL);
168 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
169 		needfree = iov;
170 	} else {
171 		iov = aiov;
172 		needfree = NULL;
173 	}
174 	auio.uio_iov = iov;
175 	auio.uio_iovcnt = SCARG(uap, iovcnt);
176 	auio.uio_rw = UIO_READ;
177 	auio.uio_segflg = UIO_USERSPACE;
178 	auio.uio_procp = p;
179 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
180 	if (error)
181 		goto done;
182 	auio.uio_resid = 0;
183 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
184 		/* Don't allow sum > SSIZE_MAX */
185 		if ((ssize_t)(auio.uio_resid += iov->iov_len) <= 0) {
186 			error = EINVAL;
187 			goto done;
188 		}
189 	}
190 #ifdef KTRACE
191 	/*
192 	 * if tracing, save a copy of iovec
193 	 */
194 	if (KTRPOINT(p, KTR_GENIO))  {
195 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
196 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
197 	}
198 #endif
199 	cnt = auio.uio_resid;
200 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
201 	if (error)
202 		if (auio.uio_resid != cnt && (error == ERESTART ||
203 		    error == EINTR || error == EWOULDBLOCK))
204 			error = 0;
205 	cnt -= auio.uio_resid;
206 #ifdef KTRACE
207 	if (ktriov != NULL) {
208 		if (error == 0)
209 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov,
210 			    cnt, error);
211 		FREE(ktriov, M_TEMP);
212 	}
213 #endif
214 	*retval = cnt;
215 done:
216 	if (needfree)
217 		FREE(needfree, M_IOV);
218 	return (error);
219 }
220 
221 /*
222  * Write system call
223  */
224 int
225 sys_write(p, v, retval)
226 	struct proc *p;
227 	void *v;
228 	register_t *retval;
229 {
230 	register struct sys_write_args /* {
231 		syscallarg(int) fd;
232 		syscallarg(void *) buf;
233 		syscallarg(size_t) nbyte;
234 	} */ *uap = v;
235 	register struct file *fp;
236 	register struct filedesc *fdp = p->p_fd;
237 	struct uio auio;
238 	struct iovec aiov;
239 	long cnt, error = 0;
240 #ifdef KTRACE
241 	struct iovec ktriov;
242 #endif
243 
244 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
245 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
246 	    (fp->f_flag & FWRITE) == 0)
247 		return (EBADF);
248 	/* Don't allow nbyte to be larger than max return val */
249 	if (SCARG(uap, nbyte) > SSIZE_MAX)
250 		return(EINVAL);
251 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
252 	aiov.iov_len = SCARG(uap, nbyte);
253 	auio.uio_iov = &aiov;
254 	auio.uio_iovcnt = 1;
255 	auio.uio_resid = SCARG(uap, nbyte);
256 	auio.uio_rw = UIO_WRITE;
257 	auio.uio_segflg = UIO_USERSPACE;
258 	auio.uio_procp = p;
259 #ifdef KTRACE
260 	/*
261 	 * if tracing, save a copy of iovec
262 	 */
263 	if (KTRPOINT(p, KTR_GENIO))
264 		ktriov = aiov;
265 #endif
266 	cnt = SCARG(uap, nbyte);
267 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
268 	if (error) {
269 		if (auio.uio_resid != cnt && (error == ERESTART ||
270 		    error == EINTR || error == EWOULDBLOCK))
271 			error = 0;
272 		if (error == EPIPE)
273 			psignal(p, SIGPIPE);
274 	}
275 	cnt -= auio.uio_resid;
276 #ifdef KTRACE
277 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
278 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
279 		    &ktriov, cnt, error);
280 #endif
281 	*retval = cnt;
282 	return (error);
283 }
284 
285 /*
286  * Gather write system call
287  */
288 int
289 sys_writev(p, v, retval)
290 	struct proc *p;
291 	void *v;
292 	register_t *retval;
293 {
294 	register struct sys_writev_args /* {
295 		syscallarg(int) fd;
296 		syscallarg(struct iovec *) iovp;
297 		syscallarg(u_int) iovcnt;
298 	} */ *uap = v;
299 	register struct file *fp;
300 	register struct filedesc *fdp = p->p_fd;
301 	struct uio auio;
302 	register struct iovec *iov;
303 	struct iovec *needfree;
304 	struct iovec aiov[UIO_SMALLIOV];
305 	long i, cnt, error = 0;
306 	u_int iovlen;
307 #ifdef KTRACE
308 	struct iovec *ktriov = NULL;
309 #endif
310 
311 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
312 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
313 	    (fp->f_flag & FWRITE) == 0)
314 		return (EBADF);
315 	if (SCARG(uap, iovcnt) <= 0)
316 		return (EINVAL);
317 	/* note: can't use iovlen until iovcnt is validated */
318 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
319 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
320 		if (SCARG(uap, iovcnt) > UIO_MAXIOV)
321 			return (EINVAL);
322 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
323 		needfree = iov;
324 	} else {
325 		iov = aiov;
326 		needfree = NULL;
327 	}
328 	auio.uio_iov = iov;
329 	auio.uio_iovcnt = SCARG(uap, iovcnt);
330 	auio.uio_rw = UIO_WRITE;
331 	auio.uio_segflg = UIO_USERSPACE;
332 	auio.uio_procp = p;
333 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
334 	if (error)
335 		goto done;
336 	auio.uio_resid = 0;
337 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
338 		/* Don't allow sum > SSIZE_MAX */
339 		if ((ssize_t)(auio.uio_resid += iov->iov_len) <= 0) {
340 			error = EINVAL;
341 			goto done;
342 		}
343 	}
344 #ifdef KTRACE
345 	/*
346 	 * if tracing, save a copy of iovec
347 	 */
348 	if (KTRPOINT(p, KTR_GENIO))  {
349 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
350 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
351 	}
352 #endif
353 	cnt = auio.uio_resid;
354 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
355 	if (error) {
356 		if (auio.uio_resid != cnt && (error == ERESTART ||
357 		    error == EINTR || error == EWOULDBLOCK))
358 			error = 0;
359 		if (error == EPIPE)
360 			psignal(p, SIGPIPE);
361 	}
362 	cnt -= auio.uio_resid;
363 #ifdef KTRACE
364 	if (ktriov != NULL) {
365 		if (error == 0)
366 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
367 				ktriov, cnt, error);
368 		FREE(ktriov, M_TEMP);
369 	}
370 #endif
371 	*retval = cnt;
372 done:
373 	if (needfree)
374 		FREE(needfree, M_IOV);
375 	return (error);
376 }
377 
378 /*
379  * Ioctl system call
380  */
381 /* ARGSUSED */
382 int
383 sys_ioctl(p, v, retval)
384 	struct proc *p;
385 	void *v;
386 	register_t *retval;
387 {
388 	register struct sys_ioctl_args /* {
389 		syscallarg(int) fd;
390 		syscallarg(u_long) com;
391 		syscallarg(caddr_t) data;
392 	} */ *uap = v;
393 	register struct file *fp;
394 	register struct filedesc *fdp;
395 	register u_long com;
396 	register int error;
397 	register u_int size;
398 	caddr_t data, memp;
399 	int tmp;
400 #define STK_PARAMS	128
401 	char stkbuf[STK_PARAMS];
402 
403 	fdp = p->p_fd;
404 	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
405 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
406 		return (EBADF);
407 
408 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
409 		return (EBADF);
410 
411 	switch (com = SCARG(uap, com)) {
412 	case FIONCLEX:
413 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
414 		return (0);
415 	case FIOCLEX:
416 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
417 		return (0);
418 	}
419 
420 	/*
421 	 * Interpret high order word to find amount of data to be
422 	 * copied to/from the user's address space.
423 	 */
424 	size = IOCPARM_LEN(com);
425 	if (size > IOCPARM_MAX)
426 		return (ENOTTY);
427 	memp = NULL;
428 	if (size > sizeof (stkbuf)) {
429 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
430 		data = memp;
431 	} else
432 		data = stkbuf;
433 	if (com&IOC_IN) {
434 		if (size) {
435 			error = copyin(SCARG(uap, data), data, (u_int)size);
436 			if (error) {
437 				if (memp)
438 					free(memp, M_IOCTLOPS);
439 				return (error);
440 			}
441 		} else
442 			*(caddr_t *)data = SCARG(uap, data);
443 	} else if ((com&IOC_OUT) && size)
444 		/*
445 		 * Zero the buffer so the user always
446 		 * gets back something deterministic.
447 		 */
448 		bzero(data, size);
449 	else if (com&IOC_VOID)
450 		*(caddr_t *)data = SCARG(uap, data);
451 
452 	switch (com) {
453 
454 	case FIONBIO:
455 		if ((tmp = *(int *)data) != 0)
456 			fp->f_flag |= FNONBLOCK;
457 		else
458 			fp->f_flag &= ~FNONBLOCK;
459 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
460 		break;
461 
462 	case FIOASYNC:
463 		if ((tmp = *(int *)data) != 0)
464 			fp->f_flag |= FASYNC;
465 		else
466 			fp->f_flag &= ~FASYNC;
467 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
468 		break;
469 
470 	case FIOSETOWN:
471 		tmp = *(int *)data;
472 		if (fp->f_type == DTYPE_SOCKET) {
473 			struct socket *so = (struct socket *)fp->f_data;
474 
475 			so->so_pgid = tmp;
476 			so->so_siguid = p->p_cred->p_ruid;
477 			so->so_sigeuid = p->p_ucred->cr_uid;
478 			error = 0;
479 			break;
480 		}
481 		if (tmp <= 0) {
482 			tmp = -tmp;
483 		} else {
484 			struct proc *p1 = pfind(tmp);
485 			if (p1 == 0) {
486 				error = ESRCH;
487 				break;
488 			}
489 			tmp = p1->p_pgrp->pg_id;
490 		}
491 		error = (*fp->f_ops->fo_ioctl)
492 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
493 		break;
494 
495 	case FIOGETOWN:
496 		if (fp->f_type == DTYPE_SOCKET) {
497 			error = 0;
498 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
499 			break;
500 		}
501 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
502 		*(int *)data = -*(int *)data;
503 		break;
504 
505 	default:
506 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
507 		/*
508 		 * Copy any data to user, size was
509 		 * already set and checked above.
510 		 */
511 		if (error == 0 && (com&IOC_OUT) && size)
512 			error = copyout(data, SCARG(uap, data), (u_int)size);
513 		break;
514 	}
515 	if (memp)
516 		free(memp, M_IOCTLOPS);
517 	return (error);
518 }
519 
/*
 * selwait:  its address is the wait channel that select and poll sleep on.
 * nselcoll: incremented by selwakeup() each time it handles a collision.
 */
int	selwait;
int	nselcoll;
521 
522 /*
523  * Select system call.
524  */
525 int
526 sys_select(p, v, retval)
527 	register struct proc *p;
528 	void *v;
529 	register_t *retval;
530 {
531 	register struct sys_select_args /* {
532 		syscallarg(int) nd;
533 		syscallarg(fd_set *) in;
534 		syscallarg(fd_set *) ou;
535 		syscallarg(fd_set *) ex;
536 		syscallarg(struct timeval *) tv;
537 	} */ *uap = v;
538 	fd_set bits[6], *pibits[3], *pobits[3];
539 	struct timeval atv;
540 	int s, ncoll, error = 0, timo;
541 	u_int ni;
542 
543 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
544 		/* forgiving; slightly wrong */
545 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
546 	}
547 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
548 	if (SCARG(uap, nd) > FD_SETSIZE) {
549 		caddr_t mbits;
550 
551 		if ((mbits = malloc(ni * 6, M_TEMP, M_WAITOK)) == NULL) {
552 			error = EINVAL;
553 			goto cleanup;
554 		}
555 		bzero(mbits, ni * 6);
556 		pibits[0] = (fd_set *)&mbits[ni * 0];
557 		pibits[1] = (fd_set *)&mbits[ni * 1];
558 		pibits[2] = (fd_set *)&mbits[ni * 2];
559 		pobits[0] = (fd_set *)&mbits[ni * 3];
560 		pobits[1] = (fd_set *)&mbits[ni * 4];
561 		pobits[2] = (fd_set *)&mbits[ni * 5];
562 	} else {
563 		bzero((caddr_t)bits, sizeof(bits));
564 		pibits[0] = &bits[0];
565 		pibits[1] = &bits[1];
566 		pibits[2] = &bits[2];
567 		pobits[0] = &bits[3];
568 		pobits[1] = &bits[4];
569 		pobits[2] = &bits[5];
570 	}
571 
572 #define	getbits(name, x) \
573 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
574 	    (caddr_t)pibits[x], ni))) \
575 		goto done;
576 	getbits(in, 0);
577 	getbits(ou, 1);
578 	getbits(ex, 2);
579 #undef	getbits
580 
581 	if (SCARG(uap, tv)) {
582 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
583 			sizeof (atv));
584 		if (error)
585 			goto done;
586 		if (itimerfix(&atv)) {
587 			error = EINVAL;
588 			goto done;
589 		}
590 		s = splclock();
591 		timeradd(&atv, &time, &atv);
592 		timo = hzto(&atv);
593 		/*
594 		 * Avoid inadvertently sleeping forever.
595 		 */
596 		if (timo == 0)
597 			timo = 1;
598 		splx(s);
599 	} else
600 		timo = 0;
601 retry:
602 	ncoll = nselcoll;
603 	p->p_flag |= P_SELECT;
604 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
605 	if (error || *retval)
606 		goto done;
607 	s = splhigh();
608 	/* this should be timercmp(&time, &atv, >=) */
609 	if (SCARG(uap, tv) && (time.tv_sec > atv.tv_sec ||
610 	    (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
611 		splx(s);
612 		goto done;
613 	}
614 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
615 		splx(s);
616 		goto retry;
617 	}
618 	p->p_flag &= ~P_SELECT;
619 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
620 	splx(s);
621 	if (error == 0)
622 		goto retry;
623 done:
624 	p->p_flag &= ~P_SELECT;
625 	/* select is not restarted after signals... */
626 	if (error == ERESTART)
627 		error = EINTR;
628 	if (error == EWOULDBLOCK)
629 		error = 0;
630 #define	putbits(name, x) \
631 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
632 	    (caddr_t)SCARG(uap, name), ni))) \
633 		error = error2;
634 	if (error == 0) {
635 		int error2;
636 
637 		putbits(in, 0);
638 		putbits(ou, 1);
639 		putbits(ex, 2);
640 #undef putbits
641 	}
642 
643 cleanup:
644 	if (pibits[0] != &bits[0])
645 		free(pibits[0], M_TEMP);
646 	return (error);
647 }
648 
649 int
650 selscan(p, ibits, obits, nfd, retval)
651 	struct proc *p;
652 	fd_set *ibits, *obits;
653 	int nfd;
654 	register_t *retval;
655 {
656 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
657 	register struct filedesc *fdp = p->p_fd;
658 	register int msk, i, j, fd;
659 	register fd_mask bits;
660 	struct file *fp;
661 	int ni, n = 0;
662 	static int flag[3] = { FREAD, FWRITE, 0 };
663 
664 	/*
665 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
666 	 * up to the next byte) otherwise the fd_set's are normal sized.
667 	 */
668 	ni = sizeof(fd_set);
669 	if (nfd > FD_SETSIZE)
670 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
671 
672 	for (msk = 0; msk < 3; msk++) {
673 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
674 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
675 
676 		for (i = 0; i < nfd; i += NFDBITS) {
677 			bits = pibits->fds_bits[i/NFDBITS];
678 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
679 				bits &= ~(1 << j);
680 				fp = fdp->fd_ofiles[fd];
681 				if (fp == NULL)
682 					return (EBADF);
683 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
684 					FD_SET(fd, pobits);
685 					n++;
686 				}
687 			}
688 		}
689 	}
690 	*retval = n;
691 	return (0);
692 }
693 
/*
 * Select routine that ignores its arguments and always returns 1.
 */
/*ARGSUSED*/
int
seltrue(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{
	return 1;
}
704 
705 /*
706  * Record a select request.
707  */
708 void
709 selrecord(selector, sip)
710 	struct proc *selector;
711 	struct selinfo *sip;
712 {
713 	struct proc *p;
714 	pid_t mypid;
715 
716 	mypid = selector->p_pid;
717 	if (sip->si_selpid == mypid)
718 		return;
719 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
720 	    p->p_wchan == (caddr_t)&selwait)
721 		sip->si_flags |= SI_COLL;
722 	else
723 		sip->si_selpid = mypid;
724 }
725 
726 /*
727  * Do a wakeup when a selectable event occurs.
728  */
729 void
730 selwakeup(sip)
731 	register struct selinfo *sip;
732 {
733 	register struct proc *p;
734 	int s;
735 
736 	if (sip->si_selpid == 0)
737 		return;
738 	if (sip->si_flags & SI_COLL) {
739 		nselcoll++;
740 		sip->si_flags &= ~SI_COLL;
741 		wakeup((caddr_t)&selwait);
742 	}
743 	p = pfind(sip->si_selpid);
744 	sip->si_selpid = 0;
745 	if (p != NULL) {
746 		s = splhigh();
747 		if (p->p_wchan == (caddr_t)&selwait) {
748 			if (p->p_stat == SSLEEP)
749 				setrunnable(p);
750 			else
751 				unsleep(p);
752 		} else if (p->p_flag & P_SELECT)
753 			p->p_flag &= ~P_SELECT;
754 		splx(s);
755 	}
756 }
757 
758 void
759 pollscan(p, pl, nfd, retval)
760 	struct proc *p;
761 	struct pollfd *pl;
762 	int nfd;
763 	register_t *retval;
764 {
765 	register struct filedesc *fdp = p->p_fd;
766 	register int msk, i;
767 	struct file *fp;
768 	int n = 0;
769 	static int flag[3] = { FREAD, FWRITE, 0 };
770 	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };
771 
772 	/*
773 	 * XXX: We need to implement the rest of the flags.
774 	 */
775 	for (i = 0; i < nfd; i++) {
776 		fp = fdp->fd_ofiles[pl[i].fd];
777 		if (fp == NULL) {
778 			if (pl[i].events & POLLNVAL) {
779 				pl[i].revents |= POLLNVAL;
780 				n++;
781 			}
782 			continue;
783 		}
784 		for (msk = 0; msk < 3; msk++) {
785 			if (pl[i].events & pflag[msk]) {
786 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
787 					pl[i].revents |= pflag[msk] &
788 					    pl[i].events;
789 					n++;
790 				}
791 			}
792 		}
793 	}
794 	*retval = n;
795 }
796 
797 /*
798  * We are using the same mechanism as select only we encode/decode args
799  * differently.
800  */
801 int
802 sys_poll(p, v, retval)
803 	register struct proc *p;
804 	void *v;
805 	register_t *retval;
806 {
807 	struct sys_poll_args *uap = v;
808 	size_t sz = sizeof(struct pollfd) * SCARG(uap, nfds);
809 	struct pollfd *pl;
810 	int msec = SCARG(uap, timeout);
811 	struct timeval atv;
812 	int timo, ncoll, i, s, error, error2;
813 	extern int nselcoll, selwait;
814 
815 	pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
816 
817 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
818 		goto bad;
819 
820 	for (i = 0; i < SCARG(uap, nfds); i++)
821 		pl[i].revents = 0;
822 
823 	if (msec != -1) {
824 		atv.tv_sec = msec / 1000;
825 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
826 
827 		if (itimerfix(&atv)) {
828 			error = EINVAL;
829 			goto done;
830 		}
831 		s = splclock();
832 		timeradd(&atv, &time, &atv);
833 		timo = hzto(&atv);
834 		/*
835 		 * Avoid inadvertently sleeping forever.
836 		 */
837 		if (timo == 0)
838 			timo = 1;
839 		splx(s);
840 	} else
841 		timo = 0;
842 
843 retry:
844 	ncoll = nselcoll;
845 	p->p_flag |= P_SELECT;
846 	pollscan(p, pl, SCARG(uap, nfds), retval);
847 	if (*retval)
848 		goto done;
849 	s = splhigh();
850 	if (timo && timercmp(&time, &atv, >=)) {
851 		splx(s);
852 		goto done;
853 	}
854 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
855 		splx(s);
856 		goto retry;
857 	}
858 	p->p_flag &= ~P_SELECT;
859 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
860 	splx(s);
861 	if (error == 0)
862 		goto retry;
863 
864 done:
865 	p->p_flag &= ~P_SELECT;
866 	/* poll is not restarted after signals... */
867 	if (error == ERESTART)
868 		error = EINTR;
869 	if (error == EWOULDBLOCK)
870 		error = 0;
871 	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
872 		error = error2;
873 bad:
874 	free((char *) pl, M_TEMP);
875 	return (error);
876 }
877