xref: /openbsd-src/sys/kern/sys_generic.c (revision 63e09ba8ca9e5cee20777dbbd7e4de117c8d850c)
1 /*	$OpenBSD: sys_generic.c,v 1.20 1999/08/04 19:18:13 deraadt Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/filedesc.h>
48 #include <sys/ioctl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/socketvar.h>
52 #include <sys/signalvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/stat.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #ifdef KTRACE
59 #include <sys/ktrace.h>
60 #endif
61 
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64 
65 int selscan __P((struct proc *, fd_set *, fd_set *, int, register_t *));
66 int seltrue __P((dev_t, int, struct proc *));
67 void pollscan __P((struct proc *, struct pollfd *, int, register_t *));
68 
69 /*
70  * Read system call.
71  */
72 /* ARGSUSED */
73 int
74 sys_read(p, v, retval)
75 	struct proc *p;
76 	void *v;
77 	register_t *retval;
78 {
79 	register struct sys_read_args /* {
80 		syscallarg(int) fd;
81 		syscallarg(void *) buf;
82 		syscallarg(size_t) nbyte;
83 	} */ *uap = v;
84 	register struct file *fp;
85 	register struct filedesc *fdp = p->p_fd;
86 	struct uio auio;
87 	struct iovec aiov;
88 	long cnt, error = 0;
89 #ifdef KTRACE
90 	struct iovec ktriov;
91 #endif
92 
93 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
94 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
95 	    (fp->f_flag & FREAD) == 0)
96 		return (EBADF);
97 	/* Don't allow nbyte to be larger than max return val */
98 	if (SCARG(uap, nbyte) > SSIZE_MAX)
99 		return(EINVAL);
100 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
101 	aiov.iov_len = SCARG(uap, nbyte);
102 	auio.uio_iov = &aiov;
103 	auio.uio_iovcnt = 1;
104 	auio.uio_resid = SCARG(uap, nbyte);
105 	auio.uio_rw = UIO_READ;
106 	auio.uio_segflg = UIO_USERSPACE;
107 	auio.uio_procp = p;
108 #ifdef KTRACE
109 	/*
110 	 * if tracing, save a copy of iovec
111 	 */
112 	if (KTRPOINT(p, KTR_GENIO))
113 		ktriov = aiov;
114 #endif
115 	cnt = SCARG(uap, nbyte);
116 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
117 	if (error)
118 		if (auio.uio_resid != cnt && (error == ERESTART ||
119 		    error == EINTR || error == EWOULDBLOCK))
120 			error = 0;
121 	cnt -= auio.uio_resid;
122 #ifdef KTRACE
123 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
124 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov,
125 		    cnt, error);
126 #endif
127 	*retval = cnt;
128 	return (error);
129 }
130 
131 /*
132  * Scatter read system call.
133  */
134 int
135 sys_readv(p, v, retval)
136 	struct proc *p;
137 	void *v;
138 	register_t *retval;
139 {
140 	register struct sys_readv_args /* {
141 		syscallarg(int) fd;
142 		syscallarg(struct iovec *) iovp;
143 		syscallarg(int) iovcnt;
144 	} */ *uap = v;
145 	register struct file *fp;
146 	register struct filedesc *fdp = p->p_fd;
147 	struct uio auio;
148 	register struct iovec *iov;
149 	struct iovec *needfree;
150 	struct iovec aiov[UIO_SMALLIOV];
151 	long i, cnt, error = 0;
152 	u_int iovlen;
153 #ifdef KTRACE
154 	struct iovec *ktriov = NULL;
155 #endif
156 
157 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
158 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
159 	    (fp->f_flag & FREAD) == 0)
160 		return (EBADF);
161 	if (SCARG(uap, iovcnt) <= 0)
162 		return (EINVAL);
163 	/* note: can't use iovlen until iovcnt is validated */
164 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
165 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
166 		if (SCARG(uap, iovcnt) > IOV_MAX)
167 			return (EINVAL);
168 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
169 		needfree = iov;
170 	} else {
171 		iov = aiov;
172 		needfree = NULL;
173 	}
174 	auio.uio_iov = iov;
175 	auio.uio_iovcnt = SCARG(uap, iovcnt);
176 	auio.uio_rw = UIO_READ;
177 	auio.uio_segflg = UIO_USERSPACE;
178 	auio.uio_procp = p;
179 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
180 	if (error)
181 		goto done;
182 	auio.uio_resid = 0;
183 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
184 		/* Don't allow sum > SSIZE_MAX */
185 		if (iov->iov_len > SSIZE_MAX ||
186 		    (auio.uio_resid += iov->iov_len) > SSIZE_MAX) {
187 			error = EINVAL;
188 			goto done;
189 		}
190 	}
191 #ifdef KTRACE
192 	/*
193 	 * if tracing, save a copy of iovec
194 	 */
195 	if (KTRPOINT(p, KTR_GENIO))  {
196 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
197 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
198 	}
199 #endif
200 	cnt = auio.uio_resid;
201 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
202 	if (error)
203 		if (auio.uio_resid != cnt && (error == ERESTART ||
204 		    error == EINTR || error == EWOULDBLOCK))
205 			error = 0;
206 	cnt -= auio.uio_resid;
207 #ifdef KTRACE
208 	if (ktriov != NULL) {
209 		if (error == 0)
210 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov,
211 			    cnt, error);
212 		FREE(ktriov, M_TEMP);
213 	}
214 #endif
215 	*retval = cnt;
216 done:
217 	if (needfree)
218 		FREE(needfree, M_IOV);
219 	return (error);
220 }
221 
222 /*
223  * Write system call
224  */
225 int
226 sys_write(p, v, retval)
227 	struct proc *p;
228 	void *v;
229 	register_t *retval;
230 {
231 	register struct sys_write_args /* {
232 		syscallarg(int) fd;
233 		syscallarg(void *) buf;
234 		syscallarg(size_t) nbyte;
235 	} */ *uap = v;
236 	register struct file *fp;
237 	register struct filedesc *fdp = p->p_fd;
238 	struct uio auio;
239 	struct iovec aiov;
240 	long cnt, error = 0;
241 #ifdef KTRACE
242 	struct iovec ktriov;
243 #endif
244 
245 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
246 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
247 	    (fp->f_flag & FWRITE) == 0)
248 		return (EBADF);
249 	/* Don't allow nbyte to be larger than max return val */
250 	if (SCARG(uap, nbyte) > SSIZE_MAX)
251 		return(EINVAL);
252 	aiov.iov_base = (caddr_t)SCARG(uap, buf);
253 	aiov.iov_len = SCARG(uap, nbyte);
254 	auio.uio_iov = &aiov;
255 	auio.uio_iovcnt = 1;
256 	auio.uio_resid = SCARG(uap, nbyte);
257 	auio.uio_rw = UIO_WRITE;
258 	auio.uio_segflg = UIO_USERSPACE;
259 	auio.uio_procp = p;
260 #ifdef KTRACE
261 	/*
262 	 * if tracing, save a copy of iovec
263 	 */
264 	if (KTRPOINT(p, KTR_GENIO))
265 		ktriov = aiov;
266 #endif
267 	cnt = SCARG(uap, nbyte);
268 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
269 	if (error) {
270 		if (auio.uio_resid != cnt && (error == ERESTART ||
271 		    error == EINTR || error == EWOULDBLOCK))
272 			error = 0;
273 		if (error == EPIPE)
274 			psignal(p, SIGPIPE);
275 	}
276 	cnt -= auio.uio_resid;
277 #ifdef KTRACE
278 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
279 		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
280 		    &ktriov, cnt, error);
281 #endif
282 	*retval = cnt;
283 	return (error);
284 }
285 
286 /*
287  * Gather write system call
288  */
289 int
290 sys_writev(p, v, retval)
291 	struct proc *p;
292 	void *v;
293 	register_t *retval;
294 {
295 	register struct sys_writev_args /* {
296 		syscallarg(int) fd;
297 		syscallarg(struct iovec *) iovp;
298 		syscallarg(int) iovcnt;
299 	} */ *uap = v;
300 	register struct file *fp;
301 	register struct filedesc *fdp = p->p_fd;
302 	struct uio auio;
303 	register struct iovec *iov;
304 	struct iovec *needfree;
305 	struct iovec aiov[UIO_SMALLIOV];
306 	long i, cnt, error = 0;
307 	u_int iovlen;
308 #ifdef KTRACE
309 	struct iovec *ktriov = NULL;
310 #endif
311 
312 	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
313 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
314 	    (fp->f_flag & FWRITE) == 0)
315 		return (EBADF);
316 	if (SCARG(uap, iovcnt) <= 0)
317 		return (EINVAL);
318 	/* note: can't use iovlen until iovcnt is validated */
319 	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
320 	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
321 		if (SCARG(uap, iovcnt) > IOV_MAX)
322 			return (EINVAL);
323 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
324 		needfree = iov;
325 	} else {
326 		iov = aiov;
327 		needfree = NULL;
328 	}
329 	auio.uio_iov = iov;
330 	auio.uio_iovcnt = SCARG(uap, iovcnt);
331 	auio.uio_rw = UIO_WRITE;
332 	auio.uio_segflg = UIO_USERSPACE;
333 	auio.uio_procp = p;
334 	error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen);
335 	if (error)
336 		goto done;
337 	auio.uio_resid = 0;
338 	for (i = 0; i < SCARG(uap, iovcnt); i++, iov++) {
339 		/* Don't allow sum > SSIZE_MAX */
340 		if (iov->iov_len > SSIZE_MAX ||
341 		    (auio.uio_resid += iov->iov_len) > SSIZE_MAX) {
342 			error = EINVAL;
343 			goto done;
344 		}
345 	}
346 #ifdef KTRACE
347 	/*
348 	 * if tracing, save a copy of iovec
349 	 */
350 	if (KTRPOINT(p, KTR_GENIO))  {
351 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
352 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
353 	}
354 #endif
355 	cnt = auio.uio_resid;
356 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
357 	if (error) {
358 		if (auio.uio_resid != cnt && (error == ERESTART ||
359 		    error == EINTR || error == EWOULDBLOCK))
360 			error = 0;
361 		if (error == EPIPE)
362 			psignal(p, SIGPIPE);
363 	}
364 	cnt -= auio.uio_resid;
365 #ifdef KTRACE
366 	if (ktriov != NULL) {
367 		if (error == 0)
368 			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
369 				ktriov, cnt, error);
370 		FREE(ktriov, M_TEMP);
371 	}
372 #endif
373 	*retval = cnt;
374 done:
375 	if (needfree)
376 		FREE(needfree, M_IOV);
377 	return (error);
378 }
379 
380 /*
381  * Ioctl system call
382  */
383 /* ARGSUSED */
384 int
385 sys_ioctl(p, v, retval)
386 	struct proc *p;
387 	void *v;
388 	register_t *retval;
389 {
390 	register struct sys_ioctl_args /* {
391 		syscallarg(int) fd;
392 		syscallarg(u_long) com;
393 		syscallarg(caddr_t) data;
394 	} */ *uap = v;
395 	register struct file *fp;
396 	register struct filedesc *fdp;
397 	register u_long com;
398 	register int error;
399 	register u_int size;
400 	caddr_t data, memp;
401 	int tmp;
402 #define STK_PARAMS	128
403 	char stkbuf[STK_PARAMS];
404 
405 	fdp = p->p_fd;
406 	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
407 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
408 		return (EBADF);
409 
410 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
411 		return (EBADF);
412 
413 	switch (com = SCARG(uap, com)) {
414 	case FIONCLEX:
415 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
416 		return (0);
417 	case FIOCLEX:
418 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
419 		return (0);
420 	}
421 
422 	/*
423 	 * Interpret high order word to find amount of data to be
424 	 * copied to/from the user's address space.
425 	 */
426 	size = IOCPARM_LEN(com);
427 	if (size > IOCPARM_MAX)
428 		return (ENOTTY);
429 	memp = NULL;
430 	if (size > sizeof (stkbuf)) {
431 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
432 		data = memp;
433 	} else
434 		data = stkbuf;
435 	if (com&IOC_IN) {
436 		if (size) {
437 			error = copyin(SCARG(uap, data), data, (u_int)size);
438 			if (error) {
439 				if (memp)
440 					free(memp, M_IOCTLOPS);
441 				return (error);
442 			}
443 		} else
444 			*(caddr_t *)data = SCARG(uap, data);
445 	} else if ((com&IOC_OUT) && size)
446 		/*
447 		 * Zero the buffer so the user always
448 		 * gets back something deterministic.
449 		 */
450 		bzero(data, size);
451 	else if (com&IOC_VOID)
452 		*(caddr_t *)data = SCARG(uap, data);
453 
454 	switch (com) {
455 
456 	case FIONBIO:
457 		if ((tmp = *(int *)data) != 0)
458 			fp->f_flag |= FNONBLOCK;
459 		else
460 			fp->f_flag &= ~FNONBLOCK;
461 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
462 		break;
463 
464 	case FIOASYNC:
465 		if ((tmp = *(int *)data) != 0)
466 			fp->f_flag |= FASYNC;
467 		else
468 			fp->f_flag &= ~FASYNC;
469 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
470 		break;
471 
472 	case FIOSETOWN:
473 		tmp = *(int *)data;
474 		if (fp->f_type == DTYPE_SOCKET) {
475 			struct socket *so = (struct socket *)fp->f_data;
476 
477 			so->so_pgid = tmp;
478 			so->so_siguid = p->p_cred->p_ruid;
479 			so->so_sigeuid = p->p_ucred->cr_uid;
480 			error = 0;
481 			break;
482 		}
483 		if (tmp <= 0) {
484 			tmp = -tmp;
485 		} else {
486 			struct proc *p1 = pfind(tmp);
487 			if (p1 == 0) {
488 				error = ESRCH;
489 				break;
490 			}
491 			tmp = p1->p_pgrp->pg_id;
492 		}
493 		error = (*fp->f_ops->fo_ioctl)
494 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
495 		break;
496 
497 	case FIOGETOWN:
498 		if (fp->f_type == DTYPE_SOCKET) {
499 			error = 0;
500 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
501 			break;
502 		}
503 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
504 		*(int *)data = -*(int *)data;
505 		break;
506 
507 	default:
508 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
509 		/*
510 		 * Copy any data to user, size was
511 		 * already set and checked above.
512 		 */
513 		if (error == 0 && (com&IOC_OUT) && size)
514 			error = copyout(data, SCARG(uap, data), (u_int)size);
515 		break;
516 	}
517 	if (memp)
518 		free(memp, M_IOCTLOPS);
519 	return (error);
520 }
521 
/* Its address is the sleep/wakeup channel used by select and poll. */
int	selwait;
/* Bumped by selwakeup() on a collision; sampled by select/poll to detect one. */
int	nselcoll;
523 
524 /*
525  * Select system call.
526  */
527 int
528 sys_select(p, v, retval)
529 	register struct proc *p;
530 	void *v;
531 	register_t *retval;
532 {
533 	register struct sys_select_args /* {
534 		syscallarg(int) nd;
535 		syscallarg(fd_set *) in;
536 		syscallarg(fd_set *) ou;
537 		syscallarg(fd_set *) ex;
538 		syscallarg(struct timeval *) tv;
539 	} */ *uap = v;
540 	fd_set bits[6], *pibits[3], *pobits[3];
541 	struct timeval atv;
542 	int s, ncoll, error = 0, timo;
543 	u_int ni;
544 
545 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
546 		/* forgiving; slightly wrong */
547 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
548 	}
549 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
550 	if (SCARG(uap, nd) > FD_SETSIZE) {
551 		caddr_t mbits;
552 
553 		if ((mbits = malloc(ni * 6, M_TEMP, M_WAITOK)) == NULL) {
554 			error = EINVAL;
555 			goto cleanup;
556 		}
557 		bzero(mbits, ni * 6);
558 		pibits[0] = (fd_set *)&mbits[ni * 0];
559 		pibits[1] = (fd_set *)&mbits[ni * 1];
560 		pibits[2] = (fd_set *)&mbits[ni * 2];
561 		pobits[0] = (fd_set *)&mbits[ni * 3];
562 		pobits[1] = (fd_set *)&mbits[ni * 4];
563 		pobits[2] = (fd_set *)&mbits[ni * 5];
564 	} else {
565 		bzero((caddr_t)bits, sizeof(bits));
566 		pibits[0] = &bits[0];
567 		pibits[1] = &bits[1];
568 		pibits[2] = &bits[2];
569 		pobits[0] = &bits[3];
570 		pobits[1] = &bits[4];
571 		pobits[2] = &bits[5];
572 	}
573 
574 #define	getbits(name, x) \
575 	if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
576 	    (caddr_t)pibits[x], ni))) \
577 		goto done;
578 	getbits(in, 0);
579 	getbits(ou, 1);
580 	getbits(ex, 2);
581 #undef	getbits
582 
583 	if (SCARG(uap, tv)) {
584 		error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
585 			sizeof (atv));
586 		if (error)
587 			goto done;
588 		if (itimerfix(&atv)) {
589 			error = EINVAL;
590 			goto done;
591 		}
592 		s = splclock();
593 		timeradd(&atv, &time, &atv);
594 		timo = hzto(&atv);
595 		/*
596 		 * Avoid inadvertently sleeping forever.
597 		 */
598 		if (timo == 0)
599 			timo = 1;
600 		splx(s);
601 	} else
602 		timo = 0;
603 retry:
604 	ncoll = nselcoll;
605 	p->p_flag |= P_SELECT;
606 	error = selscan(p, pibits[0], pobits[0], SCARG(uap, nd), retval);
607 	if (error || *retval)
608 		goto done;
609 	s = splhigh();
610 	/* this should be timercmp(&time, &atv, >=) */
611 	if (SCARG(uap, tv) && (time.tv_sec > atv.tv_sec ||
612 	    (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
613 		splx(s);
614 		goto done;
615 	}
616 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
617 		splx(s);
618 		goto retry;
619 	}
620 	p->p_flag &= ~P_SELECT;
621 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
622 	splx(s);
623 	if (error == 0)
624 		goto retry;
625 done:
626 	p->p_flag &= ~P_SELECT;
627 	/* select is not restarted after signals... */
628 	if (error == ERESTART)
629 		error = EINTR;
630 	if (error == EWOULDBLOCK)
631 		error = 0;
632 #define	putbits(name, x) \
633 	if (SCARG(uap, name) && (error2 = copyout((caddr_t)pobits[x], \
634 	    (caddr_t)SCARG(uap, name), ni))) \
635 		error = error2;
636 	if (error == 0) {
637 		int error2;
638 
639 		putbits(in, 0);
640 		putbits(ou, 1);
641 		putbits(ex, 2);
642 #undef putbits
643 	}
644 
645 cleanup:
646 	if (pibits[0] != &bits[0])
647 		free(pibits[0], M_TEMP);
648 	return (error);
649 }
650 
651 int
652 selscan(p, ibits, obits, nfd, retval)
653 	struct proc *p;
654 	fd_set *ibits, *obits;
655 	int nfd;
656 	register_t *retval;
657 {
658 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
659 	register struct filedesc *fdp = p->p_fd;
660 	register int msk, i, j, fd;
661 	register fd_mask bits;
662 	struct file *fp;
663 	int ni, n = 0;
664 	static int flag[3] = { FREAD, FWRITE, 0 };
665 
666 	/*
667 	 * if nfd > FD_SETSIZE then the fd_set's contain nfd bits (rounded
668 	 * up to the next byte) otherwise the fd_set's are normal sized.
669 	 */
670 	ni = sizeof(fd_set);
671 	if (nfd > FD_SETSIZE)
672 		ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
673 
674 	for (msk = 0; msk < 3; msk++) {
675 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
676 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
677 
678 		for (i = 0; i < nfd; i += NFDBITS) {
679 			bits = pibits->fds_bits[i/NFDBITS];
680 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
681 				bits &= ~(1 << j);
682 				fp = fdp->fd_ofiles[fd];
683 				if (fp == NULL)
684 					return (EBADF);
685 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
686 					FD_SET(fd, pobits);
687 					n++;
688 				}
689 			}
690 		}
691 	}
692 	*retval = n;
693 	return (0);
694 }
695 
696 /*ARGSUSED*/
697 int
698 seltrue(dev, flag, p)
699 	dev_t dev;
700 	int flag;
701 	struct proc *p;
702 {
703 
704 	return (1);
705 }
706 
707 /*
708  * Record a select request.
709  */
710 void
711 selrecord(selector, sip)
712 	struct proc *selector;
713 	struct selinfo *sip;
714 {
715 	struct proc *p;
716 	pid_t mypid;
717 
718 	mypid = selector->p_pid;
719 	if (sip->si_selpid == mypid)
720 		return;
721 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
722 	    p->p_wchan == (caddr_t)&selwait)
723 		sip->si_flags |= SI_COLL;
724 	else
725 		sip->si_selpid = mypid;
726 }
727 
728 /*
729  * Do a wakeup when a selectable event occurs.
730  */
731 void
732 selwakeup(sip)
733 	register struct selinfo *sip;
734 {
735 	register struct proc *p;
736 	int s;
737 
738 	if (sip->si_selpid == 0)
739 		return;
740 	if (sip->si_flags & SI_COLL) {
741 		nselcoll++;
742 		sip->si_flags &= ~SI_COLL;
743 		wakeup((caddr_t)&selwait);
744 	}
745 	p = pfind(sip->si_selpid);
746 	sip->si_selpid = 0;
747 	if (p != NULL) {
748 		s = splhigh();
749 		if (p->p_wchan == (caddr_t)&selwait) {
750 			if (p->p_stat == SSLEEP)
751 				setrunnable(p);
752 			else
753 				unsleep(p);
754 		} else if (p->p_flag & P_SELECT)
755 			p->p_flag &= ~P_SELECT;
756 		splx(s);
757 	}
758 }
759 
760 void
761 pollscan(p, pl, nfd, retval)
762 	struct proc *p;
763 	struct pollfd *pl;
764 	int nfd;
765 	register_t *retval;
766 {
767 	register struct filedesc *fdp = p->p_fd;
768 	register int msk, i;
769 	struct file *fp;
770 	int x, n = 0;
771 	static int flag[3] = { FREAD, FWRITE, 0 };
772 	static int pflag[3] = { POLLIN|POLLRDNORM, POLLOUT, POLLERR };
773 
774 	/*
775 	 * XXX: We need to implement the rest of the flags.
776 	 */
777 	for (i = 0; i < nfd; i++) {
778 		fp = fdp->fd_ofiles[pl[i].fd];
779 		if (fp == NULL) {
780 			if (pl[i].events & POLLNVAL) {
781 				pl[i].revents |= POLLNVAL;
782 				n++;
783 			}
784 			continue;
785 		}
786 		for (x = msk = 0; msk < 3; msk++) {
787 			if (pl[i].events & pflag[msk]) {
788 				if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
789 					pl[i].revents |= pflag[msk] &
790 					    pl[i].events;
791 					x++;
792 				}
793 			}
794 		}
795 		if (x)
796 			n++;
797 	}
798 	*retval = n;
799 }
800 
801 /*
802  * We are using the same mechanism as select only we encode/decode args
803  * differently.
804  */
805 int
806 sys_poll(p, v, retval)
807 	register struct proc *p;
808 	void *v;
809 	register_t *retval;
810 {
811 	struct sys_poll_args *uap = v;
812 	size_t sz;
813 	struct pollfd pfds[4], *pl = pfds;
814 	int msec = SCARG(uap, timeout);
815 	struct timeval atv;
816 	int timo, ncoll, i, s, error, error2;
817 	extern int nselcoll, selwait;
818 
819 	/* XXX constrain; This may not match standards */
820 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles)
821 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
822 	sz = sizeof(struct pollfd) * SCARG(uap, nfds);
823 
824 	/* optimize for the default case, of a small nfds value */
825 	if (sz > sizeof(pfds))
826 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
827 
828 	if ((error = copyin(SCARG(uap, fds), pl, sz)) != 0)
829 		goto bad;
830 
831 	for (i = 0; i < SCARG(uap, nfds); i++)
832 		pl[i].revents = 0;
833 
834 	if (msec != -1) {
835 		atv.tv_sec = msec / 1000;
836 		atv.tv_usec = (msec - (atv.tv_sec * 1000)) * 1000;
837 
838 		if (itimerfix(&atv)) {
839 			error = EINVAL;
840 			goto done;
841 		}
842 		s = splclock();
843 		timeradd(&atv, &time, &atv);
844 		timo = hzto(&atv);
845 		/*
846 		 * Avoid inadvertently sleeping forever.
847 		 */
848 		if (timo == 0)
849 			timo = 1;
850 		splx(s);
851 	} else
852 		timo = 0;
853 
854 retry:
855 	ncoll = nselcoll;
856 	p->p_flag |= P_SELECT;
857 	pollscan(p, pl, SCARG(uap, nfds), retval);
858 	if (*retval)
859 		goto done;
860 	s = splhigh();
861 	if (timo && timercmp(&time, &atv, >=)) {
862 		splx(s);
863 		goto done;
864 	}
865 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
866 		splx(s);
867 		goto retry;
868 	}
869 	p->p_flag &= ~P_SELECT;
870 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
871 	splx(s);
872 	if (error == 0)
873 		goto retry;
874 
875 done:
876 	p->p_flag &= ~P_SELECT;
877 	/* poll is not restarted after signals... */
878 	if (error == ERESTART)
879 		error = EINTR;
880 	if (error == EWOULDBLOCK)
881 		error = 0;
882 	if ((error2 = copyout(pl, SCARG(uap, fds), sz)) != 0)
883 		error = error2;
884 bad:
885 	if (pl != pfds)
886 		free((char *) pl, M_TEMP);
887 	return (error);
888 }
889