xref: /netbsd-src/sys/kern/sys_generic.c (revision aaf4ece63a859a04e37cf3a7229b5fab0157cc06)
1 /*	$NetBSD: sys_generic.c,v 1.84 2005/12/11 12:24:30 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.84 2005/12/11 12:24:30 christos Exp $");
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64 
65 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
66 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
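/*
 * selscan() and pollscan() do the per-descriptor polling work for
 * selcommon() and pollcommon() further down; they are forward-declared
 * here for use by those routines.
 */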
67 
68 
69 /*
70  * Read system call.
71  */
72 /* ARGSUSED */
73 int
74 sys_read(struct lwp *l, void *v, register_t *retval)
75 {
76 	struct sys_read_args /* {
77 		syscallarg(int)		fd;
78 		syscallarg(void *)	buf;
79 		syscallarg(size_t)	nbyte;
80 	} */ *uap = v;
81 	int		fd;
82 	struct file	*fp;
83 	struct proc	*p;
84 	struct filedesc	*fdp;
85 
86 	fd = SCARG(uap, fd);
87 	p = l->l_proc;
88 	fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 
93 	if ((fp->f_flag & FREAD) == 0) {
94 		simple_unlock(&fp->f_slock);
95 		return (EBADF);
96 	}
97 
98 	FILE_USE(fp);
99 
100 	/* dofileread() will unuse the descriptor for us */
101 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
102 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
103 }
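/*
 * All of the read/write front ends in this file follow the same
 * descriptor protocol: fd_getfile() looks the descriptor up and, as the
 * error paths calling simple_unlock(&fp->f_slock) show, returns with the
 * file's simple lock held; FILE_USE() then marks the file as in use
 * (presumably bumping its use count and dropping that lock) so it cannot
 * be torn down during the I/O, and the matching FILE_UNUSE() in the
 * dofile*() helper releases it again.
 */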
104 
105 int
106 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
107 	off_t *offset, int flags, register_t *retval)
108 {
109 	struct iovec aiov;
110 	struct uio auio;
111 	struct proc *p;
112 	size_t cnt;
113 	int error;
114 #ifdef KTRACE
115 	struct iovec	ktriov = {0};
116 #endif
117 	p = l->l_proc;
118 	error = 0;
119 
120 	aiov.iov_base = (caddr_t)buf;
121 	aiov.iov_len = nbyte;
122 	auio.uio_iov = &aiov;
123 	auio.uio_iovcnt = 1;
124 	auio.uio_resid = nbyte;
125 	auio.uio_rw = UIO_READ;
126 	auio.uio_segflg = UIO_USERSPACE;
127 	auio.uio_lwp = l;
128 
129 	/*
130 	 * Reads return ssize_t because -1 is returned on error.  Therefore
131 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
132 	 * values.
133 	 */
134 	if (auio.uio_resid > SSIZE_MAX) {
135 		error = EINVAL;
136 		goto out;
137 	}
138 
139 #ifdef KTRACE
140 	/*
141 	 * if tracing, save a copy of iovec
142 	 */
143 	if (KTRPOINT(p, KTR_GENIO))
144 		ktriov = aiov;
145 #endif
146 	cnt = auio.uio_resid;
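	/*
	 * cnt remembers how much was asked for.  If the transfer is cut
	 * short by a signal or a non-blocking descriptor after some data
	 * has already moved, the error is discarded below and the caller
	 * simply sees a short read; the error is reported only when
	 * nothing was transferred at all.
	 */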
147 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
148 	if (error)
149 		if (auio.uio_resid != cnt && (error == ERESTART ||
150 		    error == EINTR || error == EWOULDBLOCK))
151 			error = 0;
152 	cnt -= auio.uio_resid;
153 #ifdef KTRACE
154 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
155 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
156 #endif
157 	*retval = cnt;
158  out:
159 	FILE_UNUSE(fp, l);
160 	return (error);
161 }
162 
163 /*
164  * Scatter read system call.
165  */
166 int
167 sys_readv(struct lwp *l, void *v, register_t *retval)
168 {
169 	struct sys_readv_args /* {
170 		syscallarg(int)				fd;
171 		syscallarg(const struct iovec *)	iovp;
172 		syscallarg(int)				iovcnt;
173 	} */ *uap = v;
174 	struct filedesc	*fdp;
175 	struct file *fp;
176 	struct proc *p;
177 	int fd;
178 
179 	fd = SCARG(uap, fd);
180 	p = l->l_proc;
181 	fdp = p->p_fd;
182 
183 	if ((fp = fd_getfile(fdp, fd)) == NULL)
184 		return (EBADF);
185 
186 	if ((fp->f_flag & FREAD) == 0) {
187 		simple_unlock(&fp->f_slock);
188 		return (EBADF);
189 	}
190 
191 	FILE_USE(fp);
192 
193 	/* dofilereadv() will unuse the descriptor for us */
194 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
195 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
196 }
197 
198 int
199 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
200 	int iovcnt, off_t *offset, int flags, register_t *retval)
201 {
202 	struct proc *p;
203 	struct uio	auio;
204 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
205 	int		i, error;
206 	size_t		cnt;
207 	u_int		iovlen;
208 #ifdef KTRACE
209 	struct iovec	*ktriov;
210 #endif
211 
212 	p = l->l_proc;
213 	error = 0;
214 #ifdef KTRACE
215 	ktriov = NULL;
216 #endif
217 	/* note: can't use iovlen until iovcnt is validated */
218 	iovlen = iovcnt * sizeof(struct iovec);
219 	if ((u_int)iovcnt > UIO_SMALLIOV) {
220 		if ((u_int)iovcnt > IOV_MAX) {
221 			error = EINVAL;
222 			goto out;
223 		}
224 		iov = malloc(iovlen, M_IOV, M_WAITOK);
225 		needfree = iov;
226 	} else if ((u_int)iovcnt > 0) {
227 		iov = aiov;
228 		needfree = NULL;
229 	} else {
230 		error = EINVAL;
231 		goto out;
232 	}
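	/*
	 * At this point iov points either at the small on-stack array
	 * (up to UIO_SMALLIOV entries) or at a temporary M_IOV allocation
	 * for larger vectors up to IOV_MAX; needfree records which, so
	 * the done: path frees only what was malloc'd.  dofilewritev()
	 * below uses the identical scheme.
	 */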
233 
234 	auio.uio_iov = iov;
235 	auio.uio_iovcnt = iovcnt;
236 	auio.uio_rw = UIO_READ;
237 	auio.uio_segflg = UIO_USERSPACE;
238 	auio.uio_lwp = l;
239 	error = copyin(iovp, iov, iovlen);
240 	if (error)
241 		goto done;
242 	auio.uio_resid = 0;
243 	for (i = 0; i < iovcnt; i++) {
244 		auio.uio_resid += iov->iov_len;
245 		/*
246 		 * Reads return ssize_t because -1 is returned on error.
247 		 * Therefore we must restrict the length to SSIZE_MAX to
248 		 * avoid garbage return values.
249 		 */
250 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
251 			error = EINVAL;
252 			goto done;
253 		}
254 		iov++;
255 	}
256 #ifdef KTRACE
257 	/*
258 	 * if tracing, save a copy of iovec
259 	 */
260 	if (KTRPOINT(p, KTR_GENIO))  {
261 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
262 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
263 	}
264 #endif
265 	cnt = auio.uio_resid;
266 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
267 	if (error)
268 		if (auio.uio_resid != cnt && (error == ERESTART ||
269 		    error == EINTR || error == EWOULDBLOCK))
270 			error = 0;
271 	cnt -= auio.uio_resid;
272 #ifdef KTRACE
273 	if (ktriov != NULL) {
274 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
275 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
276 		free(ktriov, M_TEMP);
277 	}
278 #endif
279 	*retval = cnt;
280  done:
281 	if (needfree)
282 		free(needfree, M_IOV);
283  out:
284 	FILE_UNUSE(fp, l);
285 	return (error);
286 }
287 
288 /*
289  * Write system call.
290  */
291 int
292 sys_write(struct lwp *l, void *v, register_t *retval)
293 {
294 	struct sys_write_args /* {
295 		syscallarg(int)			fd;
296 		syscallarg(const void *)	buf;
297 		syscallarg(size_t)		nbyte;
298 	} */ *uap = v;
299 	int		fd;
300 	struct file	*fp;
301 	struct proc	*p;
302 	struct filedesc	*fdp;
303 
304 	fd = SCARG(uap, fd);
305 	p = l->l_proc;
306 	fdp = p->p_fd;
307 
308 	if ((fp = fd_getfile(fdp, fd)) == NULL)
309 		return (EBADF);
310 
311 	if ((fp->f_flag & FWRITE) == 0) {
312 		simple_unlock(&fp->f_slock);
313 		return (EBADF);
314 	}
315 
316 	FILE_USE(fp);
317 
318 	/* dofilewrite() will unuse the descriptor for us */
319 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
320 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
321 }
322 
323 int
324 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
325 	size_t nbyte, off_t *offset, int flags, register_t *retval)
326 {
327 	struct iovec aiov;
328 	struct uio auio;
329 	struct proc *p;
330 	size_t cnt;
331 	int error;
332 #ifdef KTRACE
333 	struct iovec	ktriov = {0};
334 #endif
335 
336 	p = l->l_proc;
337 	error = 0;
338 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
339 	aiov.iov_len = nbyte;
340 	auio.uio_iov = &aiov;
341 	auio.uio_iovcnt = 1;
342 	auio.uio_resid = nbyte;
343 	auio.uio_rw = UIO_WRITE;
344 	auio.uio_segflg = UIO_USERSPACE;
345 	auio.uio_lwp = l;
346 
347 	/*
348 	 * Writes return ssize_t because -1 is returned on error.  Therefore
349 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
350 	 * values.
351 	 */
352 	if (auio.uio_resid > SSIZE_MAX) {
353 		error = EINVAL;
354 		goto out;
355 	}
356 
357 #ifdef KTRACE
358 	/*
359 	 * if tracing, save a copy of iovec
360 	 */
361 	if (KTRPOINT(p, KTR_GENIO))
362 		ktriov = aiov;
363 #endif
364 	cnt = auio.uio_resid;
365 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
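	/*
	 * As with reads, an interrupted partial write is reported as a
	 * short count rather than an error.  In addition, EPIPE posts
	 * SIGPIPE to the process, giving the usual broken-pipe behaviour
	 * for writers whose reader has gone away.
	 */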
366 	if (error) {
367 		if (auio.uio_resid != cnt && (error == ERESTART ||
368 		    error == EINTR || error == EWOULDBLOCK))
369 			error = 0;
370 		if (error == EPIPE)
371 			psignal(p, SIGPIPE);
372 	}
373 	cnt -= auio.uio_resid;
374 #ifdef KTRACE
375 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
376 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
377 #endif
378 	*retval = cnt;
379  out:
380 	FILE_UNUSE(fp, l);
381 	return (error);
382 }
383 
384 /*
385  * Gather write system call.
386  */
387 int
388 sys_writev(struct lwp *l, void *v, register_t *retval)
389 {
390 	struct sys_writev_args /* {
391 		syscallarg(int)				fd;
392 		syscallarg(const struct iovec *)	iovp;
393 		syscallarg(int)				iovcnt;
394 	} */ *uap = v;
395 	int		fd;
396 	struct file	*fp;
397 	struct proc	*p;
398 	struct filedesc	*fdp;
399 
400 	fd = SCARG(uap, fd);
401 	p = l->l_proc;
402 	fdp = p->p_fd;
403 
404 	if ((fp = fd_getfile(fdp, fd)) == NULL)
405 		return (EBADF);
406 
407 	if ((fp->f_flag & FWRITE) == 0) {
408 		simple_unlock(&fp->f_slock);
409 		return (EBADF);
410 	}
411 
412 	FILE_USE(fp);
413 
414 	/* dofilewritev() will unuse the descriptor for us */
415 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
416 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
417 }
418 
419 int
420 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
421 	int iovcnt, off_t *offset, int flags, register_t *retval)
422 {
423 	struct proc	*p;
424 	struct uio	auio;
425 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
426 	int		i, error;
427 	size_t		cnt;
428 	u_int		iovlen;
429 #ifdef KTRACE
430 	struct iovec	*ktriov;
431 #endif
432 
433 	p = l->l_proc;
434 	error = 0;
435 #ifdef KTRACE
436 	ktriov = NULL;
437 #endif
438 	/* note: can't use iovlen until iovcnt is validated */
439 	iovlen = iovcnt * sizeof(struct iovec);
440 	if ((u_int)iovcnt > UIO_SMALLIOV) {
441 		if ((u_int)iovcnt > IOV_MAX) {
442 			error = EINVAL;
443 			goto out;
444 		}
445 		iov = malloc(iovlen, M_IOV, M_WAITOK);
446 		needfree = iov;
447 	} else if ((u_int)iovcnt > 0) {
448 		iov = aiov;
449 		needfree = NULL;
450 	} else {
451 		error = EINVAL;
452 		goto out;
453 	}
454 
455 	auio.uio_iov = iov;
456 	auio.uio_iovcnt = iovcnt;
457 	auio.uio_rw = UIO_WRITE;
458 	auio.uio_segflg = UIO_USERSPACE;
459 	auio.uio_lwp = l;
460 	error = copyin(iovp, iov, iovlen);
461 	if (error)
462 		goto done;
463 	auio.uio_resid = 0;
464 	for (i = 0; i < iovcnt; i++) {
465 		auio.uio_resid += iov->iov_len;
466 		/*
467 		 * Writes return ssize_t because -1 is returned on error.
468 		 * Therefore we must restrict the length to SSIZE_MAX to
469 		 * avoid garbage return values.
470 		 */
471 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
472 			error = EINVAL;
473 			goto done;
474 		}
475 		iov++;
476 	}
477 #ifdef KTRACE
478 	/*
479 	 * if tracing, save a copy of iovec
480 	 */
481 	if (KTRPOINT(p, KTR_GENIO))  {
482 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
483 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
484 	}
485 #endif
486 	cnt = auio.uio_resid;
487 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
488 	if (error) {
489 		if (auio.uio_resid != cnt && (error == ERESTART ||
490 		    error == EINTR || error == EWOULDBLOCK))
491 			error = 0;
492 		if (error == EPIPE)
493 			psignal(p, SIGPIPE);
494 	}
495 	cnt -= auio.uio_resid;
496 #ifdef KTRACE
497 	if (ktriov != NULL) {
498 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
499 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
500 		free(ktriov, M_TEMP);
501 	}
502 #endif
503 	*retval = cnt;
504  done:
505 	if (needfree)
506 		free(needfree, M_IOV);
507  out:
508 	FILE_UNUSE(fp, l);
509 	return (error);
510 }
511 
512 /*
513  * Ioctl system call.
514  */
515 /* ARGSUSED */
516 int
517 sys_ioctl(struct lwp *l, void *v, register_t *retval)
518 {
519 	struct sys_ioctl_args /* {
520 		syscallarg(int)		fd;
521 		syscallarg(u_long)	com;
522 		syscallarg(caddr_t)	data;
523 	} */ *uap = v;
524 	struct file	*fp;
525 	struct proc	*p;
526 	struct filedesc	*fdp;
527 	u_long		com;
528 	int		error;
529 	u_int		size;
530 	caddr_t		data, memp;
531 #define	STK_PARAMS	128
532 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
533 
534 	error = 0;
535 	p = l->l_proc;
536 	fdp = p->p_fd;
537 
538 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
539 		return (EBADF);
540 
541 	FILE_USE(fp);
542 
543 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
544 		error = EBADF;
545 		com = 0;
546 		goto out;
547 	}
548 
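	/*
	 * FIOCLEX and FIONCLEX only manipulate the close-on-exec flag
	 * (UF_EXCLOSE) in the per-process descriptor table, so they are
	 * handled right here and never reach the file's fo_ioctl routine.
	 */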
549 	switch (com = SCARG(uap, com)) {
550 	case FIONCLEX:
551 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
552 		goto out;
553 
554 	case FIOCLEX:
555 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
556 		goto out;
557 	}
558 
559 	/*
560 	 * Interpret high order word to find amount of data to be
561 	 * copied to/from the user's address space.
562 	 */
563 	size = IOCPARM_LEN(com);
564 	if (size > IOCPARM_MAX) {
565 		error = ENOTTY;
566 		goto out;
567 	}
568 	memp = NULL;
569 	if (size > sizeof(stkbuf)) {
570 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
571 		data = memp;
572 	} else
573 		data = (caddr_t)stkbuf;
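	/*
	 * Pre-process the argument according to the direction bits encoded
	 * in the command: IOC_IN arguments are copied in from user space,
	 * IOC_OUT buffers are zeroed so drivers cannot leak stale kernel
	 * memory back to the user, and IOC_VOID commands carry no data, so
	 * the user's pointer value itself is stored in the buffer.
	 */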
574 	if (com&IOC_IN) {
575 		if (size) {
576 			error = copyin(SCARG(uap, data), data, size);
577 			if (error) {
578 				if (memp)
579 					free(memp, M_IOCTLOPS);
580 				goto out;
581 			}
582 #ifdef KTRACE
583 			if (KTRPOINT(p, KTR_GENIO)) {
584 				struct iovec iov;
585 				iov.iov_base = SCARG(uap, data);
586 				iov.iov_len = size;
587 				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
588 					size, 0);
589 			}
590 #endif
591 		} else
592 			*(caddr_t *)data = SCARG(uap, data);
593 	} else if ((com&IOC_OUT) && size)
594 		/*
595 		 * Zero the buffer so the user always
596 		 * gets back something deterministic.
597 		 */
598 		memset(data, 0, size);
599 	else if (com&IOC_VOID)
600 		*(caddr_t *)data = SCARG(uap, data);
601 
602 	switch (com) {
603 
604 	case FIONBIO:
605 		if (*(int *)data != 0)
606 			fp->f_flag |= FNONBLOCK;
607 		else
608 			fp->f_flag &= ~FNONBLOCK;
609 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
610 		break;
611 
612 	case FIOASYNC:
613 		if (*(int *)data != 0)
614 			fp->f_flag |= FASYNC;
615 		else
616 			fp->f_flag &= ~FASYNC;
617 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
618 		break;
619 
620 	default:
621 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
622 		/*
623 		 * Copy any data to user, size was
624 		 * already set and checked above.
625 		 */
626 		if (error == 0 && (com&IOC_OUT) && size) {
627 			error = copyout(data, SCARG(uap, data), size);
628 #ifdef KTRACE
629 			if (KTRPOINT(p, KTR_GENIO)) {
630 				struct iovec iov;
631 				iov.iov_base = SCARG(uap, data);
632 				iov.iov_len = size;
633 				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
634 					size, error);
635 			}
636 #endif
637 		}
638 		break;
639 	}
640 	if (memp)
641 		free(memp, M_IOCTLOPS);
642  out:
643 	FILE_UNUSE(fp, l);
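	/*
	 * Drivers signal an unrecognized command by returning EPASSTHROUGH
	 * (or, historically, -1); both become the ENOTTY the caller
	 * expects, and the -1 case is logged since it presumably marks a
	 * driver still using the old convention.
	 */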
644 	switch (error) {
645 	case -1:
646 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
647 		    "pid=%d comm=%s\n",
648 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
649 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
650 		    p->p_pid, p->p_comm);
651 		/* FALLTHROUGH */
652 	case EPASSTHROUGH:
653 		error = ENOTTY;
654 		/* FALLTHROUGH */
655 	default:
656 		return (error);
657 	}
658 }
659 
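/*
 * selwait is the single wait channel that every LWP blocked in select()
 * or poll() sleeps on; nselcoll counts select "collisions" and is used
 * below to detect that a wakeup may have raced with a scan, forcing a
 * rescan instead of a sleep.
 */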
660 int	selwait, nselcoll;
661 
662 /*
663  * Pselect system call: select with a timespec timeout and a signal mask installed for the duration of the call; plain select() follows below.
664  */
665 int
666 sys_pselect(struct lwp *l, void *v, register_t *retval)
667 {
668 	struct sys_pselect_args /* {
669 		syscallarg(int)				nd;
670 		syscallarg(fd_set *)			in;
671 		syscallarg(fd_set *)			ou;
672 		syscallarg(fd_set *)			ex;
673 		syscallarg(const struct timespec *)	ts;
674 		syscallarg(sigset_t *)			mask;
675 	} */ * const uap = v;
676 	struct timespec	ats;
677 	struct timeval	atv, *tv = NULL;
678 	sigset_t	amask, *mask = NULL;
679 	int		error;
680 
681 	if (SCARG(uap, ts)) {
682 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
683 		if (error)
684 			return error;
685 		atv.tv_sec = ats.tv_sec;
686 		atv.tv_usec = ats.tv_nsec / 1000;
687 		tv = &atv;
688 	}
689 	if (SCARG(uap, mask) != NULL) {
690 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
691 		if (error)
692 			return error;
693 		mask = &amask;
694 	}
695 
696 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
697 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
698 }
699 
700 int
701 sys_select(struct lwp *l, void *v, register_t *retval)
702 {
703 	struct sys_select_args /* {
704 		syscallarg(int)			nd;
705 		syscallarg(fd_set *)		in;
706 		syscallarg(fd_set *)		ou;
707 		syscallarg(fd_set *)		ex;
708 		syscallarg(struct timeval *)	tv;
709 	} */ * const uap = v;
710 	struct timeval atv, *tv = NULL;
711 	int error;
712 
713 	if (SCARG(uap, tv)) {
714 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
715 			sizeof(atv));
716 		if (error)
717 			return error;
718 		tv = &atv;
719 	}
720 
721 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
722 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
723 }
724 
725 int
726 selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
727 	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
728 {
729 	struct proc	* const p = l->l_proc;
730 	caddr_t		bits;
731 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
732 			    sizeof(fd_mask) * 6];
733 	int		s, ncoll, error, timo;
734 	size_t		ni;
735 	sigset_t	oldmask;
736 
737 	error = 0;
738 	if (nd < 0)
739 		return (EINVAL);
740 	if (nd > p->p_fd->fd_nfiles) {
741 		/* forgiving, but slightly wrong: silently clamp nd to the size of the descriptor table instead of failing */
742 		nd = p->p_fd->fd_nfiles;
743 	}
744 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
745 	if (ni * 6 > sizeof(smallbits))
746 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
747 	else
748 		bits = smallbits;
749 
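	/*
	 * The bits buffer holds six fd_set-sized regions of ni bytes each:
	 * regions 0-2 are the caller's read, write and except input sets,
	 * regions 3-5 collect the corresponding output sets filled in by
	 * selscan(); getbits()/putbits() index them by x.
	 */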
750 #define	getbits(name, x)						\
751 	if (u_ ## name) {						\
752 		error = copyin(u_ ## name, bits + ni * x, ni);		\
753 		if (error)						\
754 			goto done;					\
755 	} else								\
756 		memset(bits + ni * x, 0, ni);
757 	getbits(in, 0);
758 	getbits(ou, 1);
759 	getbits(ex, 2);
760 #undef	getbits
761 
762 	timo = 0;
763 	if (tv) {
764 		if (itimerfix(tv)) {
765 			error = EINVAL;
766 			goto done;
767 		}
768 		s = splclock();
769 		timeradd(tv, &time, tv);
770 		splx(s);
771 	}
772 	if (mask)
773 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
774 
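	/*
	 * Scan/sleep loop.  L_SELECT is set before scanning; selwakeup()
	 * clears it (or bumps nselcoll on a collision) when an event
	 * arrives.  If either has happened after an unsuccessful scan, an
	 * event may have slipped in between the scan and the sleep, so the
	 * scan is retried instead of sleeping on selwait.
	 */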
775  retry:
776 	ncoll = nselcoll;
777 	l->l_flag |= L_SELECT;
778 	error = selscan(l, (fd_mask *)(bits + ni * 0),
779 			   (fd_mask *)(bits + ni * 3), nd, retval);
780 	if (error || *retval)
781 		goto done;
782 	if (tv) {
783 		/*
784 		 * We have to recalculate the timeout on every retry.
785 		 */
786 		timo = hzto(tv);
787 		if (timo <= 0)
788 			goto done;
789 	}
790 	s = splsched();
791 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
792 		splx(s);
793 		goto retry;
794 	}
795 	l->l_flag &= ~L_SELECT;
796 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
797 	splx(s);
798 	if (error == 0)
799 		goto retry;
800  done:
801 	if (mask)
802 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
803 	l->l_flag &= ~L_SELECT;
804 	/* select is not restarted after signals... */
805 	if (error == ERESTART)
806 		error = EINTR;
807 	if (error == EWOULDBLOCK)
808 		error = 0;
809 	if (error == 0) {
810 
811 #define	putbits(name, x)						\
812 		if (u_ ## name) {					\
813 			error = copyout(bits + ni * x, u_ ## name, ni); \
814 			if (error)					\
815 				goto out;				\
816 		}
817 		putbits(in, 3);
818 		putbits(ou, 4);
819 		putbits(ex, 5);
820 #undef putbits
821 	}
822  out:
823 	if (ni * 6 > sizeof(smallbits))
824 		free(bits, M_TEMP);
825 	return (error);
826 }
827 
828 int
829 selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
830 	register_t *retval)
831 {
832 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
833 			       POLLWRNORM | POLLHUP | POLLERR,
834 			       POLLRDBAND };
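	/*
	 * flag[] maps the three select() sets onto the poll events handed
	 * to each file's fo_poll routine: normal read, normal write and
	 * out-of-band data respectively, with hangup and error conditions
	 * also counting as ready for reading and writing.
	 */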
835 	struct proc *p = l->l_proc;
836 	struct filedesc	*fdp;
837 	int msk, i, j, fd, n;
838 	fd_mask ibits, obits;
839 	struct file *fp;
840 
841 	fdp = p->p_fd;
842 	n = 0;
843 	for (msk = 0; msk < 3; msk++) {
844 		for (i = 0; i < nfd; i += NFDBITS) {
845 			ibits = *ibitp++;
846 			obits = 0;
847 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
848 				ibits &= ~(1 << j);
849 				if ((fp = fd_getfile(fdp, fd)) == NULL)
850 					return (EBADF);
851 				FILE_USE(fp);
852 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
853 					obits |= (1 << j);
854 					n++;
855 				}
856 				FILE_UNUSE(fp, l);
857 			}
858 			*obitp++ = obits;
859 		}
860 	}
861 	*retval = n;
862 	return (0);
863 }
864 
865 /*
866  * Poll system call.
867  */
868 int
869 sys_poll(struct lwp *l, void *v, register_t *retval)
870 {
871 	struct sys_poll_args /* {
872 		syscallarg(struct pollfd *)	fds;
873 		syscallarg(u_int)		nfds;
874 		syscallarg(int)			timeout;
875 	} */ * const uap = v;
876 	struct timeval	atv, *tv = NULL;
877 
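	/*
	 * A timeout of INFTIM (-1) leaves tv NULL, which pollcommon()
	 * takes to mean "block until an event or a signal"; otherwise the
	 * millisecond timeout is converted to a timeval.
	 */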
878 	if (SCARG(uap, timeout) != INFTIM) {
879 		atv.tv_sec = SCARG(uap, timeout) / 1000;
880 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
881 		tv = &atv;
882 	}
883 
884 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
885 		tv, NULL);
886 }
887 
888 /*
889  * Pollts system call: poll with a timespec timeout and a signal mask installed for the duration of the call.
890  */
891 int
892 sys_pollts(struct lwp *l, void *v, register_t *retval)
893 {
894 	struct sys_pollts_args /* {
895 		syscallarg(struct pollfd *)		fds;
896 		syscallarg(u_int)			nfds;
897 		syscallarg(const struct timespec *)	ts;
898 		syscallarg(const sigset_t *)		mask;
899 	} */ * const uap = v;
900 	struct timespec	ats;
901 	struct timeval	atv, *tv = NULL;
902 	sigset_t	amask, *mask = NULL;
903 	int		error;
904 
905 	if (SCARG(uap, ts)) {
906 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
907 		if (error)
908 			return error;
909 		atv.tv_sec = ats.tv_sec;
910 		atv.tv_usec = ats.tv_nsec / 1000;
911 		tv = &atv;
912 	}
913 	if (SCARG(uap, mask)) {
914 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
915 		if (error)
916 			return error;
917 		mask = &amask;
918 	}
919 
920 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
921 		tv, mask);
922 }
923 
924 int
925 pollcommon(struct lwp *l, register_t *retval,
926 	struct pollfd *u_fds, u_int nfds,
927 	struct timeval *tv, sigset_t *mask)
928 {
929 	struct proc	* const p = l->l_proc;
930 	caddr_t		bits;
931 	char		smallbits[32 * sizeof(struct pollfd)];
932 	sigset_t	oldmask;
933 	int		s, ncoll, error, timo;
934 	size_t		ni;
935 
936 	if (nfds > p->p_fd->fd_nfiles) {
937 		/* forgiving, but slightly wrong: silently clamp nfds to the size of the descriptor table instead of failing */
938 		nfds = p->p_fd->fd_nfiles;
939 	}
940 	ni = nfds * sizeof(struct pollfd);
941 	if (ni > sizeof(smallbits))
942 		bits = malloc(ni, M_TEMP, M_WAITOK);
943 	else
944 		bits = smallbits;
945 
946 	error = copyin(u_fds, bits, ni);
947 	if (error)
948 		goto done;
949 
950 	timo = 0;
951 	if (tv) {
952 		if (itimerfix(tv)) {
953 			error = EINVAL;
954 			goto done;
955 		}
956 		s = splclock();
957 		timeradd(tv, &time, tv);
958 		splx(s);
959 	}
960 	if (mask != NULL)
961 		(void)sigprocmask1(p, SIG_SETMASK, mask, &oldmask);
962 
963  retry:
964 	ncoll = nselcoll;
965 	l->l_flag |= L_SELECT;
966 	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
967 	if (error || *retval)
968 		goto done;
969 	if (tv) {
970 		/*
971 		 * We have to recalculate the timeout on every retry.
972 		 */
973 		timo = hzto(tv);
974 		if (timo <= 0)
975 			goto done;
976 	}
977 	s = splsched();
978 	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
979 		splx(s);
980 		goto retry;
981 	}
982 	l->l_flag &= ~L_SELECT;
983 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
984 	splx(s);
985 	if (error == 0)
986 		goto retry;
987  done:
988 	if (mask != NULL)
989 		(void)sigprocmask1(p, SIG_SETMASK, &oldmask, NULL);
990 	l->l_flag &= ~L_SELECT;
991 	/* poll is not restarted after signals... */
992 	if (error == ERESTART)
993 		error = EINTR;
994 	if (error == EWOULDBLOCK)
995 		error = 0;
996 	if (error == 0) {
997 		error = copyout(bits, u_fds, ni);
998 		if (error)
999 			goto out;
1000 	}
1001  out:
1002 	if (ni > sizeof(smallbits))
1003 		free(bits, M_TEMP);
1004 	return (error);
1005 }
1006 
1007 int
1008 pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
1009 {
1010 	struct proc	*p = l->l_proc;
1011 	struct filedesc	*fdp;
1012 	int		i, n;
1013 	struct file	*fp;
1014 
1015 	fdp = p->p_fd;
1016 	n = 0;
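	/*
	 * For each entry: a descriptor beyond the table or not open yields
	 * POLLNVAL, a negative descriptor is skipped with no events, and
	 * otherwise the file's fo_poll routine is asked about the requested
	 * events plus POLLERR and POLLHUP, which are always reported.
	 */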
1017 	for (i = 0; i < nfd; i++, fds++) {
1018 		if (fds->fd >= fdp->fd_nfiles) {
1019 			fds->revents = POLLNVAL;
1020 			n++;
1021 		} else if (fds->fd < 0) {
1022 			fds->revents = 0;
1023 		} else {
1024 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1025 				fds->revents = POLLNVAL;
1026 				n++;
1027 			} else {
1028 				FILE_USE(fp);
1029 				fds->revents = (*fp->f_ops->fo_poll)(fp,
1030 				    fds->events | POLLERR | POLLHUP, l);
1031 				if (fds->revents != 0)
1032 					n++;
1033 				FILE_UNUSE(fp, l);
1034 			}
1035 		}
1036 	}
1037 	*retval = n;
1038 	return (0);
1039 }
1040 
1041 /*ARGSUSED*/
1042 int
1043 seltrue(dev_t dev, int events, struct lwp *l)
1044 {
1045 
1046 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1047 }
1048 
1049 /*
1050  * Record a select request.
1051  */
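/*
 * A struct selinfo remembers only a single selecting process (sel_pid).
 * If a different process is already recorded and still appears to be
 * asleep on selwait, sel_collision is set instead and selwakeup() will
 * have to wake every selector and let them rescan; otherwise the record
 * is simply overwritten.
 */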
1052 void
1053 selrecord(struct lwp *selector, struct selinfo *sip)
1054 {
1055 	struct lwp	*l;
1056 	struct proc	*p;
1057 	pid_t		mypid;
1058 
1059 	mypid = selector->l_proc->p_pid;
1060 	if (sip->sel_pid == mypid)
1061 		return;
1062 	if (sip->sel_pid && (p = pfind(sip->sel_pid))) {
1063 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1064 			if (l->l_wchan == (caddr_t)&selwait) {
1065 				sip->sel_collision = 1;
1066 				return;
1067 			}
1068 		}
1069 	}
1070 
1071 	sip->sel_pid = mypid;
1072 }
1073 
1074 /*
1075  * Do a wakeup when a selectable event occurs.
1076  */
1077 void
1078 selwakeup(struct selinfo *sip)
1079 {
1081 	struct lwp *l;
1082 	struct proc *p;
1083 	int s;
1084 
1085 	if (sip->sel_pid == 0)
1086 		return;
1087 	if (sip->sel_collision) {
1088 		sip->sel_pid = 0;
1089 		nselcoll++;
1090 		sip->sel_collision = 0;
1091 		wakeup((caddr_t)&selwait);
1092 		return;
1093 	}
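	/*
	 * No collision: only the recorded process was selecting, so wake
	 * just its LWPs.  One still asleep on selwait is made runnable
	 * (or merely unslept if it is not in LSSLEEP); one between scan
	 * and sleep has L_SELECT cleared so its selcommon()/pollcommon()
	 * loop rescans instead of sleeping.
	 */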
1094 	p = pfind(sip->sel_pid);
1095 	sip->sel_pid = 0;
1096 	if (p != NULL) {
1097 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1098 			SCHED_LOCK(s);
1099 			if (l->l_wchan == (caddr_t)&selwait) {
1100 				if (l->l_stat == LSSLEEP)
1101 					setrunnable(l);
1102 				else
1103 					unsleep(l);
1104 			} else if (l->l_flag & L_SELECT)
1105 				l->l_flag &= ~L_SELECT;
1106 			SCHED_UNLOCK(s);
1107 		}
1108 	}
1109 }
1110