xref: /openbsd-src/sys/kern/sys_generic.c (revision aa5e9e10509ffd51558f081f01cd78bfa3c4f2a5)
1 /*	$OpenBSD: sys_generic.c,v 1.81 2013/06/01 16:27:37 tedu Exp $	*/
2 /*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1996 Theo de Raadt
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/file.h>
46 #include <sys/proc.h>
47 #include <sys/resourcevar.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/uio.h>
51 #include <sys/kernel.h>
52 #include <sys/stat.h>
53 #include <sys/malloc.h>
54 #include <sys/poll.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 #include <sys/sched.h>
59 
60 #include <sys/mount.h>
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
66 void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
67 int pollout(struct pollfd *, struct pollfd *, u_int);
68 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
69     const struct timespec *, const sigset_t *, register_t *);
70 int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
71     const sigset_t *, register_t *);
72 
73 /*
74  * Read system call.
75  */
76 /* ARGSUSED */
77 int
78 sys_read(struct proc *p, void *v, register_t *retval)
79 {
80 	struct sys_read_args /* {
81 		syscallarg(int) fd;
82 		syscallarg(void *) buf;
83 		syscallarg(size_t) nbyte;
84 	} */ *uap = v;
85 	struct iovec iov;
86 	int fd = SCARG(uap, fd);
87 	struct file *fp;
88 	struct filedesc *fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 	if ((fp->f_flag & FREAD) == 0)
93 		return (EBADF);
94 
95 	iov.iov_base = SCARG(uap, buf);
96 	iov.iov_len = SCARG(uap, nbyte);
97 
98 	FREF(fp);
99 
100 	/* dofilereadv() will FRELE the descriptor for us */
101 	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
102 }
103 
104 /*
105  * Scatter read system call.
106  */
107 int
108 sys_readv(struct proc *p, void *v, register_t *retval)
109 {
110 	struct sys_readv_args /* {
111 		syscallarg(int) fd;
112 		syscallarg(const struct iovec *) iovp;
113 		syscallarg(int) iovcnt;
114 	} */ *uap = v;
115 	int fd = SCARG(uap, fd);
116 	struct file *fp;
117 	struct filedesc *fdp = p->p_fd;
118 
119 	if ((fp = fd_getfile(fdp, fd)) == NULL)
120 		return (EBADF);
121 	if ((fp->f_flag & FREAD) == 0)
122 		return (EBADF);
123 
124 	FREF(fp);
125 
126 	/* dofilereadv() will FRELE the descriptor for us */
127 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
128 	    &fp->f_offset, retval));
129 }
130 
131 int
132 dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
133     int iovcnt, int userspace, off_t *offset, register_t *retval)
134 {
135 	struct iovec aiov[UIO_SMALLIOV];
136 	struct uio auio;
137 	struct iovec *iov;
138 	struct iovec *needfree = NULL;
139 	long i, cnt, error = 0;
140 	u_int iovlen;
141 #ifdef KTRACE
142 	struct iovec *ktriov = NULL;
143 #endif
144 
145 	/* note: can't use iovlen until iovcnt is validated */
146 	iovlen = iovcnt * sizeof(struct iovec);
147 
148 	/*
149 	 * If the iovec array exists in userspace, it needs to be copied in;
150 	 * otherwise, it can be used directly.
151 	 */
152 	if (userspace) {
153 		if ((u_int)iovcnt > UIO_SMALLIOV) {
154 			if ((u_int)iovcnt > IOV_MAX) {
155 				error = EINVAL;
156 				goto out;
157 			}
158 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
159 		} else if ((u_int)iovcnt > 0) {
160 			iov = aiov;
161 			needfree = NULL;
162 		} else {
163 			error = EINVAL;
164 			goto out;
165 		}
166 		if ((error = copyin(iovp, iov, iovlen)))
167 			goto done;
168 	} else {
169 		iov = (struct iovec *)iovp;		/* de-constify */
170 	}
171 
172 	auio.uio_iov = iov;
173 	auio.uio_iovcnt = iovcnt;
174 	auio.uio_rw = UIO_READ;
175 	auio.uio_segflg = UIO_USERSPACE;
176 	auio.uio_procp = p;
177 	auio.uio_resid = 0;
178 	for (i = 0; i < iovcnt; i++) {
179 		auio.uio_resid += iov->iov_len;
180 		/*
181 		 * Reads return ssize_t because -1 is returned on error.
182 		 * Therefore we must restrict the length to SSIZE_MAX to
183 		 * avoid garbage return values.  Note that the addition is
184 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
185 		 */
186 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
187 			error = EINVAL;
188 			goto done;
189 		}
190 		iov++;
191 	}
192 #ifdef KTRACE
193 	/*
194 	 * if tracing, save a copy of iovec
195 	 */
196 	if (KTRPOINT(p, KTR_GENIO)) {
197 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
198 		bcopy(auio.uio_iov, ktriov, iovlen);
199 	}
200 #endif
201 	cnt = auio.uio_resid;
202 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
203 	if (error)
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 
209 	fp->f_rxfer++;
210 	fp->f_rbytes += cnt;
211 #ifdef KTRACE
212 	if (ktriov != NULL) {
213 		if (error == 0)
214 			ktrgenio(p, fd, UIO_READ, ktriov, cnt,
215 			    error);
216 		free(ktriov, M_TEMP);
217 	}
218 #endif
219 	*retval = cnt;
220  done:
221 	if (needfree)
222 		free(needfree, M_IOV);
223  out:
224 	FRELE(fp, p);
225 	return (error);
226 }
227 
228 /*
229  * Write system call
230  */
231 int
232 sys_write(struct proc *p, void *v, register_t *retval)
233 {
234 	struct sys_write_args /* {
235 		syscallarg(int) fd;
236 		syscallarg(const void *) buf;
237 		syscallarg(size_t) nbyte;
238 	} */ *uap = v;
239 	struct iovec iov;
240 	int fd = SCARG(uap, fd);
241 	struct file *fp;
242 	struct filedesc *fdp = p->p_fd;
243 
244 	if ((fp = fd_getfile(fdp, fd)) == NULL)
245 		return (EBADF);
246 	if ((fp->f_flag & FWRITE) == 0)
247 		return (EBADF);
248 
249 	iov.iov_base = (void *)SCARG(uap, buf);
250 	iov.iov_len = SCARG(uap, nbyte);
251 
252 	FREF(fp);
253 
254 	/* dofilewritev() will FRELE the descriptor for us */
255 	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
256 }
257 
258 /*
259  * Gather write system call
260  */
261 int
262 sys_writev(struct proc *p, void *v, register_t *retval)
263 {
264 	struct sys_writev_args /* {
265 		syscallarg(int) fd;
266 		syscallarg(const struct iovec *) iovp;
267 		syscallarg(int) iovcnt;
268 	} */ *uap = v;
269 	int fd = SCARG(uap, fd);
270 	struct file *fp;
271 	struct filedesc *fdp = p->p_fd;
272 
273 	if ((fp = fd_getfile(fdp, fd)) == NULL)
274 		return (EBADF);
275 	if ((fp->f_flag & FWRITE) == 0)
276 		return (EBADF);
277 
278 	FREF(fp);
279 
280 	/* dofilewritev() will FRELE the descriptor for us */
281 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
282 	    &fp->f_offset, retval));
283 }
284 
285 int
286 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
287     int iovcnt, int userspace, off_t *offset, register_t *retval)
288 {
289 	struct iovec aiov[UIO_SMALLIOV];
290 	struct uio auio;
291 	struct iovec *iov;
292 	struct iovec *needfree = NULL;
293 	long i, cnt, error = 0;
294 	u_int iovlen;
295 #ifdef KTRACE
296 	struct iovec *ktriov = NULL;
297 #endif
298 
299 	/* note: can't use iovlen until iovcnt is validated */
300 	iovlen = iovcnt * sizeof(struct iovec);
301 
302 	/*
303 	 * If the iovec array exists in userspace, it needs to be copied in;
304 	 * otherwise, it can be used directly.
305 	 */
306 	if (userspace) {
307 		if ((u_int)iovcnt > UIO_SMALLIOV) {
308 			if ((u_int)iovcnt > IOV_MAX) {
309 				error = EINVAL;
310 				goto out;
311 			}
312 			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
313 		} else if ((u_int)iovcnt > 0) {
314 			iov = aiov;
315 			needfree = NULL;
316 		} else {
317 			error = EINVAL;
318 			goto out;
319 		}
320 		if ((error = copyin(iovp, iov, iovlen)))
321 			goto done;
322 	} else {
323 		iov = (struct iovec *)iovp;		/* de-constify */
324 	}
325 
326 	auio.uio_iov = iov;
327 	auio.uio_iovcnt = iovcnt;
328 	auio.uio_rw = UIO_WRITE;
329 	auio.uio_segflg = UIO_USERSPACE;
330 	auio.uio_procp = p;
331 	auio.uio_resid = 0;
332 	for (i = 0; i < iovcnt; i++) {
333 		auio.uio_resid += iov->iov_len;
334 		/*
335 		 * Writes return ssize_t because -1 is returned on error.
336 		 * Therefore we must restrict the length to SSIZE_MAX to
337 		 * avoid garbage return values.  Note that the addition is
338 		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
339 		 */
340 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
341 			error = EINVAL;
342 			goto done;
343 		}
344 		iov++;
345 	}
346 #ifdef KTRACE
347 	/*
348 	 * if tracing, save a copy of iovec
349 	 */
350 	if (KTRPOINT(p, KTR_GENIO)) {
351 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
352 		bcopy(auio.uio_iov, ktriov, iovlen);
353 	}
354 #endif
355 	cnt = auio.uio_resid;
356 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
357 	if (error) {
358 		if (auio.uio_resid != cnt && (error == ERESTART ||
359 		    error == EINTR || error == EWOULDBLOCK))
360 			error = 0;
361 		if (error == EPIPE)
362 			ptsignal(p, SIGPIPE, STHREAD);
363 	}
364 	cnt -= auio.uio_resid;
365 
366 	fp->f_wxfer++;
367 	fp->f_wbytes += cnt;
368 #ifdef KTRACE
369 	if (ktriov != NULL) {
370 		if (error == 0)
371 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
372 		free(ktriov, M_TEMP);
373 	}
374 #endif
375 	*retval = cnt;
376  done:
377 	if (needfree)
378 		free(needfree, M_IOV);
379  out:
380 	FRELE(fp, p);
381 	return (error);
382 }
383 
384 /*
385  * Ioctl system call
386  */
387 /* ARGSUSED */
388 int
389 sys_ioctl(struct proc *p, void *v, register_t *retval)
390 {
391 	struct sys_ioctl_args /* {
392 		syscallarg(int) fd;
393 		syscallarg(u_long) com;
394 		syscallarg(void *) data;
395 	} */ *uap = v;
396 	struct file *fp;
397 	struct filedesc *fdp;
398 	u_long com;
399 	int error;
400 	u_int size;
401 	caddr_t data, memp;
402 	int tmp;
403 #define STK_PARAMS	128
404 	long long stkbuf[STK_PARAMS / sizeof(long long)];
405 
406 	fdp = p->p_fd;
407 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
408 		return (EBADF);
409 
410 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
411 		return (EBADF);
412 
413 	switch (com = SCARG(uap, com)) {
414 	case FIONCLEX:
415 	case FIOCLEX:
416 		fdplock(fdp);
417 		if (com == FIONCLEX)
418 			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
419 		else
420 			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
421 		fdpunlock(fdp);
422 		return (0);
423 	}
424 
425 	/*
426 	 * Interpret high order word to find amount of data to be
427 	 * copied to/from the user's address space.
428 	 */
429 	size = IOCPARM_LEN(com);
430 	if (size > IOCPARM_MAX)
431 		return (ENOTTY);
432 	FREF(fp);
433 	memp = NULL;
434 	if (size > sizeof (stkbuf)) {
435 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
436 		data = memp;
437 	} else
438 		data = (caddr_t)stkbuf;
439 	if (com&IOC_IN) {
440 		if (size) {
441 			error = copyin(SCARG(uap, data), data, (u_int)size);
442 			if (error) {
443 				goto out;
444 			}
445 		} else
446 			*(caddr_t *)data = SCARG(uap, data);
447 	} else if ((com&IOC_OUT) && size)
448 		/*
449 		 * Zero the buffer so the user always
450 		 * gets back something deterministic.
451 		 */
452 		bzero(data, size);
453 	else if (com&IOC_VOID)
454 		*(caddr_t *)data = SCARG(uap, data);
455 
456 	switch (com) {
457 
458 	case FIONBIO:
459 		if ((tmp = *(int *)data) != 0)
460 			fp->f_flag |= FNONBLOCK;
461 		else
462 			fp->f_flag &= ~FNONBLOCK;
463 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
464 		break;
465 
466 	case FIOASYNC:
467 		if ((tmp = *(int *)data) != 0)
468 			fp->f_flag |= FASYNC;
469 		else
470 			fp->f_flag &= ~FASYNC;
471 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
472 		break;
473 
474 	case FIOSETOWN:
475 		tmp = *(int *)data;
476 		if (fp->f_type == DTYPE_SOCKET) {
477 			struct socket *so = (struct socket *)fp->f_data;
478 
479 			so->so_pgid = tmp;
480 			so->so_siguid = p->p_cred->p_ruid;
481 			so->so_sigeuid = p->p_ucred->cr_uid;
482 			error = 0;
483 			break;
484 		}
485 		if (tmp <= 0) {
486 			tmp = -tmp;
487 		} else {
488 			struct process *pr = prfind(tmp);
489 			if (pr == NULL) {
490 				error = ESRCH;
491 				break;
492 			}
493 			tmp = pr->ps_pgrp->pg_id;
494 		}
495 		error = (*fp->f_ops->fo_ioctl)
496 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
497 		break;
498 
499 	case FIOGETOWN:
500 		if (fp->f_type == DTYPE_SOCKET) {
501 			error = 0;
502 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
503 			break;
504 		}
505 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
506 		*(int *)data = -*(int *)data;
507 		break;
508 
509 	default:
510 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
511 		break;
512 	}
513 	/*
514 	 * Copy any data to user, size was
515 	 * already set and checked above.
516 	 */
517 	if (error == 0 && (com&IOC_OUT) && size)
518 		error = copyout(data, SCARG(uap, data), (u_int)size);
519 out:
520 	FRELE(fp, p);
521 	if (memp)
522 		free(memp, M_IOCTLOPS);
523 	return (error);
524 }
525 
/* Its address is the sleep/wakeup channel used by select and poll. */
int	selwait;
/* Bumped by selwakeup() on a collision; sampled by the select/poll loops. */
int	nselcoll;
527 
528 /*
529  * Select system call.
530  */
531 int
532 sys_select(struct proc *p, void *v, register_t *retval)
533 {
534 	struct sys_select_args /* {
535 		syscallarg(int) nd;
536 		syscallarg(fd_set *) in;
537 		syscallarg(fd_set *) ou;
538 		syscallarg(fd_set *) ex;
539 		syscallarg(struct timeval *) tv;
540 	} */ *uap = v;
541 
542 	struct timespec ts, *tsp = NULL;
543 	int error;
544 
545 	if (SCARG(uap, tv) != NULL) {
546 		struct timeval tv;
547 		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
548 			return (error);
549 		if ((error = itimerfix(&tv)) != 0)
550 			return (error);
551 #ifdef KTRACE
552 		if (KTRPOINT(p, KTR_STRUCT))
553 			ktrreltimeval(p, &tv);
554 #endif
555 		TIMEVAL_TO_TIMESPEC(&tv, &ts);
556 		tsp = &ts;
557 	}
558 
559 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
560 	    SCARG(uap, ex), tsp, NULL, retval));
561 }
562 
563 int
564 sys_pselect(struct proc *p, void *v, register_t *retval)
565 {
566 	struct sys_pselect_args /* {
567 		syscallarg(int) nd;
568 		syscallarg(fd_set *) in;
569 		syscallarg(fd_set *) ou;
570 		syscallarg(fd_set *) ex;
571 		syscallarg(const struct timespec *) ts;
572 		syscallarg(const sigset_t *) mask;
573 	} */ *uap = v;
574 
575 	struct timespec ts, *tsp = NULL;
576 	sigset_t ss, *ssp = NULL;
577 	int error;
578 
579 	if (SCARG(uap, ts) != NULL) {
580 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
581 			return (error);
582 		if ((error = timespecfix(&ts)) != 0)
583 			return (error);
584 #ifdef KTRACE
585 		if (KTRPOINT(p, KTR_STRUCT))
586 			ktrreltimespec(p, &ts);
587 #endif
588 		tsp = &ts;
589 	}
590 	if (SCARG(uap, mask) != NULL) {
591 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
592 			return (error);
593 		ssp = &ss;
594 	}
595 
596 	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
597 	    SCARG(uap, ex), tsp, ssp, retval));
598 }
599 
600 int
601 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
602     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
603 {
604 	fd_mask bits[6];
605 	fd_set *pibits[3], *pobits[3];
606 	struct timespec ats, rts, tts;
607 	int s, ncoll, error = 0, timo;
608 	u_int ni;
609 
610 	if (nd < 0)
611 		return (EINVAL);
612 	if (nd > p->p_fd->fd_nfiles) {
613 		/* forgiving; slightly wrong */
614 		nd = p->p_fd->fd_nfiles;
615 	}
616 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
617 	if (ni > sizeof(bits[0])) {
618 		caddr_t mbits;
619 
620 		mbits = malloc(ni * 6, M_TEMP, M_WAITOK|M_ZERO);
621 		pibits[0] = (fd_set *)&mbits[ni * 0];
622 		pibits[1] = (fd_set *)&mbits[ni * 1];
623 		pibits[2] = (fd_set *)&mbits[ni * 2];
624 		pobits[0] = (fd_set *)&mbits[ni * 3];
625 		pobits[1] = (fd_set *)&mbits[ni * 4];
626 		pobits[2] = (fd_set *)&mbits[ni * 5];
627 	} else {
628 		bzero(bits, sizeof(bits));
629 		pibits[0] = (fd_set *)&bits[0];
630 		pibits[1] = (fd_set *)&bits[1];
631 		pibits[2] = (fd_set *)&bits[2];
632 		pobits[0] = (fd_set *)&bits[3];
633 		pobits[1] = (fd_set *)&bits[4];
634 		pobits[2] = (fd_set *)&bits[5];
635 	}
636 
637 #define	getbits(name, x) \
638 	if (name && (error = copyin(name, pibits[x], ni))) \
639 		goto done;
640 	getbits(in, 0);
641 	getbits(ou, 1);
642 	getbits(ex, 2);
643 #undef	getbits
644 #ifdef KTRACE
645 	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
646 		if (in) ktrfdset(p, pibits[0], ni);
647 		if (ou) ktrfdset(p, pibits[1], ni);
648 		if (ex) ktrfdset(p, pibits[2], ni);
649 	}
650 #endif
651 
652 	if (tsp) {
653 		getnanouptime(&rts);
654 		timespecadd(tsp, &rts, &ats);
655 	} else {
656 		ats.tv_sec = 0;
657 		ats.tv_nsec = 0;
658 	}
659 	timo = 0;
660 
661 	if (sigmask) {
662 		p->p_oldmask = p->p_sigmask;
663 		atomic_setbits_int(&p->p_flag, P_SIGSUSPEND);
664 		p->p_sigmask = *sigmask &~ sigcantmask;
665 	}
666 
667 retry:
668 	ncoll = nselcoll;
669 	atomic_setbits_int(&p->p_flag, P_SELECT);
670 	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
671 	if (error || *retval)
672 		goto done;
673 	if (tsp) {
674 		getnanouptime(&rts);
675 		if (timespeccmp(&rts, &ats, >=))
676 			goto done;
677 		timespecsub(&ats, &rts, &tts);
678 		timo = tts.tv_sec > 24 * 60 * 60 ?
679 			24 * 60 * 60 * hz : tstohz(&tts);
680 	}
681 	s = splhigh();
682 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
683 		splx(s);
684 		goto retry;
685 	}
686 	atomic_clearbits_int(&p->p_flag, P_SELECT);
687 	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
688 	splx(s);
689 	if (error == 0)
690 		goto retry;
691 done:
692 	atomic_clearbits_int(&p->p_flag, P_SELECT);
693 	/* select is not restarted after signals... */
694 	if (error == ERESTART)
695 		error = EINTR;
696 	if (error == EWOULDBLOCK)
697 		error = 0;
698 #define	putbits(name, x) \
699 	if (name && (error2 = copyout(pobits[x], name, ni))) \
700 		error = error2;
701 	if (error == 0) {
702 		int error2;
703 
704 		putbits(in, 0);
705 		putbits(ou, 1);
706 		putbits(ex, 2);
707 #undef putbits
708 #ifdef KTRACE
709 		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
710 			if (in) ktrfdset(p, pobits[0], ni);
711 			if (ou) ktrfdset(p, pobits[1], ni);
712 			if (ex) ktrfdset(p, pobits[2], ni);
713 		}
714 #endif
715 	}
716 
717 	if (pibits[0] != (fd_set *)&bits[0])
718 		free(pibits[0], M_TEMP);
719 	return (error);
720 }
721 
722 int
723 selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
724     register_t *retval)
725 {
726 	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
727 	struct filedesc *fdp = p->p_fd;
728 	int msk, i, j, fd;
729 	fd_mask bits;
730 	struct file *fp;
731 	int n = 0;
732 	static const int flag[3] = { POLLIN, POLLOUT, POLLPRI };
733 
734 	for (msk = 0; msk < 3; msk++) {
735 		fd_set *pibits = (fd_set *)&cibits[msk*ni];
736 		fd_set *pobits = (fd_set *)&cobits[msk*ni];
737 
738 		for (i = 0; i < nfd; i += NFDBITS) {
739 			bits = pibits->fds_bits[i/NFDBITS];
740 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
741 				bits &= ~(1 << j);
742 				if ((fp = fd_getfile(fdp, fd)) == NULL)
743 					return (EBADF);
744 				FREF(fp);
745 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
746 					FD_SET(fd, pobits);
747 					n++;
748 				}
749 				FRELE(fp, p);
750 			}
751 		}
752 	}
753 	*retval = n;
754 	return (0);
755 }
756 
/*
 * Poll routine that reports every requested normal read/write event as
 * ready: returns `events' restricted to POLLIN, POLLOUT, POLLRDNORM and
 * POLLWRNORM.  dev and p are not used.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
764 
/*
 * Poll routine that never reports anything as ready; all arguments are
 * ignored and 0 is returned.
 */
int
selfalse(dev_t dev, int events, struct proc *p)
{
	const int nothing_ready = 0;

	return (nothing_ready);
}
771 
772 /*
773  * Record a select request.
774  */
775 void
776 selrecord(struct proc *selector, struct selinfo *sip)
777 {
778 	struct proc *p;
779 	pid_t mypid;
780 
781 	mypid = selector->p_pid;
782 	if (sip->si_selpid == mypid)
783 		return;
784 	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
785 	    p->p_wchan == (caddr_t)&selwait)
786 		sip->si_flags |= SI_COLL;
787 	else
788 		sip->si_selpid = mypid;
789 }
790 
791 /*
792  * Do a wakeup when a selectable event occurs.
793  */
794 void
795 selwakeup(struct selinfo *sip)
796 {
797 	struct proc *p;
798 	int s;
799 
800 	KNOTE(&sip->si_note, 0);
801 	if (sip->si_selpid == 0)
802 		return;
803 	if (sip->si_flags & SI_COLL) {
804 		nselcoll++;
805 		sip->si_flags &= ~SI_COLL;
806 		wakeup(&selwait);
807 	}
808 	p = pfind(sip->si_selpid);
809 	sip->si_selpid = 0;
810 	if (p != NULL) {
811 		SCHED_LOCK(s);
812 		if (p->p_wchan == (caddr_t)&selwait) {
813 			if (p->p_stat == SSLEEP)
814 				setrunnable(p);
815 			else
816 				unsleep(p);
817 		} else if (p->p_flag & P_SELECT)
818 			atomic_clearbits_int(&p->p_flag, P_SELECT);
819 		SCHED_UNLOCK(s);
820 	}
821 }
822 
823 void
824 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
825 {
826 	struct filedesc *fdp = p->p_fd;
827 	struct file *fp;
828 	u_int i;
829 	int n = 0;
830 
831 	for (i = 0; i < nfd; i++, pl++) {
832 		/* Check the file descriptor. */
833 		if (pl->fd < 0) {
834 			pl->revents = 0;
835 			continue;
836 		}
837 		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
838 			pl->revents = POLLNVAL;
839 			n++;
840 			continue;
841 		}
842 		FREF(fp);
843 		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
844 		FRELE(fp, p);
845 		if (pl->revents != 0)
846 			n++;
847 	}
848 	*retval = n;
849 }
850 
851 /*
852  * Only copyout the revents field.
853  */
854 int
855 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
856 {
857 	int error = 0;
858 	u_int i = 0;
859 
860 	while (!error && i++ < nfds) {
861 		error = copyout(&pl->revents, &upl->revents,
862 		    sizeof(upl->revents));
863 		pl++;
864 		upl++;
865 	}
866 
867 	return (error);
868 }
869 
870 /*
871  * We are using the same mechanism as select only we encode/decode args
872  * differently.
873  */
874 int
875 sys_poll(struct proc *p, void *v, register_t *retval)
876 {
877 	struct sys_poll_args /* {
878 		syscallarg(struct pollfd *) fds;
879 		syscallarg(u_int) nfds;
880 		syscallarg(int) timeout;
881 	} */ *uap = v;
882 
883 	struct timespec ts, *tsp = NULL;
884 	int msec = SCARG(uap, timeout);
885 
886 	if (msec != INFTIM) {
887 		if (msec < 0)
888 			return (EINVAL);
889 		ts.tv_sec = msec / 1000;
890 		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
891 		tsp = &ts;
892 	}
893 
894 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
895 	    retval));
896 }
897 
898 int
899 sys_ppoll(struct proc *p, void *v, register_t *retval)
900 {
901 	struct sys_ppoll_args /* {
902 		syscallarg(struct pollfd *) fds;
903 		syscallarg(u_int) nfds;
904 		syscallarg(const struct timespec *) ts;
905 		syscallarg(const sigset_t *) mask;
906 	} */ *uap = v;
907 
908 	int error;
909 	struct timespec ts, *tsp = NULL;
910 	sigset_t ss, *ssp = NULL;
911 
912 	if (SCARG(uap, ts) != NULL) {
913 		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
914 			return (error);
915 		if ((error = timespecfix(&ts)) != 0)
916 			return (error);
917 #ifdef KTRACE
918 		if (KTRPOINT(p, KTR_STRUCT))
919 			ktrreltimespec(p, &ts);
920 #endif
921 		tsp = &ts;
922 	}
923 
924 	if (SCARG(uap, mask) != NULL) {
925 		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
926 			return (error);
927 		ssp = &ss;
928 	}
929 
930 	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
931 	    retval));
932 }
933 
934 int
935 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
936     const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
937 {
938 	size_t sz;
939 	struct pollfd pfds[4], *pl = pfds;
940 	struct timespec ats, rts, tts;
941 	int timo, ncoll, i, s, error;
942 	extern int nselcoll, selwait;
943 
944 	/* Standards say no more than MAX_OPEN; this is possibly better. */
945 	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
946 		return (EINVAL);
947 
948 	sz = sizeof(struct pollfd) * nfds;
949 
950 	/* optimize for the default case, of a small nfds value */
951 	if (sz > sizeof(pfds))
952 		pl = (struct pollfd *) malloc(sz, M_TEMP, M_WAITOK);
953 
954 	if ((error = copyin(fds, pl, sz)) != 0)
955 		goto bad;
956 
957 	for (i = 0; i < nfds; i++)
958 		pl[i].revents = 0;
959 
960 	if (tsp != NULL) {
961 		getnanouptime(&rts);
962 		timespecadd(tsp, &rts, &ats);
963 	} else {
964 		ats.tv_sec = 0;
965 		ats.tv_nsec = 0;
966 	}
967 	timo = 0;
968 
969 	if (sigmask) {
970 		p->p_oldmask = p->p_sigmask;
971 		atomic_setbits_int(&p->p_flag, P_SIGSUSPEND);
972 		p->p_sigmask = *sigmask &~ sigcantmask;
973 	}
974 
975 retry:
976 	ncoll = nselcoll;
977 	atomic_setbits_int(&p->p_flag, P_SELECT);
978 	pollscan(p, pl, nfds, retval);
979 	if (*retval)
980 		goto done;
981 	if (tsp != NULL) {
982 		getnanouptime(&rts);
983 		if (timespeccmp(&rts, &ats, >=))
984 			goto done;
985 		timespecsub(&ats, &rts, &tts);
986 		timo = tts.tv_sec > 24 * 60 * 60 ?
987 			24 * 60 * 60 * hz : tstohz(&tts);
988 	}
989 	s = splhigh();
990 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
991 		splx(s);
992 		goto retry;
993 	}
994 	atomic_clearbits_int(&p->p_flag, P_SELECT);
995 	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
996 	splx(s);
997 	if (error == 0)
998 		goto retry;
999 
1000 done:
1001 	atomic_clearbits_int(&p->p_flag, P_SELECT);
1002 	/*
1003 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1004 	 *       ignored (since the whole point is to see what would block).
1005 	 */
1006 	switch (error) {
1007 	case ERESTART:
1008 		error = pollout(pl, fds, nfds);
1009 		if (error == 0)
1010 			error = EINTR;
1011 		break;
1012 	case EWOULDBLOCK:
1013 	case 0:
1014 		error = pollout(pl, fds, nfds);
1015 		break;
1016 	}
1017 bad:
1018 	if (pl != pfds)
1019 		free(pl, M_TEMP);
1020 	return (error);
1021 }
1022 
/*
 * utrace system call
 *
 * With KTRACE compiled in, forwards the label/address/length triple to
 * ktruser(); without it the call is a no-op that returns 0.
 */
/* ARGSUSED */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifndef KTRACE
	return (0);
#else
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#endif
}
1042