/*	$OpenBSD: sys_generic.c,v 1.134 2020/12/26 14:26:48 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}
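
/*
 * Note (added commentary): DPRINTFN() expands to a bare if-statement
 * rather than the usual do { ... } while (0) wrapper, so a use such as
 *
 *	if (cond)
 *		DPRINTFN(1, "x\n");
 *	else
 *		...
 *
 * would bind the else to the macro's hidden if.  All calls in this file
 * are full statements, so this is harmless here.  Output is enabled by
 * raising kqpoll_debug above the desired level, e.g. from a debugger or
 * a source patch; this file defines no sysctl for it.
 */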

int pselregister(struct proc *, fd_set *[], int, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[]);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}
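
/*
 * Added commentary: an illustrative userspace sketch of the scatter
 * read handled below; given an open descriptor fd, the iovec lengths
 * validated by iovec_copyin() are what readv(2) may transfer in total:
 *
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 *
 * A short read fills hdr first, then body, and n reports the total;
 * summed lengths exceeding SSIZE_MAX fail with EINVAL, per the check
 * in iovec_copyin() above.
 */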

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}
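
/*
 * Added commentary: dofilereadv() above and dofilewritev() below
 * deliberately clear ERESTART/EINTR/EWOULDBLOCK once some bytes have
 * transferred, so userspace sees a short count instead of an error.
 * A portable caller therefore loops, e.g. (illustrative sketch):
 *
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	size_t off = 0;
 *	while (off < len) {
 *		ssize_t n = write(fd, buf + off, len - off);
 *		if (n == -1) {
 *			if (errno == EINTR)
 *				continue;
 *			break;
 *		}
 *		off += n;
 *	}
 */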

int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
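
/*
 * Added commentary: as dofilewritev() shows, a write that fails with
 * EPIPE also raises SIGPIPE on the writing thread.  A userspace
 * program that wants the EPIPE return instead of the (default fatal)
 * signal typically ignores it first, e.g. (illustrative sketch;
 * handle_closed_peer() is a placeholder for the caller's cleanup):
 *
 *	#include <errno.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(fd, buf, len) == -1 && errno == EPIPE)
 *		handle_closed_peer();
 */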

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

int selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}
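
/*
 * Added commentary: the point of the sigmask argument handled above is
 * atomicity.  The non-atomic equivalent has a classic race:
 *
 *	sigprocmask(SIG_UNBLOCK, &set, &oset);
 *	... a signal delivered here is missed by select() ...
 *	select(nfds, &rfd, NULL, NULL, NULL);
 *	sigprocmask(SIG_SETMASK, &oset, NULL);
 *
 * whereas pselect(2) installs the temporary mask and sleeps as one
 * operation (dosigsuspend() in dopselect() below), so a pending signal
 * reliably interrupts the sleep with EINTR.
 */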

int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init();

#define getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, nd, &nevents);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			*retval += pselcollect(p, &kev[i], pobits);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
done:
#define putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqueue_purge(p, p->p_kq);
	p->p_kq_serial += nd;

	return (error);
}
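
/*
 * Added commentary: dopselect() sizes its six bit vectors with
 * ni = howmany(nd, NFDBITS) * sizeof(fd_mask), i.e. just enough
 * fd_mask words to cover descriptors [0, nd).  For example, with
 * 32-bit fd_mask words (NFDBITS == 32 on OpenBSD), nd = 70 gives:
 *
 *	howmany(70, 32) = 3 words = 12 bytes per set
 *
 * and descriptor 69 lives at bit 69 % 32 = 5 of word 69 / 32 = 2,
 * which is exactly how the pselregister() loop below walks the sets.
 * The arithmetic holds for any fd_mask width.
 */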

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], int nfd, int *nregistered)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL,
				    evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
				if (KTRPOINT(p, KTR_STRUCT))
					ktrevent(p, &kev, 1);
#endif
				error = kqueue_register(p->p_kq, &kev, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
{
#ifdef DIAGNOSTIC
	/* Filter out and lazily delete spurious events */
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		DPRINTFN(0, "select fd %u mismatched serial %lu\n",
		    (int)kevp->ident, p->p_kq_serial);
		kevp->flags = EV_DISABLE|EV_DELETE;
		kqueue_register(p->p_kq, kevp, p);
		return (0);
	}
#endif

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (1);
}

int
seltrue(dev_t dev, int events, struct proc *p)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
selfalse(dev_t dev, int events, struct proc *p)
{

	return (0);
}
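
/*
 * Added commentary: seltrue() and selfalse() are stock helpers for
 * drivers' poll entry points.  A device that is always ready points
 * its poll routine at seltrue(), while one that can never become
 * ready uses selfalse(); an illustrative driver-local wrapper might
 * be (mydevpoll() is hypothetical, not a function in this file):
 *
 *	int
 *	mydevpoll(dev_t dev, int events, struct proc *p)
 *	{
 *		return (seltrue(dev, events, p));
 *	}
 */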

/*
 * Record a select request.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mytid;

	KERNEL_ASSERT_LOCKED();

	mytid = selector->p_tid;
	if (sip->si_seltid == mytid)
		return;
	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_seltid = mytid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}

void
doselwakeup(struct selinfo *sip)
{
	struct proc *p;

	KERNEL_ASSERT_LOCKED();

	if (sip->si_seltid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = tfind(sip->si_seltid);
	sip->si_seltid = 0;
	if (p != NULL) {
		if (wakeup_proc(p, &selwait)) {
			/* nothing else to do */
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
	}
}

void
pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, pl++) {
		/* Check the file descriptor. */
		if (pl->fd < 0) {
			pl->revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
			pl->revents = POLLNVAL;
			n++;
			continue;
		}
		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
		FRELE(fp, p);
		if (pl->revents != 0)
			n++;
	}
	*retval = n;
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}
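
/*
 * Added commentary: since pollout() writes back only revents, the fd
 * and events fields of the user's array are never modified, so a
 * pollfd array can be set up once and reused across calls, e.g.:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd[2] = {
 *		{ .fd = sock1, .events = POLLIN },
 *		{ .fd = sock2, .events = POLLIN | POLLOUT },
 *	};
 *	for (;;) {
 *		int n = poll(pfd, 2, 1000);
 *		... inspect pfd[i].revents; fd and events are intact ...
 *	}
 *
 * Negative fd entries are skipped by pollscan() with revents = 0,
 * which is the standard way to temporarily ignore a slot.
 */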

/*
 * We are using the same mechanism as select, only we encode/decode
 * the arguments differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	struct timespec elapsed, start, stop;
	uint64_t nsecs;
	int ncoll, i, s, error = 0;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;
	}

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

retry:
	ncoll = nselcoll;
	atomic_setbits_int(&p->p_flag, P_SELECT);
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (timeout == NULL || timespecisset(timeout)) {
		if (timeout != NULL) {
			getnanouptime(&start);
			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
		} else
			nsecs = INFSLP;
		s = splhigh();
		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
			splx(s);
			goto retry;
		}
		atomic_clearbits_int(&p->p_flag, P_SELECT);
		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
		splx(s);
		if (timeout != NULL) {
			getnanouptime(&stop);
			timespecsub(&stop, &start, &elapsed);
			timespecsub(timeout, &elapsed, timeout);
			if (timeout->tv_sec < 0)
				timespecclear(timeout);
		}
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
	}

done:
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);
	return (error);
}
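
/*
 * Added commentary: an illustrative userspace use of utrace(2), whose
 * kernel side follows.  With ktrace(1) running on the process, the
 * record shows up in kdump(1) output tagged with the given label
 * (struct mystate is a placeholder for whatever the program logs;
 * the call is a no-op unless the process is being ktraced):
 *
 *	#include <sys/types.h>
 *	#include <sys/ktrace.h>
 *
 *	struct mystate st = { ... };
 *	utrace("mystate", &st, sizeof(st));
 */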

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}