/*	$OpenBSD: sys_generic.c,v 1.151 2022/12/27 20:13:03 patrick Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Debug values:
 *	1 - print implementation errors, things that should not happen.
 *	2 - print ppoll(2) information, somewhat verbose
 *	3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}
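
/*
 * Illustrative userland sketch (not kernel code; `fd' is assumed to be an
 * open descriptor): a readv(2) call like the following is what eventually
 * reaches iovec_copyin() above.  Two iovecs fit within UIO_SMALLIOV, so
 * the caller-provided aiov array is used and no allocation takes place;
 * the summed length is capped at SSIZE_MAX so the syscall can still
 * report it as a non-negative ssize_t.
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 */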

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
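
/*
 * Illustrative userland sketch (not kernel code; assumes positioned reads
 * such as pread(2) reach this path with FO_POSITION set): the checks in
 * dofilereadv() above are what make positioned reads fail on non-seekable
 * objects.  A pipe is not backed by a vnode, so the DTYPE_VNODE test
 * rejects it:
 *
 *	char buf[64];
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	pread(pfd[0], buf, sizeof(buf), 0);	returns -1, errno == ESPIPE
 */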

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
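
/*
 * Illustrative userland sketch (not kernel code): the EPIPE branch in
 * dofilewritev() above posts SIGPIPE to the writing thread in addition to
 * returning the error, preserving the traditional pipe semantics.  With
 * the signal ignored, the failure becomes visible as an errno:
 *
 *	int pfd[2];
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	pipe(pfd);
 *	close(pfd[0]);				no readers remain
 *	write(pfd[1], "x", 1);			returns -1, errno == EPIPE
 */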

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}
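
/*
 * Worked example for the decoding above (standard <sys/ioccom.h> encoding
 * assumed): FIONBIO is defined as _IOW('f', 126, int), so
 * IOCPARM_LEN(FIONBIO) == sizeof(int) and the command word carries IOC_IN.
 * A userland call such as
 *
 *	int on = 1;
 *	ioctl(fd, FIONBIO, &on);
 *
 * therefore has the kernel copy in sizeof(int) bytes before fo_ioctl()
 * runs, with no copyout on the way back.
 */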

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}
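
/*
 * Illustrative userland sketch (not kernel code): the sigmask argument
 * lets a program unblock a signal and start waiting atomically, which a
 * separate sigprocmask()/select() pair cannot do without a race.  The
 * mask installed by dosigsuspend() in dopselect() below stays in effect
 * for the duration of the wait:
 *
 *	sigset_t empty;
 *
 *	sigemptyset(&empty);
 *	(SIGCHLD blocked and its flag checked here)
 *	pselect(nfds, &rfds, NULL, NULL, NULL, &empty);
 */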

int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}
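
/*
 * Worked example for the sizing logic in dopselect() above (assuming a
 * 32-bit fd_mask, i.e. NFDBITS == 32): with nd == 100,
 * ni = howmany(100, 32) * 4 == 16 bytes per descriptor set.  Since 16 is
 * larger than sizeof(bits[0]), all six sets (three input, three output)
 * come from a single zeroed 6 * 16 == 96 byte mallocarray() allocation;
 * for nd <= 32 the on-stack bits[6] array is used instead.
 */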

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case EPIPE:	/* Specific to pipes */
					KASSERT(kev.filter == EVFILT_WRITE);
					FD_SET(kev.ident, pobits[1]);
					(*ncollected)++;
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}
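
/*
 * Worked example for the ffs(3) walk in pselregister() above: with
 * bits == 0x12, ffs() returns 2, so j becomes 1 and fd = i + 1; that bit
 * is cleared and the next iteration has ffs() return 5, giving
 * fd = i + 4.  The loop therefore visits only the descriptors whose bits
 * are set instead of scanning all NFDBITS positions.
 */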

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select, only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}
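
/*
 * Worked example for the conversion in sys_poll() above: timeout == 1500
 * becomes ts = { .tv_sec = 1, .tv_nsec = 500000000 }, i.e. 1.5 seconds.
 * INFTIM (-1) leaves tsp NULL, which doppoll() below treats as an
 * indefinite sleep (INFSLP).
 */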

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		case EPIPE:	/* Specific to pipes */
			KASSERT(kevp->filter == EVFILT_WRITE);
			pl->revents |= POLLHUP;
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be done even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}
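
/*
 * Worked example for the udata scheme above: each kevent for the pollfd
 * at index i is registered with udata == p_kq_serial + i, and
 * ppollcollect() below recovers the index as
 *
 *	i = (unsigned long)kevp->udata - p->p_kq_serial;
 *
 * So with p_kq_serial == 0x100, an event carrying udata == 0x103 belongs
 * to pl[3]; a result outside [0, nfds) can only come from a stale event
 * and triggers the panic.
 */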

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}