/*	$OpenBSD: sys_generic.c,v 1.156 2023/05/09 14:22:17 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
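
/*
 * DPRINTFN(v, ...) prints only when kqpoll_debug is strictly greater than
 * `v', so for example the DPRINTFN(1, ...) messages in the ppoll(2) path
 * appear once kqpoll_debug is set to 2 or higher.
 */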
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}
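
/*
 * Illustrative sketch (userland view, not kernel code): the scatter read
 * path below services readv(2) calls such as the following, where a single
 * system call fills several separate buffers in order.  The buffer names
 * are made up for the example.
 *
 *	char hdr[16], body[1024];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 */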

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}
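
/*
 * dofilewritev() is the common backend for write(2) and writev(2).  When
 * FO_POSITION is set (positioned I/O), the checks below require a seekable
 * vnode: anything that is not a vnode, as well as FIFOs and ttys, fails
 * with ESPIPE, and a negative offset fails with EINVAL except on character
 * devices.
 */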
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
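	/*
	 * For example (illustrative, using a standard request): TIOCGWINSZ
	 * is defined as _IOR('t', 104, struct winsize), so IOCPARM_LEN()
	 * extracts sizeof(struct winsize) from the encoded request, and its
	 * IOC_OUT direction bit makes the result get copied back out below.
	 */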
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}
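
/*
 * Common backend for select(2) and pselect(2).  The descriptor sets are
 * copied in, each set bit is registered as a kevent on the per-thread
 * kqueue (pselregister()), the kqueue is scanned with the caller's timeout,
 * and ready events are converted back into output fd_sets (pselcollect())
 * before being copied out.
 */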
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nfiles, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);

	nfiles = READ_ONCE(p->p_fd->fd_nfiles);
	if (nd > nfiles)
		nd = nfiles;

	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask) {
		KERNEL_LOCK();
		dosigsuspend(p, *sigmask &~ sigcantmask);
		KERNEL_UNLOCK();
	}

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
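/*
 * Rough mapping (illustrative): a bit set in the read set becomes an
 * EVFILT_READ kevent, the write set becomes EVFILT_WRITE, and the except
 * set becomes EVFILT_EXCEPT with NOTE_OOB.  For instance, fd 5 set in the
 * read set registers roughly:
 *
 *	EV_SET(&kev, 5, EVFILT_READ, EV_ADD|EV_ENABLE|__EV_SELECT, 0, 0,
 *	    (void *)(p->p_kq_serial));
 *
 * The udata serial is how pselcollect() sanity-checks that a collected
 * event belongs to the current call.
 */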
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	knote_locked(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select, only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}
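
/*
 * Common backend for poll(2) and ppoll(2).  The pollfd array is copied in,
 * each descriptor is turned into up to three kevents on the per-thread
 * kqueue (ppollregister()), the kqueue is scanned with the caller's
 * timeout, and collected events are folded back into revents
 * (ppollcollect()); only the revents fields are copied back out.
 */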
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask) {
		KERNEL_LOCK();
		dosigsuspend(p, *sigmask &~ sigcantmask);
		KERNEL_UNLOCK();
	}

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
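/*
 * Rough mapping (illustrative): POLLIN/POLLRDNORM request EVFILT_READ,
 * POLLOUT/POLLWRNORM request EVFILT_WRITE, and POLLPRI/POLLRDBAND request
 * EVFILT_EXCEPT with NOTE_OOB.  A pollfd that requests no events (or only
 * POLLHUP) still registers a plain EVFILT_EXCEPT filter so that hangup can
 * be reported, which is the `forcehup' case below.
 */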
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be done even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}