/*	$OpenBSD: sys_generic.c,v 1.157 2024/04/10 10:05:26 claudio Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
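/*
 * Note: DPRINTFN(v, ...) below prints only when kqpoll_debug > v, so the
 * levels in the table above correspond to DPRINTFN(0), DPRINTFN(1) and
 * DPRINTFN(2) call sites respectively.
 */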
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}
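/*
 * Note: sys_read() above wraps the user buffer in a single-element iovec
 * so that read(2) and readv(2) share dofilereadv().  From userland the
 * two are equivalent in the sense that (illustrative sketch, not kernel
 * code) read(fd, buf, n) behaves like
 * readv(fd, &(struct iovec){ buf, n }, 1).
 */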
/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}
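/*
 * Common backend for write(2) and writev(2): validate positioned writes,
 * perform the transfer through the file's fo_write handler, account the
 * transfer statistics and post SIGPIPE on EPIPE.
 */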
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
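	/*
	 * (Sketch of the BSD ioctl command encoding, for reference.)  The
	 * low 16 bits of `com' carry the group and command number, the
	 * parameter size is encoded in the bits above them, and the top
	 * bits hold the IOC_IN/IOC_OUT/IOC_VOID direction flags.  For
	 * example, FIONBIO is _IOW('f', 126, int), so
	 * IOCPARM_LEN(FIONBIO) == sizeof(int).
	 */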
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}
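/*
 * Common backend for select(2) and pselect(2).  The fd_sets are copied
 * in, translated into kevents registered on the per-thread kqueue
 * (p->p_kq), collected via kqueue_scan() and converted back into result
 * bits before being copied out.
 */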
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nfiles, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);

	nfiles = READ_ONCE(p->p_fd->fd_nfiles);
	if (nd > nfiles)
		nd = nfiles;

	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
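	/*
	 * Note: &nowake below is a wait channel that nothing ever wakes;
	 * the tsleep can therefore only end on timeout expiry or signal
	 * delivery.  The same applies to the matching sleep in doppoll().
	 */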
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}
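/*
 * selwakeup() below is the hook drivers and subsystems invoke from their
 * event notification paths; submitting the knotes is what wakes
 * kqueue-based poll/select sleepers on this selinfo.
 */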
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	knote_locked(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select, only we encode/decode
 * the arguments differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}
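/*
 * ppoll(2): like poll(2) but takes a struct timespec timeout and may
 * atomically install a temporary signal mask for the duration of the
 * call (see the dosigsuspend() call in doppoll()).
 */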
int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}
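/*
 * Event mapping applied below (sketch of the code that follows):
 *
 *	POLLIN | POLLRDNORM	-> EVFILT_READ
 *	POLLOUT | POLLWRNORM	-> EVFILT_WRITE
 *	POLLPRI | POLLRDBAND	-> EVFILT_EXCEPT with NOTE_OOB
 *	no events requested	-> EVFILT_EXCEPT (POLLHUP reporting only)
 */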
/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be done even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

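	/*
	 * Note: following poll(2) semantics, POLLHUP, POLLERR and
	 * POLLNVAL may be reported in `revents' even when they were
	 * not requested in `events'.
	 */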
	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}