1 /* $OpenBSD: sys_generic.c,v 1.155 2023/02/25 09:55:46 mvs Exp $ */ 2 /* $NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $ */ 3 4 /* 5 * Copyright (c) 1996 Theo de Raadt 6 * Copyright (c) 1982, 1986, 1989, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/filedesc.h> 44 #include <sys/ioctl.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <sys/file.h> 48 #include <sys/proc.h> 49 #include <sys/resourcevar.h> 50 #include <sys/socketvar.h> 51 #include <sys/signalvar.h> 52 #include <sys/uio.h> 53 #include <sys/time.h> 54 #include <sys/malloc.h> 55 #include <sys/poll.h> 56 #include <sys/eventvar.h> 57 #ifdef KTRACE 58 #include <sys/ktrace.h> 59 #endif 60 #include <sys/pledge.h> 61 62 #include <sys/mount.h> 63 #include <sys/syscallargs.h> 64 65 /* 66 * Debug values: 67 * 1 - print implementation errors, things that should not happen. 68 * 2 - print ppoll(2) information, somewhat verbose 69 * 3 - print pselect(2) and ppoll(2) information, very verbose 70 */ 71 int kqpoll_debug = 0; 72 #define DPRINTFN(v, x...) if (kqpoll_debug > v) { \ 73 printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \ 74 printf(x); \ 75 } 76 77 int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *); 78 int pselcollect(struct proc *, struct kevent *, fd_set **, int *); 79 void ppollregister(struct proc *, struct pollfd *, int, int *, int *); 80 int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int); 81 82 int pollout(struct pollfd *, struct pollfd *, u_int); 83 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *, 84 struct timespec *, const sigset_t *, register_t *); 85 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *, 86 const sigset_t *, register_t *); 87 88 int 89 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov, 90 unsigned int iovcnt, size_t *residp) 91 { 92 #ifdef KTRACE 93 struct proc *p = curproc; 94 #endif 95 struct iovec *iov; 96 int error, i; 97 size_t resid = 0; 98 99 if (iovcnt > UIO_SMALLIOV) { 100 if (iovcnt > IOV_MAX) 101 return (EINVAL); 102 iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK); 103 } else if (iovcnt > 0) { 104 iov = aiov; 105 } else { 106 return (EINVAL); 107 } 108 *iovp = iov; 109 110 if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov)))) 111 return (error); 112 113 #ifdef KTRACE 114 if (KTRPOINT(p, KTR_STRUCT)) 115 ktriovec(p, iov, iovcnt); 116 #endif 117 118 for (i = 0; i < iovcnt; i++) { 119 resid += iov->iov_len; 120 /* 121 * Writes return ssize_t because -1 is returned on error. 122 * Therefore we must restrict the length to SSIZE_MAX to 123 * avoid garbage return values. Note that the addition is 124 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX. 125 */ 126 if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX) 127 return (EINVAL); 128 iov++; 129 } 130 131 if (residp != NULL) 132 *residp = resid; 133 134 return (0); 135 } 136 137 void 138 iovec_free(struct iovec *iov, unsigned int iovcnt) 139 { 140 if (iovcnt > UIO_SMALLIOV) 141 free(iov, M_IOV, iovcnt * sizeof(*iov)); 142 } 143 144 /* 145 * Read system call. 146 */ 147 int 148 sys_read(struct proc *p, void *v, register_t *retval) 149 { 150 struct sys_read_args /* { 151 syscallarg(int) fd; 152 syscallarg(void *) buf; 153 syscallarg(size_t) nbyte; 154 } */ *uap = v; 155 struct iovec iov; 156 struct uio auio; 157 158 iov.iov_base = SCARG(uap, buf); 159 iov.iov_len = SCARG(uap, nbyte); 160 if (iov.iov_len > SSIZE_MAX) 161 return (EINVAL); 162 163 auio.uio_iov = &iov; 164 auio.uio_iovcnt = 1; 165 auio.uio_resid = iov.iov_len; 166 167 return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval)); 168 } 169 170 /* 171 * Scatter read system call. 172 */ 173 int 174 sys_readv(struct proc *p, void *v, register_t *retval) 175 { 176 struct sys_readv_args /* { 177 syscallarg(int) fd; 178 syscallarg(const struct iovec *) iovp; 179 syscallarg(int) iovcnt; 180 } */ *uap = v; 181 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 182 int error, iovcnt = SCARG(uap, iovcnt); 183 struct uio auio; 184 size_t resid; 185 186 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid); 187 if (error) 188 goto done; 189 190 auio.uio_iov = iov; 191 auio.uio_iovcnt = iovcnt; 192 auio.uio_resid = resid; 193 194 error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval); 195 done: 196 iovec_free(iov, iovcnt); 197 return (error); 198 } 199 200 int 201 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags, 202 register_t *retval) 203 { 204 struct filedesc *fdp = p->p_fd; 205 struct file *fp; 206 long cnt, error = 0; 207 u_int iovlen; 208 #ifdef KTRACE 209 struct iovec *ktriov = NULL; 210 #endif 211 212 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0); 213 iovlen = uio->uio_iovcnt * sizeof(struct iovec); 214 215 if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL) 216 return (EBADF); 217 218 /* Checks for positioned read. */ 219 if (flags & FO_POSITION) { 220 struct vnode *vp = fp->f_data; 221 222 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO || 223 (vp->v_flag & VISTTY)) { 224 error = ESPIPE; 225 goto done; 226 } 227 228 if (uio->uio_offset < 0 && vp->v_type != VCHR) { 229 error = EINVAL; 230 goto done; 231 } 232 } 233 234 uio->uio_rw = UIO_READ; 235 uio->uio_segflg = UIO_USERSPACE; 236 uio->uio_procp = p; 237 #ifdef KTRACE 238 /* 239 * if tracing, save a copy of iovec 240 */ 241 if (KTRPOINT(p, KTR_GENIO)) { 242 ktriov = malloc(iovlen, M_TEMP, M_WAITOK); 243 memcpy(ktriov, uio->uio_iov, iovlen); 244 } 245 #endif 246 cnt = uio->uio_resid; 247 error = (*fp->f_ops->fo_read)(fp, uio, flags); 248 if (error) { 249 if (uio->uio_resid != cnt && (error == ERESTART || 250 error == EINTR || error == EWOULDBLOCK)) 251 error = 0; 252 } 253 cnt -= uio->uio_resid; 254 255 mtx_enter(&fp->f_mtx); 256 fp->f_rxfer++; 257 fp->f_rbytes += cnt; 258 mtx_leave(&fp->f_mtx); 259 #ifdef KTRACE 260 if (ktriov != NULL) { 261 if (error == 0) 262 ktrgenio(p, fd, UIO_READ, ktriov, cnt); 263 free(ktriov, M_TEMP, iovlen); 264 } 265 #endif 266 *retval = cnt; 267 done: 268 FRELE(fp, p); 269 return (error); 270 } 271 272 /* 273 * Write system call 274 */ 275 int 276 sys_write(struct proc *p, void *v, register_t *retval) 277 { 278 struct sys_write_args /* { 279 syscallarg(int) fd; 280 syscallarg(const void *) buf; 281 syscallarg(size_t) nbyte; 282 } */ *uap = v; 283 struct iovec iov; 284 struct uio auio; 285 286 iov.iov_base = (void *)SCARG(uap, buf); 287 iov.iov_len = SCARG(uap, nbyte); 288 if (iov.iov_len > SSIZE_MAX) 289 return (EINVAL); 290 291 auio.uio_iov = &iov; 292 auio.uio_iovcnt = 1; 293 auio.uio_resid = iov.iov_len; 294 295 return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval)); 296 } 297 298 /* 299 * Gather write system call 300 */ 301 int 302 sys_writev(struct proc *p, void *v, register_t *retval) 303 { 304 struct sys_writev_args /* { 305 syscallarg(int) fd; 306 syscallarg(const struct iovec *) iovp; 307 syscallarg(int) iovcnt; 308 } */ *uap = v; 309 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 310 int error, iovcnt = SCARG(uap, iovcnt); 311 struct uio auio; 312 size_t resid; 313 314 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid); 315 if (error) 316 goto done; 317 318 auio.uio_iov = iov; 319 auio.uio_iovcnt = iovcnt; 320 auio.uio_resid = resid; 321 322 error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval); 323 done: 324 iovec_free(iov, iovcnt); 325 return (error); 326 } 327 328 int 329 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags, 330 register_t *retval) 331 { 332 struct filedesc *fdp = p->p_fd; 333 struct file *fp; 334 long cnt, error = 0; 335 u_int iovlen; 336 #ifdef KTRACE 337 struct iovec *ktriov = NULL; 338 #endif 339 340 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0); 341 iovlen = uio->uio_iovcnt * sizeof(struct iovec); 342 343 if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL) 344 return (EBADF); 345 346 /* Checks for positioned write. */ 347 if (flags & FO_POSITION) { 348 struct vnode *vp = fp->f_data; 349 350 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO || 351 (vp->v_flag & VISTTY)) { 352 error = ESPIPE; 353 goto done; 354 } 355 356 if (uio->uio_offset < 0 && vp->v_type != VCHR) { 357 error = EINVAL; 358 goto done; 359 } 360 } 361 362 uio->uio_rw = UIO_WRITE; 363 uio->uio_segflg = UIO_USERSPACE; 364 uio->uio_procp = p; 365 #ifdef KTRACE 366 /* 367 * if tracing, save a copy of iovec 368 */ 369 if (KTRPOINT(p, KTR_GENIO)) { 370 ktriov = malloc(iovlen, M_TEMP, M_WAITOK); 371 memcpy(ktriov, uio->uio_iov, iovlen); 372 } 373 #endif 374 cnt = uio->uio_resid; 375 error = (*fp->f_ops->fo_write)(fp, uio, flags); 376 if (error) { 377 if (uio->uio_resid != cnt && (error == ERESTART || 378 error == EINTR || error == EWOULDBLOCK)) 379 error = 0; 380 if (error == EPIPE) { 381 KERNEL_LOCK(); 382 ptsignal(p, SIGPIPE, STHREAD); 383 KERNEL_UNLOCK(); 384 } 385 } 386 cnt -= uio->uio_resid; 387 388 mtx_enter(&fp->f_mtx); 389 fp->f_wxfer++; 390 fp->f_wbytes += cnt; 391 mtx_leave(&fp->f_mtx); 392 #ifdef KTRACE 393 if (ktriov != NULL) { 394 if (error == 0) 395 ktrgenio(p, fd, UIO_WRITE, ktriov, cnt); 396 free(ktriov, M_TEMP, iovlen); 397 } 398 #endif 399 *retval = cnt; 400 done: 401 FRELE(fp, p); 402 return (error); 403 } 404 405 /* 406 * Ioctl system call 407 */ 408 int 409 sys_ioctl(struct proc *p, void *v, register_t *retval) 410 { 411 struct sys_ioctl_args /* { 412 syscallarg(int) fd; 413 syscallarg(u_long) com; 414 syscallarg(void *) data; 415 } */ *uap = v; 416 struct file *fp; 417 struct filedesc *fdp = p->p_fd; 418 u_long com = SCARG(uap, com); 419 int error = 0; 420 u_int size = 0; 421 caddr_t data, memp = NULL; 422 int tmp; 423 #define STK_PARAMS 128 424 long long stkbuf[STK_PARAMS / sizeof(long long)]; 425 426 if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL) 427 return (EBADF); 428 429 if (fp->f_type == DTYPE_SOCKET) { 430 struct socket *so = fp->f_data; 431 432 if (so->so_state & SS_DNS) { 433 error = EINVAL; 434 goto out; 435 } 436 } 437 438 error = pledge_ioctl(p, com, fp); 439 if (error) 440 goto out; 441 442 switch (com) { 443 case FIONCLEX: 444 case FIOCLEX: 445 fdplock(fdp); 446 if (com == FIONCLEX) 447 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE; 448 else 449 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE; 450 fdpunlock(fdp); 451 goto out; 452 } 453 454 /* 455 * Interpret high order word to find amount of data to be 456 * copied to/from the user's address space. 457 */ 458 size = IOCPARM_LEN(com); 459 if (size > IOCPARM_MAX) { 460 error = ENOTTY; 461 goto out; 462 } 463 if (size > sizeof (stkbuf)) { 464 memp = malloc(size, M_IOCTLOPS, M_WAITOK); 465 data = memp; 466 } else 467 data = (caddr_t)stkbuf; 468 if (com&IOC_IN) { 469 if (size) { 470 error = copyin(SCARG(uap, data), data, size); 471 if (error) { 472 goto out; 473 } 474 } else 475 *(caddr_t *)data = SCARG(uap, data); 476 } else if ((com&IOC_OUT) && size) 477 /* 478 * Zero the buffer so the user always 479 * gets back something deterministic. 480 */ 481 memset(data, 0, size); 482 else if (com&IOC_VOID) 483 *(caddr_t *)data = SCARG(uap, data); 484 485 switch (com) { 486 487 case FIONBIO: 488 if ((tmp = *(int *)data) != 0) 489 atomic_setbits_int(&fp->f_flag, FNONBLOCK); 490 else 491 atomic_clearbits_int(&fp->f_flag, FNONBLOCK); 492 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); 493 break; 494 495 case FIOASYNC: 496 if ((tmp = *(int *)data) != 0) 497 atomic_setbits_int(&fp->f_flag, FASYNC); 498 else 499 atomic_clearbits_int(&fp->f_flag, FASYNC); 500 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); 501 break; 502 503 default: 504 error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); 505 break; 506 } 507 /* 508 * Copy any data to user, size was 509 * already set and checked above. 510 */ 511 if (error == 0 && (com&IOC_OUT) && size) 512 error = copyout(data, SCARG(uap, data), size); 513 out: 514 FRELE(fp, p); 515 free(memp, M_IOCTLOPS, size); 516 return (error); 517 } 518 519 /* 520 * Select system call. 521 */ 522 int 523 sys_select(struct proc *p, void *v, register_t *retval) 524 { 525 struct sys_select_args /* { 526 syscallarg(int) nd; 527 syscallarg(fd_set *) in; 528 syscallarg(fd_set *) ou; 529 syscallarg(fd_set *) ex; 530 syscallarg(struct timeval *) tv; 531 } */ *uap = v; 532 533 struct timespec ts, *tsp = NULL; 534 int error; 535 536 if (SCARG(uap, tv) != NULL) { 537 struct timeval tv; 538 if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0) 539 return (error); 540 #ifdef KTRACE 541 if (KTRPOINT(p, KTR_STRUCT)) 542 ktrreltimeval(p, &tv); 543 #endif 544 if (tv.tv_sec < 0 || !timerisvalid(&tv)) 545 return (EINVAL); 546 TIMEVAL_TO_TIMESPEC(&tv, &ts); 547 tsp = &ts; 548 } 549 550 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), 551 SCARG(uap, ex), tsp, NULL, retval)); 552 } 553 554 int 555 sys_pselect(struct proc *p, void *v, register_t *retval) 556 { 557 struct sys_pselect_args /* { 558 syscallarg(int) nd; 559 syscallarg(fd_set *) in; 560 syscallarg(fd_set *) ou; 561 syscallarg(fd_set *) ex; 562 syscallarg(const struct timespec *) ts; 563 syscallarg(const sigset_t *) mask; 564 } */ *uap = v; 565 566 struct timespec ts, *tsp = NULL; 567 sigset_t ss, *ssp = NULL; 568 int error; 569 570 if (SCARG(uap, ts) != NULL) { 571 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0) 572 return (error); 573 #ifdef KTRACE 574 if (KTRPOINT(p, KTR_STRUCT)) 575 ktrreltimespec(p, &ts); 576 #endif 577 if (ts.tv_sec < 0 || !timespecisvalid(&ts)) 578 return (EINVAL); 579 tsp = &ts; 580 } 581 if (SCARG(uap, mask) != NULL) { 582 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0) 583 return (error); 584 ssp = &ss; 585 } 586 587 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), 588 SCARG(uap, ex), tsp, ssp, retval)); 589 } 590 591 int 592 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex, 593 struct timespec *timeout, const sigset_t *sigmask, register_t *retval) 594 { 595 struct kqueue_scan_state scan; 596 struct timespec zerots = {}; 597 fd_mask bits[6]; 598 fd_set *pibits[3], *pobits[3]; 599 int error, nfiles, ncollected = 0, nevents = 0; 600 u_int ni; 601 602 if (nd < 0) 603 return (EINVAL); 604 605 nfiles = READ_ONCE(p->p_fd->fd_nfiles); 606 if (nd > nfiles) 607 nd = nfiles; 608 609 ni = howmany(nd, NFDBITS) * sizeof(fd_mask); 610 if (ni > sizeof(bits[0])) { 611 caddr_t mbits; 612 613 mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO); 614 pibits[0] = (fd_set *)&mbits[ni * 0]; 615 pibits[1] = (fd_set *)&mbits[ni * 1]; 616 pibits[2] = (fd_set *)&mbits[ni * 2]; 617 pobits[0] = (fd_set *)&mbits[ni * 3]; 618 pobits[1] = (fd_set *)&mbits[ni * 4]; 619 pobits[2] = (fd_set *)&mbits[ni * 5]; 620 } else { 621 memset(bits, 0, sizeof(bits)); 622 pibits[0] = (fd_set *)&bits[0]; 623 pibits[1] = (fd_set *)&bits[1]; 624 pibits[2] = (fd_set *)&bits[2]; 625 pobits[0] = (fd_set *)&bits[3]; 626 pobits[1] = (fd_set *)&bits[4]; 627 pobits[2] = (fd_set *)&bits[5]; 628 } 629 630 kqpoll_init(nd); 631 632 #define getbits(name, x) \ 633 if (name && (error = copyin(name, pibits[x], ni))) \ 634 goto done; 635 getbits(in, 0); 636 getbits(ou, 1); 637 getbits(ex, 2); 638 #undef getbits 639 #ifdef KTRACE 640 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) { 641 if (in) ktrfdset(p, pibits[0], ni); 642 if (ou) ktrfdset(p, pibits[1], ni); 643 if (ex) ktrfdset(p, pibits[2], ni); 644 } 645 #endif 646 647 if (sigmask) { 648 KERNEL_LOCK(); 649 dosigsuspend(p, *sigmask &~ sigcantmask); 650 KERNEL_UNLOCK(); 651 } 652 653 /* Register kqueue events */ 654 error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected); 655 if (error != 0) 656 goto done; 657 658 /* 659 * The poll/select family of syscalls has been designed to 660 * block when file descriptors are not available, even if 661 * there's nothing to wait for. 662 */ 663 if (nevents == 0 && ncollected == 0) { 664 uint64_t nsecs = INFSLP; 665 666 if (timeout != NULL) { 667 if (!timespecisset(timeout)) 668 goto done; 669 nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP)); 670 } 671 error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs); 672 /* select is not restarted after signals... */ 673 if (error == ERESTART) 674 error = EINTR; 675 if (error == EWOULDBLOCK) 676 error = 0; 677 goto done; 678 } 679 680 /* Do not block if registering found pending events. */ 681 if (ncollected > 0) 682 timeout = &zerots; 683 684 /* Collect at most `nevents' possibly waiting in kqueue_scan() */ 685 kqueue_scan_setup(&scan, p->p_kq); 686 while (nevents > 0) { 687 struct kevent kev[KQ_NEVENTS]; 688 int i, ready, count; 689 690 /* Maximum number of events per iteration */ 691 count = MIN(nitems(kev), nevents); 692 ready = kqueue_scan(&scan, count, kev, timeout, p, &error); 693 694 /* Convert back events that are ready. */ 695 for (i = 0; i < ready && error == 0; i++) 696 error = pselcollect(p, &kev[i], pobits, &ncollected); 697 /* 698 * Stop if there was an error or if we had enough 699 * space to collect all events that were ready. 700 */ 701 if (error || ready < count) 702 break; 703 704 nevents -= ready; 705 } 706 kqueue_scan_finish(&scan); 707 *retval = ncollected; 708 done: 709 #define putbits(name, x) \ 710 if (name && (error2 = copyout(pobits[x], name, ni))) \ 711 error = error2; 712 if (error == 0) { 713 int error2; 714 715 putbits(in, 0); 716 putbits(ou, 1); 717 putbits(ex, 2); 718 #undef putbits 719 #ifdef KTRACE 720 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) { 721 if (in) ktrfdset(p, pobits[0], ni); 722 if (ou) ktrfdset(p, pobits[1], ni); 723 if (ex) ktrfdset(p, pobits[2], ni); 724 } 725 #endif 726 } 727 728 if (pibits[0] != (fd_set *)&bits[0]) 729 free(pibits[0], M_TEMP, 6 * ni); 730 731 kqpoll_done(nd); 732 733 return (error); 734 } 735 736 /* 737 * Convert fd_set into kqueue events and register them on the 738 * per-thread queue. 739 */ 740 int 741 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd, 742 int *nregistered, int *ncollected) 743 { 744 static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT }; 745 static const int evff[] = { 0, 0, NOTE_OOB }; 746 int msk, i, j, fd, nevents = 0, error = 0; 747 struct kevent kev; 748 fd_mask bits; 749 750 for (msk = 0; msk < 3; msk++) { 751 for (i = 0; i < nfd; i += NFDBITS) { 752 bits = pibits[msk]->fds_bits[i / NFDBITS]; 753 while ((j = ffs(bits)) && (fd = i + --j) < nfd) { 754 bits &= ~(1 << j); 755 756 DPRINTFN(2, "select fd %d mask %d serial %lu\n", 757 fd, msk, p->p_kq_serial); 758 EV_SET(&kev, fd, evf[msk], 759 EV_ADD|EV_ENABLE|__EV_SELECT, 760 evff[msk], 0, (void *)(p->p_kq_serial)); 761 error = kqueue_register(p->p_kq, &kev, 0, p); 762 switch (error) { 763 case 0: 764 nevents++; 765 /* FALLTHROUGH */ 766 case EOPNOTSUPP:/* No underlying kqfilter */ 767 case EINVAL: /* Unimplemented filter */ 768 case EPERM: /* Specific to FIFO and 769 * __EV_SELECT */ 770 error = 0; 771 break; 772 case EPIPE: /* Specific to pipes */ 773 KASSERT(kev.filter == EVFILT_WRITE); 774 FD_SET(kev.ident, pobits[1]); 775 (*ncollected)++; 776 error = 0; 777 break; 778 case ENXIO: /* Device has been detached */ 779 default: 780 goto bad; 781 } 782 } 783 } 784 } 785 786 *nregistered = nevents; 787 return (0); 788 bad: 789 DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident, 790 kev.filter, error); 791 return (error); 792 } 793 794 /* 795 * Convert given kqueue event into corresponding select(2) bit. 796 */ 797 int 798 pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3], 799 int *ncollected) 800 { 801 if ((unsigned long)kevp->udata != p->p_kq_serial) { 802 panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx", 803 __func__, kevp, (int)kevp->ident, 804 (unsigned long)kevp->udata, p->p_kq_serial); 805 } 806 807 if (kevp->flags & EV_ERROR) { 808 DPRINTFN(2, "select fd %d filt %d error %d\n", 809 (int)kevp->ident, kevp->filter, (int)kevp->data); 810 return (kevp->data); 811 } 812 813 switch (kevp->filter) { 814 case EVFILT_READ: 815 FD_SET(kevp->ident, pobits[0]); 816 break; 817 case EVFILT_WRITE: 818 FD_SET(kevp->ident, pobits[1]); 819 break; 820 case EVFILT_EXCEPT: 821 FD_SET(kevp->ident, pobits[2]); 822 break; 823 default: 824 KASSERT(0); 825 } 826 (*ncollected)++; 827 828 DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter); 829 return (0); 830 } 831 832 /* 833 * Do a wakeup when a selectable event occurs. 834 */ 835 void 836 selwakeup(struct selinfo *sip) 837 { 838 KERNEL_LOCK(); 839 knote_locked(&sip->si_note, NOTE_SUBMIT); 840 KERNEL_UNLOCK(); 841 } 842 843 /* 844 * Only copyout the revents field. 845 */ 846 int 847 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds) 848 { 849 int error = 0; 850 u_int i = 0; 851 852 while (!error && i++ < nfds) { 853 error = copyout(&pl->revents, &upl->revents, 854 sizeof(upl->revents)); 855 pl++; 856 upl++; 857 } 858 859 return (error); 860 } 861 862 /* 863 * We are using the same mechanism as select only we encode/decode args 864 * differently. 865 */ 866 int 867 sys_poll(struct proc *p, void *v, register_t *retval) 868 { 869 struct sys_poll_args /* { 870 syscallarg(struct pollfd *) fds; 871 syscallarg(u_int) nfds; 872 syscallarg(int) timeout; 873 } */ *uap = v; 874 875 struct timespec ts, *tsp = NULL; 876 int msec = SCARG(uap, timeout); 877 878 if (msec != INFTIM) { 879 if (msec < 0) 880 return (EINVAL); 881 ts.tv_sec = msec / 1000; 882 ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000; 883 tsp = &ts; 884 } 885 886 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL, 887 retval)); 888 } 889 890 int 891 sys_ppoll(struct proc *p, void *v, register_t *retval) 892 { 893 struct sys_ppoll_args /* { 894 syscallarg(struct pollfd *) fds; 895 syscallarg(u_int) nfds; 896 syscallarg(const struct timespec *) ts; 897 syscallarg(const sigset_t *) mask; 898 } */ *uap = v; 899 900 int error; 901 struct timespec ts, *tsp = NULL; 902 sigset_t ss, *ssp = NULL; 903 904 if (SCARG(uap, ts) != NULL) { 905 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0) 906 return (error); 907 #ifdef KTRACE 908 if (KTRPOINT(p, KTR_STRUCT)) 909 ktrreltimespec(p, &ts); 910 #endif 911 if (ts.tv_sec < 0 || !timespecisvalid(&ts)) 912 return (EINVAL); 913 tsp = &ts; 914 } 915 916 if (SCARG(uap, mask) != NULL) { 917 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0) 918 return (error); 919 ssp = &ss; 920 } 921 922 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp, 923 retval)); 924 } 925 926 int 927 doppoll(struct proc *p, struct pollfd *fds, u_int nfds, 928 struct timespec *timeout, const sigset_t *sigmask, register_t *retval) 929 { 930 struct kqueue_scan_state scan; 931 struct timespec zerots = {}; 932 struct pollfd pfds[4], *pl = pfds; 933 int error, ncollected = 0, nevents = 0; 934 size_t sz; 935 936 /* Standards say no more than MAX_OPEN; this is possibly better. */ 937 if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles)) 938 return (EINVAL); 939 940 /* optimize for the default case, of a small nfds value */ 941 if (nfds > nitems(pfds)) { 942 pl = mallocarray(nfds, sizeof(*pl), M_TEMP, 943 M_WAITOK | M_CANFAIL); 944 if (pl == NULL) 945 return (EINVAL); 946 } 947 948 kqpoll_init(nfds); 949 950 sz = nfds * sizeof(*pl); 951 952 if ((error = copyin(fds, pl, sz)) != 0) 953 goto bad; 954 955 if (sigmask) { 956 KERNEL_LOCK(); 957 dosigsuspend(p, *sigmask &~ sigcantmask); 958 KERNEL_UNLOCK(); 959 } 960 961 /* Register kqueue events */ 962 ppollregister(p, pl, nfds, &nevents, &ncollected); 963 964 /* 965 * The poll/select family of syscalls has been designed to 966 * block when file descriptors are not available, even if 967 * there's nothing to wait for. 968 */ 969 if (nevents == 0 && ncollected == 0) { 970 uint64_t nsecs = INFSLP; 971 972 if (timeout != NULL) { 973 if (!timespecisset(timeout)) 974 goto done; 975 nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP)); 976 } 977 978 error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs); 979 if (error == ERESTART) 980 error = EINTR; 981 if (error == EWOULDBLOCK) 982 error = 0; 983 goto done; 984 } 985 986 /* Do not block if registering found pending events. */ 987 if (ncollected > 0) 988 timeout = &zerots; 989 990 /* Collect at most `nevents' possibly waiting in kqueue_scan() */ 991 kqueue_scan_setup(&scan, p->p_kq); 992 while (nevents > 0) { 993 struct kevent kev[KQ_NEVENTS]; 994 int i, ready, count; 995 996 /* Maximum number of events per iteration */ 997 count = MIN(nitems(kev), nevents); 998 ready = kqueue_scan(&scan, count, kev, timeout, p, &error); 999 1000 /* Convert back events that are ready. */ 1001 for (i = 0; i < ready; i++) 1002 ncollected += ppollcollect(p, &kev[i], pl, nfds); 1003 1004 /* 1005 * Stop if there was an error or if we had enough 1006 * place to collect all events that were ready. 1007 */ 1008 if (error || ready < count) 1009 break; 1010 1011 nevents -= ready; 1012 } 1013 kqueue_scan_finish(&scan); 1014 *retval = ncollected; 1015 done: 1016 /* 1017 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is 1018 * ignored (since the whole point is to see what would block). 1019 */ 1020 switch (error) { 1021 case EINTR: 1022 error = pollout(pl, fds, nfds); 1023 if (error == 0) 1024 error = EINTR; 1025 break; 1026 case EWOULDBLOCK: 1027 case 0: 1028 error = pollout(pl, fds, nfds); 1029 break; 1030 } 1031 #ifdef KTRACE 1032 if (KTRPOINT(p, KTR_STRUCT)) 1033 ktrpollfd(p, pl, nfds); 1034 #endif /* KTRACE */ 1035 bad: 1036 if (pl != pfds) 1037 free(pl, M_TEMP, sz); 1038 1039 kqpoll_done(nfds); 1040 1041 return (error); 1042 } 1043 1044 int 1045 ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev, 1046 struct pollfd *pl, unsigned int pollid) 1047 { 1048 int i, error, nevents = 0; 1049 1050 KASSERT(pl->revents == 0); 1051 1052 for (i = 0; i < nkev; i++, kevp++) { 1053 again: 1054 error = kqueue_register(p->p_kq, kevp, pollid, p); 1055 switch (error) { 1056 case 0: 1057 nevents++; 1058 break; 1059 case EOPNOTSUPP:/* No underlying kqfilter */ 1060 case EINVAL: /* Unimplemented filter */ 1061 break; 1062 case EBADF: /* Bad file descriptor */ 1063 pl->revents |= POLLNVAL; 1064 break; 1065 case EPERM: /* Specific to FIFO */ 1066 KASSERT(kevp->filter == EVFILT_WRITE); 1067 if (nkev == 1) { 1068 /* 1069 * If this is the only filter make sure 1070 * POLLHUP is passed to userland. 1071 */ 1072 kevp->filter = EVFILT_EXCEPT; 1073 goto again; 1074 } 1075 break; 1076 case EPIPE: /* Specific to pipes */ 1077 KASSERT(kevp->filter == EVFILT_WRITE); 1078 pl->revents |= POLLHUP; 1079 break; 1080 default: 1081 DPRINTFN(0, "poll err %lu fd %d revents %02x serial" 1082 " %lu filt %d ERROR=%d\n", 1083 ((unsigned long)kevp->udata - p->p_kq_serial), 1084 pl->fd, pl->revents, p->p_kq_serial, kevp->filter, 1085 error); 1086 /* FALLTHROUGH */ 1087 case ENXIO: /* Device has been detached */ 1088 pl->revents |= POLLERR; 1089 break; 1090 } 1091 } 1092 1093 return (nevents); 1094 } 1095 1096 /* 1097 * Convert pollfd into kqueue events and register them on the 1098 * per-thread queue. 1099 * 1100 * At most 3 events can correspond to a single pollfd. 1101 */ 1102 void 1103 ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered, 1104 int *ncollected) 1105 { 1106 int i, nkev, nevt, forcehup; 1107 struct kevent kev[3], *kevp; 1108 1109 for (i = 0; i < nfds; i++) { 1110 pl[i].events &= ~POLL_NOHUP; 1111 pl[i].revents = 0; 1112 1113 if (pl[i].fd < 0) 1114 continue; 1115 1116 /* 1117 * POLLHUP checking is implicit in the event filters. 1118 * However, the checking must be even if no events are 1119 * requested. 1120 */ 1121 forcehup = ((pl[i].events & ~POLLHUP) == 0); 1122 1123 DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n", 1124 i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial); 1125 1126 nevt = 0; 1127 nkev = 0; 1128 kevp = kev; 1129 if (pl[i].events & (POLLIN | POLLRDNORM)) { 1130 EV_SET(kevp, pl[i].fd, EVFILT_READ, 1131 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0, 1132 (void *)(p->p_kq_serial + i)); 1133 nkev++; 1134 kevp++; 1135 } 1136 if (pl[i].events & (POLLOUT | POLLWRNORM)) { 1137 EV_SET(kevp, pl[i].fd, EVFILT_WRITE, 1138 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0, 1139 (void *)(p->p_kq_serial + i)); 1140 nkev++; 1141 kevp++; 1142 } 1143 if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) { 1144 int evff = forcehup ? 0 : NOTE_OOB; 1145 1146 EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT, 1147 EV_ADD|EV_ENABLE|__EV_POLL, evff, 0, 1148 (void *)(p->p_kq_serial + i)); 1149 nkev++; 1150 kevp++; 1151 } 1152 1153 if (nkev == 0) 1154 continue; 1155 1156 *nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i); 1157 1158 if (pl[i].revents != 0) 1159 (*ncollected)++; 1160 } 1161 1162 DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered, 1163 *ncollected); 1164 } 1165 1166 /* 1167 * Convert given kqueue event into corresponding poll(2) revents bit. 1168 */ 1169 int 1170 ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds) 1171 { 1172 static struct timeval poll_errintvl = { 5, 0 }; 1173 static struct timeval poll_lasterr; 1174 int already_seen; 1175 unsigned long i; 1176 1177 /* Extract poll array index */ 1178 i = (unsigned long)kevp->udata - p->p_kq_serial; 1179 1180 if (i >= nfds) { 1181 panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx", 1182 __func__, kevp, nfds, 1183 (unsigned long)kevp->udata, p->p_kq_serial); 1184 } 1185 if ((int)kevp->ident != pl[i].fd) { 1186 panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx", 1187 __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd, 1188 p->p_kq_serial); 1189 } 1190 1191 /* 1192 * A given descriptor may already have generated an error 1193 * against another filter during kqueue_register(). 1194 * 1195 * Make sure to set the appropriate flags but do not 1196 * increment `*retval' more than once. 1197 */ 1198 already_seen = (pl[i].revents != 0); 1199 1200 /* POLLNVAL preempts other events. */ 1201 if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) { 1202 pl[i].revents = POLLNVAL; 1203 goto done; 1204 } else if (pl[i].revents & POLLNVAL) { 1205 goto done; 1206 } 1207 1208 switch (kevp->filter) { 1209 case EVFILT_READ: 1210 if (kevp->flags & __EV_HUP) 1211 pl[i].revents |= POLLHUP; 1212 if (pl[i].events & (POLLIN | POLLRDNORM)) 1213 pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM); 1214 break; 1215 case EVFILT_WRITE: 1216 /* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */ 1217 if (kevp->flags & __EV_HUP) { 1218 pl[i].revents |= POLLHUP; 1219 } else if (pl[i].events & (POLLOUT | POLLWRNORM)) { 1220 pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM); 1221 } 1222 break; 1223 case EVFILT_EXCEPT: 1224 if (kevp->flags & __EV_HUP) { 1225 if (pl[i].events != 0 && pl[i].events != POLLOUT) 1226 DPRINTFN(0, "weird events %x\n", pl[i].events); 1227 pl[i].revents |= POLLHUP; 1228 break; 1229 } 1230 if (pl[i].events & (POLLPRI | POLLRDBAND)) 1231 pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND); 1232 break; 1233 default: 1234 KASSERT(0); 1235 } 1236 1237 done: 1238 DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n", 1239 i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata, 1240 kevp->filter); 1241 1242 /* 1243 * Make noise about unclaimed events as they might indicate a bug 1244 * and can result in spurious-looking wakeups of poll(2). 1245 * 1246 * Live-locking within the system call should not happen because 1247 * the scan loop in doppoll() has an upper limit for the number 1248 * of events to process. 1249 */ 1250 if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) { 1251 printf("%s[%d]: poll index %lu fd %d events 0x%x " 1252 "filter %d/0x%x unclaimed\n", 1253 p->p_p->ps_comm, p->p_tid, i, pl[i].fd, 1254 pl[i].events, kevp->filter, kevp->flags); 1255 } 1256 1257 if (!already_seen && (pl[i].revents != 0)) 1258 return (1); 1259 1260 return (0); 1261 } 1262 1263 /* 1264 * utrace system call 1265 */ 1266 int 1267 sys_utrace(struct proc *curp, void *v, register_t *retval) 1268 { 1269 #ifdef KTRACE 1270 struct sys_utrace_args /* { 1271 syscallarg(const char *) label; 1272 syscallarg(const void *) addr; 1273 syscallarg(size_t) len; 1274 } */ *uap = v; 1275 1276 return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr), 1277 SCARG(uap, len))); 1278 #else 1279 return (0); 1280 #endif 1281 } 1282