/*	$OpenBSD: sys_generic.c,v 1.148 2022/07/05 15:06:16 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
/*
 * NOTE: expands to a bare if-block, not a do { } while (0); do not use it
 * as the body of an un-braced if/else or the dangling else will bind here.
 */
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);

/*
 * Copy in and validate a user iovec array of `iovcnt' elements.
 * Small arrays (<= UIO_SMALLIOV) use the caller-supplied `aiov' buffer,
 * larger ones (up to IOV_MAX) are allocated from M_IOV; either way the
 * resulting pointer is stored in *iovp and must be released with
 * iovec_free().  The summed length is bounded by SSIZE_MAX and returned
 * via *residp when non-NULL.
 */
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	/* Publish before copyin so the caller can free on error. */
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

/*
 * Release an iovec array obtained from iovec_copyin().  A no-op for
 * small arrays that lived in the caller's stack buffer.
 */
void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

/*
 * Common code for read(2)/readv(2) and their positioned variants:
 * look up the file, validate a positioned read (FO_POSITION), perform
 * the transfer via fo_read and account the bytes moved.  On success
 * *retval holds the number of bytes read.
 */
int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		/* Only plain vnodes support an offset; pipes/ttys do not. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		/* A partial transfer before interruption still succeeds. */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

/*
 * Common code for write(2)/writev(2) and their positioned variants;
 * mirrors dofilereadv() but writes via fo_write and posts SIGPIPE to
 * the thread on EPIPE.  On success *retval holds the bytes written.
 */
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		/* Only plain vnodes support an offset; pipes/ttys do not. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		/* A partial transfer before interruption still succeeds. */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		/* DNS sockets handed out by pledge may not be ioctl'ed. */
		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	/* Close-on-exec flags live in the descriptor table, not the file. */
	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

/*
 * pselect(2): like select(2) but takes a timespec timeout and a signal
 * mask that is installed for the duration of the wait.
 */
int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

/*
 * Common code for select(2)/pselect(2): copy in the fd sets, register
 * them as kqueue events on the per-thread queue, wait via kqueue_scan()
 * and convert ready events back into output fd sets.  *retval is the
 * number of ready descriptors collected.
 */
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		/* One allocation carries all six in/out fd sets. */
		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
				if (KTRPOINT(p, KTR_STRUCT))
					ktrevent(p, &kev, 1);
#endif
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case EPIPE:	/* Specific to pipes */
					KASSERT(kev.filter == EVFILT_WRITE);
					/* Writing a closed pipe is ready. */
					FD_SET(kev.ident, pobits[1]);
					(*ncollected)++;
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

/*
 * ppoll(2): like poll(2) but takes a timespec timeout and a signal
 * mask that is installed for the duration of the wait.
 */
int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

/*
 * Common code for poll(2)/ppoll(2): copy in the pollfd array, register
 * kqueue events for the requested conditions, wait via kqueue_scan() and
 * convert ready events back into revents bits, copying only the revents
 * fields back out.  *retval is the number of descriptors with events.
 */
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * place to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

/*
 * Register the kevents built for one pollfd entry and map registration
 * errors to the appropriate revents bits (POLLNVAL/POLLHUP/POLLERR).
 * Returns the number of events successfully registered.
 */
int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrevent(p, kevp, nkev);
#endif
	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		case EPIPE:	/* Specific to pipes */
			KASSERT(kevp->filter == EVFILT_WRITE);
			pl->revents |= POLLHUP;
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be done even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}