/*	$OpenBSD: sys_generic.c,v 1.147 2022/02/08 08:56:41 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}

int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);

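/*
 * Copy an iovec array in from user space and validate it: small arrays
 * reuse the caller-supplied `aiov' buffer, larger ones (up to IOV_MAX)
 * are allocated with mallocarray() and must be released with
 * iovec_free().  The total transfer size is accumulated in *residp and
 * limited to SSIZE_MAX.
 */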
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

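/*
 * Backend shared by the read family of system calls: look up the
 * descriptor, validate any positioned (FO_POSITION) request, run the
 * transfer through the file's fo_read routine and account the bytes
 * that were read.
 */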
int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

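/*
 * Backend shared by the write family of system calls; it mirrors
 * dofilereadv() and additionally posts SIGPIPE to the writing thread
 * when the file operation reports EPIPE.
 */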
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
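	/*
	 * IOC_IN commands have their argument copied in before the call
	 * (or the raw pointer passed through when the size is zero),
	 * IOC_OUT commands start from a zeroed buffer that is copied back
	 * out on success, and IOC_VOID commands pass the user pointer
	 * through unchanged.
	 */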
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

int selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

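/*
 * Common backend for select(2) and pselect(2): copy in the fd_sets,
 * register the corresponding events on the per-thread kqueue, wait for
 * them with kqueue_scan() and copy the results back out.
 */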
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
				if (KTRPOINT(p, KTR_STRUCT))
					ktrevent(p, &kev, 1);
#endif
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case EPIPE:	/* Specific to pipes */
					KASSERT(kev.filter == EVFILT_WRITE);
					FD_SET(kev.ident, pobits[1]);
					(*ncollected)++;
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}

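/*
 * Generic poll backends for drivers: seltrue() reports the descriptor
 * as ready for normal read and write, selfalse() never reports it as
 * ready.
 */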
int
seltrue(dev_t dev, int events, struct proc *p)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
selfalse(dev_t dev, int events, struct proc *p)
{

	return (0);
}

/*
 * Record a select request.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mytid;

	KERNEL_ASSERT_LOCKED();

	mytid = selector->p_tid;
	if (sip->si_seltid == mytid)
		return;
	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_seltid = mytid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}

void
doselwakeup(struct selinfo *sip)
{
	struct proc *p;

	KERNEL_ASSERT_LOCKED();

	if (sip->si_seltid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = tfind(sip->si_seltid);
	sip->si_seltid = 0;
	if (p != NULL) {
		if (wakeup_proc(p, &selwait)) {
			/* nothing else to do */
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
	}
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

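/*
 * Common backend for poll(2) and ppoll(2): copy in the pollfd array,
 * register the corresponding events on the per-thread kqueue, wait for
 * them with kqueue_scan() and copy the resulting revents back out.
 */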
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

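/*
 * Register the kevents prepared for a single pollfd on the per-thread
 * kqueue.  Registration errors are translated into revents bits
 * (POLLNVAL, POLLHUP, POLLERR) where appropriate.  Returns the number
 * of filters that were successfully registered.
 */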
int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrevent(p, kevp, nkev);
#endif
	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		case EPIPE:	/* Specific to pipes */
			KASSERT(kevp->filter == EVFILT_WRITE);
			pl->revents |= POLLHUP;
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be done even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}