/*	$OpenBSD: sys_generic.c,v 1.142 2021/11/22 14:59:03 visa Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
int kqpoll_debug = 0;
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}
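
/*
 * For example, DPRINTFN(1, "fd %d\n", fd) prints its message, prefixed
 * with the current process name and thread ID, only when kqpoll_debug
 * is set to 2 or higher.
 */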

int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
int ppollregister(struct proc *, struct pollfd *, int, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);
void doselwakeup(struct selinfo *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}
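
/*
 * iovec_copyin() and iovec_free() are meant to be used as a pair, with
 * the caller providing a stack array of UIO_SMALLIOV entries that is
 * used for small requests while larger counts fall back to
 * mallocarray().  See sys_readv() below for the canonical calling
 * sequence:
 *
 *	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
 *
 *	error = iovec_copyin(uiovp, &iov, aiov, iovcnt, &resid);
 *	...
 *	iovec_free(iov, iovcnt);
 */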

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
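
/*
 * Note that dofilereadv() (and dofilewritev() below) deliberately hides
 * ERESTART, EINTR and EWOULDBLOCK once some bytes have been transferred:
 * the caller gets the partial transfer count instead of an error, as
 * POSIX expects for interrupted reads and writes.
 */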

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
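
/*
 * The write side additionally implements SIGPIPE delivery: when
 * fo_write() fails with EPIPE, the writing thread is signaled (under
 * the kernel lock) before the error is returned, preserving the
 * historic behavior of write(2) on a closed pipe or socket.
 */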

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}
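
/*
 * For example, FIONREAD is encoded as _IOR('f', 127, int): IOCPARM_LEN()
 * extracts sizeof(int) from the high bits, the IOC_OUT branch zeroes an
 * int-sized buffer before calling fo_ioctl(), and the result is copied
 * back out to the user's data pointer on success.
 */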

int selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}
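
/*
 * A side effect of the "block even with nothing to wait for" rule above
 * is that select(2) doubles as a sub-second sleep: the classic
 * select(0, NULL, NULL, NULL, &tv) idiom ends up in the empty-set case
 * and simply tsleeps for the given timeout.
 */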

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_POLL,
				    evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
				if (KTRPOINT(p, KTR_STRUCT))
					ktrevent(p, &kev, 1);
#endif
				error = kqueue_register(p->p_kq, &kev, p);
				switch (error) {
				case 0:
					nevents++;
					/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO */
					error = 0;
					break;
				case EPIPE:	/* Specific to pipes */
					KASSERT(kev.filter == EVFILT_WRITE);
					FD_SET(kev.ident, pobits[1]);
					(*ncollected)++;
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}
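
/*
 * seltrue() and selfalse() are generic poll backends: drivers whose
 * descriptors are always ready for normal I/O typically point their
 * poll entry at seltrue(), while selfalse() serves those that never
 * are.
 */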

int
seltrue(dev_t dev, int events, struct proc *p)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
selfalse(dev_t dev, int events, struct proc *p)
{
	return (0);
}

/*
 * Record a select request.
 */
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mytid;

	KERNEL_ASSERT_LOCKED();

	mytid = selector->p_tid;
	if (sip->si_seltid == mytid)
		return;
	if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_seltid = mytid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	KNOTE(&sip->si_note, NOTE_SUBMIT);
	doselwakeup(sip);
	KERNEL_UNLOCK();
}

void
doselwakeup(struct selinfo *sip)
{
	struct proc *p;

	KERNEL_ASSERT_LOCKED();

	if (sip->si_seltid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = tfind(sip->si_seltid);
	sip->si_seltid = 0;
	if (p != NULL) {
		if (wakeup_proc(p, &selwait)) {
			/* nothing else to do */
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
	}
}
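
/*
 * selrecord() and doselwakeup() implement the historic single-waiter
 * scheme: a selinfo remembers one sleeping thread by its TID and flags
 * any additional waiters as a collision (SI_COLL), in which case every
 * thread sleeping on the shared selwait channel is woken and must
 * rescan.
 */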

int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrevent(p, kevp, nkev);
#endif
	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		case EPIPE:	/* Specific to pipes */
			KASSERT(kevp->filter == EVFILT_WRITE);
			pl->revents |= POLLHUP;
			break;
		default:
#ifdef DIAGNOSTIC
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
#endif
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * Return the number of pollfd that triggered at least one error and aren't
 * completely monitored.  These pollfd should have the corresponding error
 * bit set in `revents'.
 *
 * At most 3 events can correspond to a single pollfd.
 */
int
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered)
{
	int i, nkev, nevt, errcount = 0, forcehup = 0;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		if (pl[i].events == 0)
			forcehup = 1;

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		nevt = ppollregister_evts(p, kev, nkev, &pl[i]);
		if (nevt == 0 && !forcehup)
			errcount++;
		*nregistered += nevt;
	}

#ifdef DIAGNOSTIC
	DPRINTFN(1, "poll registered = %d, errors = %d\n", *nregistered,
	    errcount);
#endif
	return (errcount);
}
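
/*
 * A single pollfd may thus expand into up to three kevents; e.g.
 * events = POLLIN|POLLOUT|POLLPRI registers EVFILT_READ, EVFILT_WRITE
 * and EVFILT_EXCEPT (with NOTE_OOB).  All of them carry the same udata
 * serial, p->p_kq_serial + i, which is how ppollcollect() maps an event
 * back to its slot in the pollfd array.
 */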

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select, only we encode/decode args
 * differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}
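
/*
 * The millisecond timeout is converted above; e.g. poll(fds, nfds, 1500)
 * yields a timespec of { 1, 500000000 }, while INFTIM (-1) leaves tsp
 * NULL so that doppoll() blocks indefinitely.
 */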

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ncollected = ppollregister(p, pl, nfds, &nevents);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * count the same pollfd more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
#ifdef DIAGNOSTIC
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
#endif
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);
	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}