1 /* $OpenBSD: sys_generic.c,v 1.146 2021/12/11 09:28:26 visa Exp $ */ 2 /* $NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $ */ 3 4 /* 5 * Copyright (c) 1996 Theo de Raadt 6 * Copyright (c) 1982, 1986, 1989, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/filedesc.h> 44 #include <sys/ioctl.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <sys/file.h> 48 #include <sys/proc.h> 49 #include <sys/resourcevar.h> 50 #include <sys/socketvar.h> 51 #include <sys/signalvar.h> 52 #include <sys/uio.h> 53 #include <sys/kernel.h> 54 #include <sys/stat.h> 55 #include <sys/time.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/eventvar.h> 59 #ifdef KTRACE 60 #include <sys/ktrace.h> 61 #endif 62 #include <sys/sched.h> 63 #include <sys/pledge.h> 64 65 #include <sys/mount.h> 66 #include <sys/syscallargs.h> 67 68 #include <uvm/uvm_extern.h> 69 70 /* 71 * Debug values: 72 * 1 - print implementation errors, things that should not happen. 73 * 2 - print ppoll(2) information, somewhat verbose 74 * 3 - print pselect(2) and ppoll(2) information, very verbose 75 */ 76 int kqpoll_debug = 0; 77 #define DPRINTFN(v, x...) if (kqpoll_debug > v) { \ 78 printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \ 79 printf(x); \ 80 } 81 82 int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *); 83 int pselcollect(struct proc *, struct kevent *, fd_set *[], int *); 84 85 void pollscan(struct proc *, struct pollfd *, u_int, register_t *); 86 int pollout(struct pollfd *, struct pollfd *, u_int); 87 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *, 88 struct timespec *, const sigset_t *, register_t *); 89 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *, 90 const sigset_t *, register_t *); 91 void doselwakeup(struct selinfo *); 92 93 int 94 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov, 95 unsigned int iovcnt, size_t *residp) 96 { 97 #ifdef KTRACE 98 struct proc *p = curproc; 99 #endif 100 struct iovec *iov; 101 int error, i; 102 size_t resid = 0; 103 104 if (iovcnt > UIO_SMALLIOV) { 105 if (iovcnt > IOV_MAX) 106 return (EINVAL); 107 iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK); 108 } else if (iovcnt > 0) { 109 iov = aiov; 110 } else { 111 return (EINVAL); 112 } 113 *iovp = iov; 114 115 if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov)))) 116 return (error); 117 118 #ifdef KTRACE 119 if (KTRPOINT(p, KTR_STRUCT)) 120 ktriovec(p, iov, iovcnt); 121 #endif 122 123 for (i = 0; i < iovcnt; i++) { 124 resid += iov->iov_len; 125 /* 126 * Writes return ssize_t because -1 is returned on error. 127 * Therefore we must restrict the length to SSIZE_MAX to 128 * avoid garbage return values. Note that the addition is 129 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX. 130 */ 131 if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX) 132 return (EINVAL); 133 iov++; 134 } 135 136 if (residp != NULL) 137 *residp = resid; 138 139 return (0); 140 } 141 142 void 143 iovec_free(struct iovec *iov, unsigned int iovcnt) 144 { 145 if (iovcnt > UIO_SMALLIOV) 146 free(iov, M_IOV, iovcnt * sizeof(*iov)); 147 } 148 149 /* 150 * Read system call. 151 */ 152 int 153 sys_read(struct proc *p, void *v, register_t *retval) 154 { 155 struct sys_read_args /* { 156 syscallarg(int) fd; 157 syscallarg(void *) buf; 158 syscallarg(size_t) nbyte; 159 } */ *uap = v; 160 struct iovec iov; 161 struct uio auio; 162 163 iov.iov_base = SCARG(uap, buf); 164 iov.iov_len = SCARG(uap, nbyte); 165 if (iov.iov_len > SSIZE_MAX) 166 return (EINVAL); 167 168 auio.uio_iov = &iov; 169 auio.uio_iovcnt = 1; 170 auio.uio_resid = iov.iov_len; 171 172 return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval)); 173 } 174 175 /* 176 * Scatter read system call. 177 */ 178 int 179 sys_readv(struct proc *p, void *v, register_t *retval) 180 { 181 struct sys_readv_args /* { 182 syscallarg(int) fd; 183 syscallarg(const struct iovec *) iovp; 184 syscallarg(int) iovcnt; 185 } */ *uap = v; 186 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 187 int error, iovcnt = SCARG(uap, iovcnt); 188 struct uio auio; 189 size_t resid; 190 191 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid); 192 if (error) 193 goto done; 194 195 auio.uio_iov = iov; 196 auio.uio_iovcnt = iovcnt; 197 auio.uio_resid = resid; 198 199 error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval); 200 done: 201 iovec_free(iov, iovcnt); 202 return (error); 203 } 204 205 int 206 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags, 207 register_t *retval) 208 { 209 struct filedesc *fdp = p->p_fd; 210 struct file *fp; 211 long cnt, error = 0; 212 u_int iovlen; 213 #ifdef KTRACE 214 struct iovec *ktriov = NULL; 215 #endif 216 217 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0); 218 iovlen = uio->uio_iovcnt * sizeof(struct iovec); 219 220 if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL) 221 return (EBADF); 222 223 /* Checks for positioned read. */ 224 if (flags & FO_POSITION) { 225 struct vnode *vp = fp->f_data; 226 227 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO || 228 (vp->v_flag & VISTTY)) { 229 error = ESPIPE; 230 goto done; 231 } 232 233 if (uio->uio_offset < 0 && vp->v_type != VCHR) { 234 error = EINVAL; 235 goto done; 236 } 237 } 238 239 uio->uio_rw = UIO_READ; 240 uio->uio_segflg = UIO_USERSPACE; 241 uio->uio_procp = p; 242 #ifdef KTRACE 243 /* 244 * if tracing, save a copy of iovec 245 */ 246 if (KTRPOINT(p, KTR_GENIO)) { 247 ktriov = malloc(iovlen, M_TEMP, M_WAITOK); 248 memcpy(ktriov, uio->uio_iov, iovlen); 249 } 250 #endif 251 cnt = uio->uio_resid; 252 error = (*fp->f_ops->fo_read)(fp, uio, flags); 253 if (error) { 254 if (uio->uio_resid != cnt && (error == ERESTART || 255 error == EINTR || error == EWOULDBLOCK)) 256 error = 0; 257 } 258 cnt -= uio->uio_resid; 259 260 mtx_enter(&fp->f_mtx); 261 fp->f_rxfer++; 262 fp->f_rbytes += cnt; 263 mtx_leave(&fp->f_mtx); 264 #ifdef KTRACE 265 if (ktriov != NULL) { 266 if (error == 0) 267 ktrgenio(p, fd, UIO_READ, ktriov, cnt); 268 free(ktriov, M_TEMP, iovlen); 269 } 270 #endif 271 *retval = cnt; 272 done: 273 FRELE(fp, p); 274 return (error); 275 } 276 277 /* 278 * Write system call 279 */ 280 int 281 sys_write(struct proc *p, void *v, register_t *retval) 282 { 283 struct sys_write_args /* { 284 syscallarg(int) fd; 285 syscallarg(const void *) buf; 286 syscallarg(size_t) nbyte; 287 } */ *uap = v; 288 struct iovec iov; 289 struct uio auio; 290 291 iov.iov_base = (void *)SCARG(uap, buf); 292 iov.iov_len = SCARG(uap, nbyte); 293 if (iov.iov_len > SSIZE_MAX) 294 return (EINVAL); 295 296 auio.uio_iov = &iov; 297 auio.uio_iovcnt = 1; 298 auio.uio_resid = iov.iov_len; 299 300 return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval)); 301 } 302 303 /* 304 * Gather write system call 305 */ 306 int 307 sys_writev(struct proc *p, void *v, register_t *retval) 308 { 309 struct sys_writev_args /* { 310 syscallarg(int) fd; 311 syscallarg(const struct iovec *) iovp; 312 syscallarg(int) iovcnt; 313 } */ *uap = v; 314 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 315 int error, iovcnt = SCARG(uap, iovcnt); 316 struct uio auio; 317 size_t resid; 318 319 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid); 320 if (error) 321 goto done; 322 323 auio.uio_iov = iov; 324 auio.uio_iovcnt = iovcnt; 325 auio.uio_resid = resid; 326 327 error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval); 328 done: 329 iovec_free(iov, iovcnt); 330 return (error); 331 } 332 333 int 334 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags, 335 register_t *retval) 336 { 337 struct filedesc *fdp = p->p_fd; 338 struct file *fp; 339 long cnt, error = 0; 340 u_int iovlen; 341 #ifdef KTRACE 342 struct iovec *ktriov = NULL; 343 #endif 344 345 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0); 346 iovlen = uio->uio_iovcnt * sizeof(struct iovec); 347 348 if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL) 349 return (EBADF); 350 351 /* Checks for positioned write. */ 352 if (flags & FO_POSITION) { 353 struct vnode *vp = fp->f_data; 354 355 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO || 356 (vp->v_flag & VISTTY)) { 357 error = ESPIPE; 358 goto done; 359 } 360 361 if (uio->uio_offset < 0 && vp->v_type != VCHR) { 362 error = EINVAL; 363 goto done; 364 } 365 } 366 367 uio->uio_rw = UIO_WRITE; 368 uio->uio_segflg = UIO_USERSPACE; 369 uio->uio_procp = p; 370 #ifdef KTRACE 371 /* 372 * if tracing, save a copy of iovec 373 */ 374 if (KTRPOINT(p, KTR_GENIO)) { 375 ktriov = malloc(iovlen, M_TEMP, M_WAITOK); 376 memcpy(ktriov, uio->uio_iov, iovlen); 377 } 378 #endif 379 cnt = uio->uio_resid; 380 error = (*fp->f_ops->fo_write)(fp, uio, flags); 381 if (error) { 382 if (uio->uio_resid != cnt && (error == ERESTART || 383 error == EINTR || error == EWOULDBLOCK)) 384 error = 0; 385 if (error == EPIPE) { 386 KERNEL_LOCK(); 387 ptsignal(p, SIGPIPE, STHREAD); 388 KERNEL_UNLOCK(); 389 } 390 } 391 cnt -= uio->uio_resid; 392 393 mtx_enter(&fp->f_mtx); 394 fp->f_wxfer++; 395 fp->f_wbytes += cnt; 396 mtx_leave(&fp->f_mtx); 397 #ifdef KTRACE 398 if (ktriov != NULL) { 399 if (error == 0) 400 ktrgenio(p, fd, UIO_WRITE, ktriov, cnt); 401 free(ktriov, M_TEMP, iovlen); 402 } 403 #endif 404 *retval = cnt; 405 done: 406 FRELE(fp, p); 407 return (error); 408 } 409 410 /* 411 * Ioctl system call 412 */ 413 int 414 sys_ioctl(struct proc *p, void *v, register_t *retval) 415 { 416 struct sys_ioctl_args /* { 417 syscallarg(int) fd; 418 syscallarg(u_long) com; 419 syscallarg(void *) data; 420 } */ *uap = v; 421 struct file *fp; 422 struct filedesc *fdp = p->p_fd; 423 u_long com = SCARG(uap, com); 424 int error = 0; 425 u_int size = 0; 426 caddr_t data, memp = NULL; 427 int tmp; 428 #define STK_PARAMS 128 429 long long stkbuf[STK_PARAMS / sizeof(long long)]; 430 431 if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL) 432 return (EBADF); 433 434 if (fp->f_type == DTYPE_SOCKET) { 435 struct socket *so = fp->f_data; 436 437 if (so->so_state & SS_DNS) { 438 error = EINVAL; 439 goto out; 440 } 441 } 442 443 error = pledge_ioctl(p, com, fp); 444 if (error) 445 goto out; 446 447 switch (com) { 448 case FIONCLEX: 449 case FIOCLEX: 450 fdplock(fdp); 451 if (com == FIONCLEX) 452 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE; 453 else 454 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE; 455 fdpunlock(fdp); 456 goto out; 457 } 458 459 /* 460 * Interpret high order word to find amount of data to be 461 * copied to/from the user's address space. 462 */ 463 size = IOCPARM_LEN(com); 464 if (size > IOCPARM_MAX) { 465 error = ENOTTY; 466 goto out; 467 } 468 if (size > sizeof (stkbuf)) { 469 memp = malloc(size, M_IOCTLOPS, M_WAITOK); 470 data = memp; 471 } else 472 data = (caddr_t)stkbuf; 473 if (com&IOC_IN) { 474 if (size) { 475 error = copyin(SCARG(uap, data), data, size); 476 if (error) { 477 goto out; 478 } 479 } else 480 *(caddr_t *)data = SCARG(uap, data); 481 } else if ((com&IOC_OUT) && size) 482 /* 483 * Zero the buffer so the user always 484 * gets back something deterministic. 485 */ 486 memset(data, 0, size); 487 else if (com&IOC_VOID) 488 *(caddr_t *)data = SCARG(uap, data); 489 490 switch (com) { 491 492 case FIONBIO: 493 if ((tmp = *(int *)data) != 0) 494 atomic_setbits_int(&fp->f_flag, FNONBLOCK); 495 else 496 atomic_clearbits_int(&fp->f_flag, FNONBLOCK); 497 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); 498 break; 499 500 case FIOASYNC: 501 if ((tmp = *(int *)data) != 0) 502 atomic_setbits_int(&fp->f_flag, FASYNC); 503 else 504 atomic_clearbits_int(&fp->f_flag, FASYNC); 505 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); 506 break; 507 508 default: 509 error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); 510 break; 511 } 512 /* 513 * Copy any data to user, size was 514 * already set and checked above. 515 */ 516 if (error == 0 && (com&IOC_OUT) && size) 517 error = copyout(data, SCARG(uap, data), size); 518 out: 519 FRELE(fp, p); 520 free(memp, M_IOCTLOPS, size); 521 return (error); 522 } 523 524 int selwait, nselcoll; 525 526 /* 527 * Select system call. 528 */ 529 int 530 sys_select(struct proc *p, void *v, register_t *retval) 531 { 532 struct sys_select_args /* { 533 syscallarg(int) nd; 534 syscallarg(fd_set *) in; 535 syscallarg(fd_set *) ou; 536 syscallarg(fd_set *) ex; 537 syscallarg(struct timeval *) tv; 538 } */ *uap = v; 539 540 struct timespec ts, *tsp = NULL; 541 int error; 542 543 if (SCARG(uap, tv) != NULL) { 544 struct timeval tv; 545 if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0) 546 return (error); 547 #ifdef KTRACE 548 if (KTRPOINT(p, KTR_STRUCT)) 549 ktrreltimeval(p, &tv); 550 #endif 551 if (tv.tv_sec < 0 || !timerisvalid(&tv)) 552 return (EINVAL); 553 TIMEVAL_TO_TIMESPEC(&tv, &ts); 554 tsp = &ts; 555 } 556 557 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), 558 SCARG(uap, ex), tsp, NULL, retval)); 559 } 560 561 int 562 sys_pselect(struct proc *p, void *v, register_t *retval) 563 { 564 struct sys_pselect_args /* { 565 syscallarg(int) nd; 566 syscallarg(fd_set *) in; 567 syscallarg(fd_set *) ou; 568 syscallarg(fd_set *) ex; 569 syscallarg(const struct timespec *) ts; 570 syscallarg(const sigset_t *) mask; 571 } */ *uap = v; 572 573 struct timespec ts, *tsp = NULL; 574 sigset_t ss, *ssp = NULL; 575 int error; 576 577 if (SCARG(uap, ts) != NULL) { 578 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0) 579 return (error); 580 #ifdef KTRACE 581 if (KTRPOINT(p, KTR_STRUCT)) 582 ktrreltimespec(p, &ts); 583 #endif 584 if (ts.tv_sec < 0 || !timespecisvalid(&ts)) 585 return (EINVAL); 586 tsp = &ts; 587 } 588 if (SCARG(uap, mask) != NULL) { 589 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0) 590 return (error); 591 ssp = &ss; 592 } 593 594 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), 595 SCARG(uap, ex), tsp, ssp, retval)); 596 } 597 598 int 599 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex, 600 struct timespec *timeout, const sigset_t *sigmask, register_t *retval) 601 { 602 struct kqueue_scan_state scan; 603 struct timespec zerots = {}; 604 fd_mask bits[6]; 605 fd_set *pibits[3], *pobits[3]; 606 int error, ncollected = 0, nevents = 0; 607 u_int ni; 608 609 if (nd < 0) 610 return (EINVAL); 611 if (nd > p->p_fd->fd_nfiles) { 612 /* forgiving; slightly wrong */ 613 nd = p->p_fd->fd_nfiles; 614 } 615 ni = howmany(nd, NFDBITS) * sizeof(fd_mask); 616 if (ni > sizeof(bits[0])) { 617 caddr_t mbits; 618 619 mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO); 620 pibits[0] = (fd_set *)&mbits[ni * 0]; 621 pibits[1] = (fd_set *)&mbits[ni * 1]; 622 pibits[2] = (fd_set *)&mbits[ni * 2]; 623 pobits[0] = (fd_set *)&mbits[ni * 3]; 624 pobits[1] = (fd_set *)&mbits[ni * 4]; 625 pobits[2] = (fd_set *)&mbits[ni * 5]; 626 } else { 627 memset(bits, 0, sizeof(bits)); 628 pibits[0] = (fd_set *)&bits[0]; 629 pibits[1] = (fd_set *)&bits[1]; 630 pibits[2] = (fd_set *)&bits[2]; 631 pobits[0] = (fd_set *)&bits[3]; 632 pobits[1] = (fd_set *)&bits[4]; 633 pobits[2] = (fd_set *)&bits[5]; 634 } 635 636 kqpoll_init(nd); 637 638 #define getbits(name, x) \ 639 if (name && (error = copyin(name, pibits[x], ni))) \ 640 goto done; 641 getbits(in, 0); 642 getbits(ou, 1); 643 getbits(ex, 2); 644 #undef getbits 645 #ifdef KTRACE 646 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) { 647 if (in) ktrfdset(p, pibits[0], ni); 648 if (ou) ktrfdset(p, pibits[1], ni); 649 if (ex) ktrfdset(p, pibits[2], ni); 650 } 651 #endif 652 653 if (sigmask) 654 dosigsuspend(p, *sigmask &~ sigcantmask); 655 656 /* Register kqueue events */ 657 error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected); 658 if (error != 0) 659 goto done; 660 661 /* 662 * The poll/select family of syscalls has been designed to 663 * block when file descriptors are not available, even if 664 * there's nothing to wait for. 665 */ 666 if (nevents == 0 && ncollected == 0) { 667 uint64_t nsecs = INFSLP; 668 669 if (timeout != NULL) { 670 if (!timespecisset(timeout)) 671 goto done; 672 nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP)); 673 } 674 error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs); 675 /* select is not restarted after signals... */ 676 if (error == ERESTART) 677 error = EINTR; 678 if (error == EWOULDBLOCK) 679 error = 0; 680 goto done; 681 } 682 683 /* Do not block if registering found pending events. */ 684 if (ncollected > 0) 685 timeout = &zerots; 686 687 /* Collect at most `nevents' possibly waiting in kqueue_scan() */ 688 kqueue_scan_setup(&scan, p->p_kq); 689 while (nevents > 0) { 690 struct kevent kev[KQ_NEVENTS]; 691 int i, ready, count; 692 693 /* Maximum number of events per iteration */ 694 count = MIN(nitems(kev), nevents); 695 ready = kqueue_scan(&scan, count, kev, timeout, p, &error); 696 #ifdef KTRACE 697 if (KTRPOINT(p, KTR_STRUCT)) 698 ktrevent(p, kev, ready); 699 #endif 700 /* Convert back events that are ready. */ 701 for (i = 0; i < ready && error == 0; i++) 702 error = pselcollect(p, &kev[i], pobits, &ncollected); 703 /* 704 * Stop if there was an error or if we had enough 705 * space to collect all events that were ready. 706 */ 707 if (error || ready < count) 708 break; 709 710 nevents -= ready; 711 } 712 kqueue_scan_finish(&scan); 713 *retval = ncollected; 714 done: 715 #define putbits(name, x) \ 716 if (name && (error2 = copyout(pobits[x], name, ni))) \ 717 error = error2; 718 if (error == 0) { 719 int error2; 720 721 putbits(in, 0); 722 putbits(ou, 1); 723 putbits(ex, 2); 724 #undef putbits 725 #ifdef KTRACE 726 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) { 727 if (in) ktrfdset(p, pobits[0], ni); 728 if (ou) ktrfdset(p, pobits[1], ni); 729 if (ex) ktrfdset(p, pobits[2], ni); 730 } 731 #endif 732 } 733 734 if (pibits[0] != (fd_set *)&bits[0]) 735 free(pibits[0], M_TEMP, 6 * ni); 736 737 kqpoll_done(nd); 738 739 return (error); 740 } 741 742 /* 743 * Convert fd_set into kqueue events and register them on the 744 * per-thread queue. 745 */ 746 int 747 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd, 748 int *nregistered, int *ncollected) 749 { 750 static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT }; 751 static const int evff[] = { 0, 0, NOTE_OOB }; 752 int msk, i, j, fd, nevents = 0, error = 0; 753 struct kevent kev; 754 fd_mask bits; 755 756 for (msk = 0; msk < 3; msk++) { 757 for (i = 0; i < nfd; i += NFDBITS) { 758 bits = pibits[msk]->fds_bits[i / NFDBITS]; 759 while ((j = ffs(bits)) && (fd = i + --j) < nfd) { 760 bits &= ~(1 << j); 761 762 DPRINTFN(2, "select fd %d mask %d serial %lu\n", 763 fd, msk, p->p_kq_serial); 764 EV_SET(&kev, fd, evf[msk], 765 EV_ADD|EV_ENABLE|__EV_SELECT, 766 evff[msk], 0, (void *)(p->p_kq_serial)); 767 #ifdef KTRACE 768 if (KTRPOINT(p, KTR_STRUCT)) 769 ktrevent(p, &kev, 1); 770 #endif 771 error = kqueue_register(p->p_kq, &kev, p); 772 switch (error) { 773 case 0: 774 nevents++; 775 /* FALLTHROUGH */ 776 case EOPNOTSUPP:/* No underlying kqfilter */ 777 case EINVAL: /* Unimplemented filter */ 778 case EPERM: /* Specific to FIFO and 779 * __EV_SELECT */ 780 error = 0; 781 break; 782 case EPIPE: /* Specific to pipes */ 783 KASSERT(kev.filter == EVFILT_WRITE); 784 FD_SET(kev.ident, pobits[1]); 785 (*ncollected)++; 786 error = 0; 787 break; 788 case ENXIO: /* Device has been detached */ 789 default: 790 goto bad; 791 } 792 } 793 } 794 } 795 796 *nregistered = nevents; 797 return (0); 798 bad: 799 DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident, 800 kev.filter, error); 801 return (error); 802 } 803 804 /* 805 * Convert given kqueue event into corresponding select(2) bit. 806 */ 807 int 808 pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3], 809 int *ncollected) 810 { 811 if ((unsigned long)kevp->udata != p->p_kq_serial) { 812 panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx", 813 __func__, kevp, (int)kevp->ident, 814 (unsigned long)kevp->udata, p->p_kq_serial); 815 } 816 817 if (kevp->flags & EV_ERROR) { 818 DPRINTFN(2, "select fd %d filt %d error %d\n", 819 (int)kevp->ident, kevp->filter, (int)kevp->data); 820 return (kevp->data); 821 } 822 823 switch (kevp->filter) { 824 case EVFILT_READ: 825 FD_SET(kevp->ident, pobits[0]); 826 break; 827 case EVFILT_WRITE: 828 FD_SET(kevp->ident, pobits[1]); 829 break; 830 case EVFILT_EXCEPT: 831 FD_SET(kevp->ident, pobits[2]); 832 break; 833 default: 834 KASSERT(0); 835 } 836 (*ncollected)++; 837 838 DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter); 839 return (0); 840 } 841 842 int 843 seltrue(dev_t dev, int events, struct proc *p) 844 { 845 846 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 847 } 848 849 int 850 selfalse(dev_t dev, int events, struct proc *p) 851 { 852 853 return (0); 854 } 855 856 /* 857 * Record a select request. 858 */ 859 void 860 selrecord(struct proc *selector, struct selinfo *sip) 861 { 862 struct proc *p; 863 pid_t mytid; 864 865 KERNEL_ASSERT_LOCKED(); 866 867 mytid = selector->p_tid; 868 if (sip->si_seltid == mytid) 869 return; 870 if (sip->si_seltid && (p = tfind(sip->si_seltid)) && 871 p->p_wchan == (caddr_t)&selwait) 872 sip->si_flags |= SI_COLL; 873 else 874 sip->si_seltid = mytid; 875 } 876 877 /* 878 * Do a wakeup when a selectable event occurs. 879 */ 880 void 881 selwakeup(struct selinfo *sip) 882 { 883 KERNEL_LOCK(); 884 KNOTE(&sip->si_note, NOTE_SUBMIT); 885 doselwakeup(sip); 886 KERNEL_UNLOCK(); 887 } 888 889 void 890 doselwakeup(struct selinfo *sip) 891 { 892 struct proc *p; 893 894 KERNEL_ASSERT_LOCKED(); 895 896 if (sip->si_seltid == 0) 897 return; 898 if (sip->si_flags & SI_COLL) { 899 nselcoll++; 900 sip->si_flags &= ~SI_COLL; 901 wakeup(&selwait); 902 } 903 p = tfind(sip->si_seltid); 904 sip->si_seltid = 0; 905 if (p != NULL) { 906 if (wakeup_proc(p, &selwait)) { 907 /* nothing else to do */ 908 } else if (p->p_flag & P_SELECT) 909 atomic_clearbits_int(&p->p_flag, P_SELECT); 910 } 911 } 912 913 void 914 pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval) 915 { 916 struct filedesc *fdp = p->p_fd; 917 struct file *fp; 918 u_int i; 919 int n = 0; 920 921 for (i = 0; i < nfd; i++, pl++) { 922 /* Check the file descriptor. */ 923 if (pl->fd < 0) { 924 pl->revents = 0; 925 continue; 926 } 927 if ((fp = fd_getfile(fdp, pl->fd)) == NULL) { 928 pl->revents = POLLNVAL; 929 n++; 930 continue; 931 } 932 pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p); 933 FRELE(fp, p); 934 if (pl->revents != 0) 935 n++; 936 } 937 *retval = n; 938 } 939 940 /* 941 * Only copyout the revents field. 942 */ 943 int 944 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds) 945 { 946 int error = 0; 947 u_int i = 0; 948 949 while (!error && i++ < nfds) { 950 error = copyout(&pl->revents, &upl->revents, 951 sizeof(upl->revents)); 952 pl++; 953 upl++; 954 } 955 956 return (error); 957 } 958 959 /* 960 * We are using the same mechanism as select only we encode/decode args 961 * differently. 962 */ 963 int 964 sys_poll(struct proc *p, void *v, register_t *retval) 965 { 966 struct sys_poll_args /* { 967 syscallarg(struct pollfd *) fds; 968 syscallarg(u_int) nfds; 969 syscallarg(int) timeout; 970 } */ *uap = v; 971 972 struct timespec ts, *tsp = NULL; 973 int msec = SCARG(uap, timeout); 974 975 if (msec != INFTIM) { 976 if (msec < 0) 977 return (EINVAL); 978 ts.tv_sec = msec / 1000; 979 ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000; 980 tsp = &ts; 981 } 982 983 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL, 984 retval)); 985 } 986 987 int 988 sys_ppoll(struct proc *p, void *v, register_t *retval) 989 { 990 struct sys_ppoll_args /* { 991 syscallarg(struct pollfd *) fds; 992 syscallarg(u_int) nfds; 993 syscallarg(const struct timespec *) ts; 994 syscallarg(const sigset_t *) mask; 995 } */ *uap = v; 996 997 int error; 998 struct timespec ts, *tsp = NULL; 999 sigset_t ss, *ssp = NULL; 1000 1001 if (SCARG(uap, ts) != NULL) { 1002 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0) 1003 return (error); 1004 #ifdef KTRACE 1005 if (KTRPOINT(p, KTR_STRUCT)) 1006 ktrreltimespec(p, &ts); 1007 #endif 1008 if (ts.tv_sec < 0 || !timespecisvalid(&ts)) 1009 return (EINVAL); 1010 tsp = &ts; 1011 } 1012 1013 if (SCARG(uap, mask) != NULL) { 1014 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0) 1015 return (error); 1016 ssp = &ss; 1017 } 1018 1019 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp, 1020 retval)); 1021 } 1022 1023 int 1024 doppoll(struct proc *p, struct pollfd *fds, u_int nfds, 1025 struct timespec *timeout, const sigset_t *sigmask, register_t *retval) 1026 { 1027 size_t sz; 1028 struct pollfd pfds[4], *pl = pfds; 1029 struct timespec elapsed, start, stop; 1030 uint64_t nsecs; 1031 int ncoll, i, s, error; 1032 1033 /* Standards say no more than MAX_OPEN; this is possibly better. */ 1034 if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles)) 1035 return (EINVAL); 1036 1037 /* optimize for the default case, of a small nfds value */ 1038 if (nfds > nitems(pfds)) { 1039 pl = mallocarray(nfds, sizeof(*pl), M_TEMP, 1040 M_WAITOK | M_CANFAIL); 1041 if (pl == NULL) 1042 return (EINVAL); 1043 } 1044 1045 sz = nfds * sizeof(*pl); 1046 1047 if ((error = copyin(fds, pl, sz)) != 0) 1048 goto bad; 1049 1050 for (i = 0; i < nfds; i++) { 1051 pl[i].events &= ~POLL_NOHUP; 1052 pl[i].revents = 0; 1053 } 1054 1055 if (sigmask) 1056 dosigsuspend(p, *sigmask &~ sigcantmask); 1057 1058 retry: 1059 ncoll = nselcoll; 1060 atomic_setbits_int(&p->p_flag, P_SELECT); 1061 pollscan(p, pl, nfds, retval); 1062 if (*retval) 1063 goto done; 1064 if (timeout == NULL || timespecisset(timeout)) { 1065 if (timeout != NULL) { 1066 getnanouptime(&start); 1067 nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP); 1068 } else 1069 nsecs = INFSLP; 1070 s = splhigh(); 1071 if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { 1072 splx(s); 1073 goto retry; 1074 } 1075 atomic_clearbits_int(&p->p_flag, P_SELECT); 1076 error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs); 1077 splx(s); 1078 if (timeout != NULL) { 1079 getnanouptime(&stop); 1080 timespecsub(&stop, &start, &elapsed); 1081 timespecsub(timeout, &elapsed, timeout); 1082 if (timeout->tv_sec < 0) 1083 timespecclear(timeout); 1084 } 1085 if (error == 0 || error == EWOULDBLOCK) 1086 goto retry; 1087 } 1088 1089 done: 1090 atomic_clearbits_int(&p->p_flag, P_SELECT); 1091 /* 1092 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is 1093 * ignored (since the whole point is to see what would block). 1094 */ 1095 switch (error) { 1096 case ERESTART: 1097 error = pollout(pl, fds, nfds); 1098 if (error == 0) 1099 error = EINTR; 1100 break; 1101 case EWOULDBLOCK: 1102 case 0: 1103 error = pollout(pl, fds, nfds); 1104 break; 1105 } 1106 #ifdef KTRACE 1107 if (KTRPOINT(p, KTR_STRUCT)) 1108 ktrpollfd(p, pl, nfds); 1109 #endif /* KTRACE */ 1110 bad: 1111 if (pl != pfds) 1112 free(pl, M_TEMP, sz); 1113 return (error); 1114 } 1115 1116 /* 1117 * utrace system call 1118 */ 1119 int 1120 sys_utrace(struct proc *curp, void *v, register_t *retval) 1121 { 1122 #ifdef KTRACE 1123 struct sys_utrace_args /* { 1124 syscallarg(const char *) label; 1125 syscallarg(const void *) addr; 1126 syscallarg(size_t) len; 1127 } */ *uap = v; 1128 1129 return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr), 1130 SCARG(uap, len))); 1131 #else 1132 return (0); 1133 #endif 1134 } 1135