1 /* $NetBSD: sys_generic.c,v 1.113 2008/03/05 18:09:58 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * Copyright (c) 1982, 1986, 1989, 1993 41 * The Regents of the University of California. All rights reserved. 42 * (c) UNIX System Laboratories, Inc. 43 * All or some portions of this file are derived from material licensed 44 * to the University of California by American Telephone and Telegraph 45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 46 * the permission of UNIX System Laboratories, Inc. 47 * 48 * Redistribution and use in source and binary forms, with or without 49 * modification, are permitted provided that the following conditions 50 * are met: 51 * 1. Redistributions of source code must retain the above copyright 52 * notice, this list of conditions and the following disclaimer. 53 * 2. Redistributions in binary form must reproduce the above copyright 54 * notice, this list of conditions and the following disclaimer in the 55 * documentation and/or other materials provided with the distribution. 56 * 3. Neither the name of the University nor the names of its contributors 57 * may be used to endorse or promote products derived from this software 58 * without specific prior written permission. 59 * 60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 63 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 70 * SUCH DAMAGE. 71 * 72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 73 */ 74 75 /* 76 * System calls relating to files. 77 */ 78 79 #include <sys/cdefs.h> 80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.113 2008/03/05 18:09:58 ad Exp $"); 81 82 #include <sys/param.h> 83 #include <sys/systm.h> 84 #include <sys/filedesc.h> 85 #include <sys/ioctl.h> 86 #include <sys/file.h> 87 #include <sys/proc.h> 88 #include <sys/socketvar.h> 89 #include <sys/signalvar.h> 90 #include <sys/uio.h> 91 #include <sys/kernel.h> 92 #include <sys/stat.h> 93 #include <sys/kmem.h> 94 #include <sys/poll.h> 95 #include <sys/vnode.h> 96 #include <sys/mount.h> 97 #include <sys/syscallargs.h> 98 #include <sys/ktrace.h> 99 100 #include <uvm/uvm_extern.h> 101 102 /* Flags for lwp::l_selflag. */ 103 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */ 104 #define SEL_SCANNING 1 /* polling descriptors */ 105 #define SEL_BLOCKING 2 /* about to block on select_cv */ 106 107 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *); 108 static int pollscan(lwp_t *, struct pollfd *, int, register_t *); 109 110 /* Global state for select()/poll(). */ 111 kmutex_t select_lock; 112 kcondvar_t select_cv; 113 int nselcoll; 114 115 /* 116 * Read system call. 
 */
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(void *)	buf;
		syscallarg(size_t)	nbyte;
	} */
	int fd;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	/* fd_getfile() returns the file entry locked; unlock on error. */
	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FREAD) == 0) {
		FILE_UNLOCK(fp);
		return (EBADF);
	}

	/* Take a use reference; dropped by dofileread(). */
	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

/*
 * Common code for read(2)-style transfers: build a single-segment uio
 * and pass it to the file's fo_read method.  The caller must hold a
 * use reference on fp (FILE_USE); it is released here on all paths.
 * On success *retval is the number of bytes transferred.
 */
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
	off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	aiov.iov_base = (void *)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_READ, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */

	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

/*
 * Common code for readv(2)/preadv-style transfers.  If offset is NULL
 * the file's own offset is used; otherwise the file must be a seekable
 * vnode.  With FOF_IOV_SYSSPACE the iovec array is already in kernel
 * space, otherwise it is copied in (heap-allocated when larger than
 * UIO_SMALLIOV entries).
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
	off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;
	lwp_t *l;

	if (iovcnt == 0)
		return EINVAL;

	l = curlwp;

	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		FILE_UNLOCK(fp);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		struct vnode *vp = fp->f_data;
		/* Positional I/O only makes sense on seekable vnodes. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			/* Cast also rejects negative iovcnt. */
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)			fd;
		syscallarg(const void *)	buf;
		syscallarg(size_t)		nbyte;
	} */
	int fd;
	struct file *fp;

	fd = SCARG(uap, fd);

	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FWRITE) == 0) {
		FILE_UNLOCK(fp);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

/*
 * Common code for write(2)-style transfers; mirror image of
 * dofileread().  On EPIPE the writing process is sent SIGPIPE.
 * The caller's use reference on fp is released here on all paths.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			/* proclist_mutex protects the signal delivery. */
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Gather write system call
 */
int
sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */

	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

/*
 * Common code for writev(2)/pwritev-style transfers; mirror image of
 * do_filereadv().  See that function for the offset and iovec copyin
 * conventions.
 */
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
	off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;
	lwp_t *l;

	l = curlwp;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FWRITE) == 0) {
		FILE_UNLOCK(fp);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		struct vnode *vp = fp->f_data;
		/* Positional I/O only makes sense on seekable vnodes. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			/* Cast also rejects negative iovcnt. */
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = curproc->p_vmspace;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			/* proclist_mutex protects the signal delivery. */
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Ioctl system call
 */
/*
ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
	/* Commands with at most this much argument data avoid kmem_alloc(). */
#define	STK_PARAMS	128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the error-report printf sane */
		goto out;
	}

	/* FIONCLEX/FIOCLEX act on the descriptor table, not the file. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-length IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before informing the file object. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/*
	 * -1 is a driver bug (report it, then treat as ENOTTY);
	 * EPASSTHROUGH means no layer handled the command.
	 */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}

/*
 * Select system call.
 */
int
sys_pselect(struct lwp *l, const struct sys_pselect_args *uap,
	register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats;
	struct timeval	atv, *tv = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		/* Convert the timespec to the timeval selcommon() expects. */
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
}

/*
 * Validate a timeout and record the monotonic start time in *sleeptv.
 * Returns -1 if the timeout is invalid (per itimerfix()), else 0.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv))
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}

/*
 * Recompute the remaining timeout after a wakeup and return it as a
 * tick count for cv_timedwait_sig().  *tv is decremented in place by
 * the time elapsed since *sleeptv, and *sleeptv is advanced.
 */
int
gettimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timeval slepttv;
	/*
	 * reduce tv by elapsed time
	 * based on monotonic time scale
	 */
	getmicrouptime(&slepttv);
	timeradd(tv, sleeptv, tv);
	timersub(tv, &slepttv, tv);
	*sleeptv = slepttv;
	return tvtohz(tv);
}

/*
 * Select system call.
 */
int
sys_select(struct lwp *l, const struct sys_select_args *uap,
	register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv, *tv = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv,
			sizeof(atv));
		if (error)
			return error;
		tv = &atv;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
}

/*
 * Common code for select(2)/pselect(2).  The bits buffer holds six
 * descriptor sets of ni bytes each: input in/ou/ex at slots 0-2 and
 * output in/ou/ex at slots 3-5.  Scans descriptors, and if none are
 * ready sleeps on select_cv until selnotify() wakes us or the timeout
 * expires.  The SEL_SCANNING/SEL_BLOCKING handshake with selnotify()
 * (under select_lock) closes the race between scanning and sleeping.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval	sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each input set, or clear the slot when not supplied. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Temporarily install the caller's signal mask (pselect). */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with select_lock dropped.  If a collision or a
		 * notification (SEL_RESET) happened meanwhile, rescan
		 * instead of sleeping.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		/* Restore the original signal mask. */
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

/*
 * Poll each descriptor set bit in ibitp, setting the corresponding
 * bit in obitp for ready descriptors.  *retval is the total number of
 * ready descriptors found across all three sets.
 */
int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* Poll events corresponding to the in/ou/ex sets, in order. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	proc_t *p = l->l_proc;
	struct filedesc	*fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Visit each set bit via ffs(). */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timeval atv, *tv = NULL;

	/* INFTIM (-1) means block indefinitely: leave tv NULL. */
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		tv = &atv;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    tv, NULL);
}

/*
 * Poll system call with timespec timeout and signal mask (pollts).
 */
int
sys_pollts(struct lwp *l, const struct sys_pollts_args *uap,
	register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats;
	struct timeval	atv, *tv = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    tv, mask);
}

/*
 * Common code for poll(2)/pollts(2).  Same scan/sleep/retry structure
 * as selcommon(), but operates on an array of struct pollfd which is
 * copied in once and copied back out (revents filled in) on return.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void		*bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Temporarily install the caller's signal mask (pollts). */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* See selcommon() for the SEL_SCANNING/nselcoll protocol. */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}

/*
 * Fill in revents for each pollfd entry; *retval is the number of
 * entries with nonzero revents.  Bad descriptors get POLLNVAL rather
 * than an error, per poll(2) semantics; negative fds are skipped.
 */
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t *p = l->l_proc;
	struct filedesc	*fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}

/*
 * Degenerate poll routine for devices that are always ready.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);

	KNOTE(&sip->sel_klist, knhint);
}

/*
 * Remove an LWP from all objects that it is waiting for.
 */
void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}

/*
 * Initialize the select/poll system calls.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}

/*
 * Initialize a selector.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/* Unlocked check first; re-checked under select_lock below. */
	if (sip->sel_lwp == NULL)
		return;

	mutex_enter(&select_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}