1 /* $NetBSD: sys_generic.c,v 1.109 2007/12/05 07:06:55 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 2007 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * Copyright (c) 1982, 1986, 1989, 1993 41 * The Regents of the University of California. All rights reserved. 42 * (c) UNIX System Laboratories, Inc. 43 * All or some portions of this file are derived from material licensed 44 * to the University of California by American Telephone and Telegraph 45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 46 * the permission of UNIX System Laboratories, Inc. 47 * 48 * Redistribution and use in source and binary forms, with or without 49 * modification, are permitted provided that the following conditions 50 * are met: 51 * 1. Redistributions of source code must retain the above copyright 52 * notice, this list of conditions and the following disclaimer. 53 * 2. Redistributions in binary form must reproduce the above copyright 54 * notice, this list of conditions and the following disclaimer in the 55 * documentation and/or other materials provided with the distribution. 56 * 3. Neither the name of the University nor the names of its contributors 57 * may be used to endorse or promote products derived from this software 58 * without specific prior written permission. 59 * 60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 63 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 70 * SUCH DAMAGE. 71 * 72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 73 */ 74 75 /* 76 * System calls relating to files. 77 */ 78 79 #include <sys/cdefs.h> 80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.109 2007/12/05 07:06:55 ad Exp $"); 81 82 #include <sys/param.h> 83 #include <sys/systm.h> 84 #include <sys/filedesc.h> 85 #include <sys/ioctl.h> 86 #include <sys/file.h> 87 #include <sys/proc.h> 88 #include <sys/socketvar.h> 89 #include <sys/signalvar.h> 90 #include <sys/uio.h> 91 #include <sys/kernel.h> 92 #include <sys/stat.h> 93 #include <sys/kmem.h> 94 #include <sys/poll.h> 95 #include <sys/vnode.h> 96 #include <sys/mount.h> 97 #include <sys/syscallargs.h> 98 #include <sys/ktrace.h> 99 100 #include <uvm/uvm_extern.h> 101 102 /* Flags for lwp::l_selflag. */ 103 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */ 104 #define SEL_SCANNING 1 /* polling descriptors */ 105 #define SEL_BLOCKING 2 /* about to block on select_cv */ 106 107 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *); 108 static int pollscan(lwp_t *, struct pollfd *, int, register_t *); 109 static void selclear(void); 110 111 /* Global state for select()/poll(). */ 112 kmutex_t select_lock; 113 kcondvar_t select_cv; 114 int nselcoll; 115 116 /* 117 * Read system call. 
 */
/* ARGSUSED */
int
sys_read(lwp_t *l, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	/*
	 * NOTE(review): the error path below releases fp->f_lock, so
	 * fd_getfile() evidently returns with that mutex held on success;
	 * FILE_USE() presumably consumes it -- confirm against filedesc code.
	 */
	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/* Descriptor must have been opened for reading. */
	if ((fp->f_flag & FREAD) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

/*
 * Shared back end for read(2)-style calls: transfer at most nbyte bytes
 * from fp into buf, starting at *offset.  On success *retval is the byte
 * count actually transferred.  The caller must have taken a use reference
 * on fp (FILE_USE); it is always released here via FILE_UNUSE, on both
 * success and error paths.
 */
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
    off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	/* Describe the single user buffer with a one-element uio. */
	aiov.iov_base = (void *)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/*
		 * A transfer that moved some data before being interrupted
		 * is reported as a success (short read), not an error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	/* Record the transferred data for ktrace. */
	ktrgenio(fd, UIO_READ, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Scatter read system call.
 */
int
sys_readv(lwp_t *l, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;

	/* NULL offset => use (and update) the descriptor's current offset. */
	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

/*
 * Shared back end for readv(2)-style calls: gather up to iovcnt iovecs
 * (copied in from user space unless FOF_IOV_SYSSPACE is set) and read
 * into them starting at *offset, or at the file's current offset when
 * offset is NULL.  On success *retval is the byte count transferred.
 * The use reference taken on the descriptor here is always dropped via
 * FILE_UNUSE before returning.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;
	lwp_t *l;

	if (iovcnt == 0)
		return EINVAL;

	l = curlwp;

	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		/* fd_getfile() left f_lock held; drop it on this error path. */
		mutex_exit(&fp->f_lock);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* An explicit offset only makes sense on seekable vnodes. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		/* Kernel caller: use its iovec array in place. */
		iov = __UNCONST(iovp);
	else {
		/* Small counts use the on-stack array; larger ones allocate. */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/* Sum the iovec lengths, rejecting any total beyond SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* Partial transfer interrupted by a signal => short read. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(lwp_t *l, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	int fd;
	struct file *fp;

	fd = SCARG(uap, fd);

	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
		return (EBADF);

	/* Descriptor must have been opened for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

/*
 * Shared back end for write(2)-style calls: transfer at most nbyte bytes
 * from buf to fp, starting at *offset.  On success *retval is the byte
 * count transferred.  EPIPE additionally posts SIGPIPE to the calling
 * process.  The caller's use reference on fp is always released here.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
    size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* Partial transfer interrupted by a signal => short write. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			/* Broken pipe: deliver SIGPIPE as well. */
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	/* Record the transferred data for ktrace. */
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Gather write system call
 */
int
sys_writev(lwp_t *l, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;

	/* NULL offset => use (and update) the descriptor's current offset. */
	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

/*
 * Shared back end for writev(2)-style calls; mirrors do_filereadv()
 * but in the write direction, with the additional SIGPIPE delivery
 * on EPIPE.  See do_filereadv() for the iovec handling contract.
 */
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;
	lwp_t *l;

	l = curlwp;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FWRITE) == 0) {
		/* fd_getfile() left f_lock held; drop it on this error path. */
		mutex_exit(&fp->f_lock);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* An explicit offset only makes sense on seekable vnodes. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		/* Kernel caller: use its iovec array in place. */
		iov = __UNCONST(iovp);
	else {
		/* Small counts use the on-stack array; larger ones allocate. */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	/*
	 * NOTE(review): do_filereadv() uses l->l_proc->p_vmspace here;
	 * curproc should be the same process since l = curlwp, but the
	 * inconsistency is worth confirming/unifying.
	 */
	auio.uio_vmspace = curproc->p_vmspace;

	/* Sum the iovec lengths, rejecting any total beyond SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* Partial transfer interrupted by a signal => short write. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			/* Broken pipe: deliver SIGPIPE as well. */
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}

/*
 * Ioctl system call
 */
/*
 ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define	STK_PARAMS	128
	/* On-stack argument buffer; larger arguments use kmem_alloc(). */
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the error-reporting switch sane */
		goto out;
	}

	/*
	 * FIOCLEX/FIONCLEX operate on the descriptor table itself, not on
	 * the file, so handle them here without calling fo_ioctl.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			/* Trace the incoming argument data for ktrace. */
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the raw pointer through. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror the request in f_flag before passing it down. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Mirror the request in f_flag before passing it down. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* A driver returned -1 instead of an errno: complain. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		/* Nobody claimed the command: not a tty/ioctl target. */
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}

/*
 * Select system call.
 */
int
sys_pselect(lwp_t *l, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(sigset_t *) mask;
	} */ * const uap = v;
	struct timespec ats;
	struct timeval atv, *tv = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		/* Convert the timespec to the timeval used internally. */
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
}

/*
 * Validate a timeout and record the monotonic start time in *sleeptv.
 * Returns -1 if the timeout is invalid (per itimerfix), else 0.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv))
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}

/*
 * Reduce *tv by the time elapsed since *sleeptv (monotonic clock) and
 * return the remaining timeout converted to timer ticks.
 */
int
gettimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timeval slepttv;
	/*
	 * reduce tv by elapsed time
	 * based on monotonic time scale
	 */
	getmicrouptime(&slepttv);
	timeradd(tv, sleeptv, tv);
	timersub(tv, &slepttv, tv);
	*sleeptv = slepttv;
	return tvtohz(tv);
}

int
sys_select(lwp_t *l, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ * const uap = v;
	struct timeval atv, *tv = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv,
			sizeof(atv));
		if (error)
			return error;
		tv = &atv;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
}

/*
 * Common code for select(2) and pselect(2).  The bits buffer holds six
 * fd_mask arrays of ni bytes each: slots 0-2 are the input in/ou/ex sets,
 * slots 3-5 the corresponding output sets.  Scan, and if nothing is ready
 * sleep on select_cv until woken by selwakeup(), a timeout, or a signal.
 * If mask is non-NULL the signal mask is swapped in for the duration.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; absent sets are treated as empty. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Never allow SIGKILL/SIGSTOP-class signals to be masked. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Mark this LWP as scanning and snapshot the collision
		 * counter; the scan itself runs unlocked.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If selwakeup() reset our state, or a collision was
		 * recorded, an event may have been missed: rescan rather
		 * than sleep.
		 */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the output sets (slots 3-5) back to user space. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

/*
 * Poll each descriptor named in the three input bit sets, recording
 * ready descriptors in the corresponding output sets.  *retval is set
 * to the number of ready bits found.
 */
int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    register_t *retval)
{
	/* Poll events corresponding to the in/ou/ex sets, in that order. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Visit each set bit in this word via ffs(). */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(lwp_t *l, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ * const uap = v;
	struct timeval atv, *tv = NULL;

	/* INFTIM means block indefinitely (tv stays NULL). */
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		tv = &atv;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
		tv, NULL);
}

/*
 * Poll system call.
 */
int
sys_pollts(lwp_t *l, void *v, register_t *retval)
{
	struct sys_pollts_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ * const uap = v;
	struct timespec ats;
	struct timeval atv, *tv = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		/* Convert the timespec to the timeval used internally. */
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
		tv, mask);
}

/*
 * Common code for poll(2) and pollts(2).  Same scan/sleep/retry scheme
 * as selcommon(), but the transfer buffer is the pollfd array itself,
 * copied in, scanned in place, and copied back out.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void * bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Never allow SIGKILL/SIGSTOP-class signals to be masked. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Snapshot collision count and mark scanning; see selcommon. */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}

/*
 * Poll each pollfd entry, filling in revents in place.  *retval is set
 * to the number of entries with non-zero revents.  Invalid descriptors
 * get POLLNVAL; negative fds are skipped with revents = 0.
 */
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}

/*
 * Generic fo_poll helper for devices that are always ready for I/O.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}

/*
 * Wake select/poll waiters and deliver the hint to any kqueue
 * knotes attached to the selector.
 */
void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}

/*
 * Remove an LWP from all objects that it is waiting for.
 */
static void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}

/*
 * Initialize the select/poll system calls.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}

/*
 * Initialize a selector.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/*
	 * Unlocked fast path: no waiter was ever recorded, so there is
	 * nothing to tear down.  The check is repeated under select_lock
	 * below before anything is modified.
	 */
	if (sip->sel_lwp == NULL)
		return;

	mutex_enter(&select_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}