1 /* $NetBSD: sys_generic.c,v 1.126 2011/04/10 15:45:33 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1982, 1986, 1989, 1993 34 * The Regents of the University of California. All rights reserved. 35 * (c) UNIX System Laboratories, Inc. 36 * All or some portions of this file are derived from material licensed 37 * to the University of California by American Telephone and Telegraph 38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 39 * the permission of UNIX System Laboratories, Inc. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 66 */ 67 68 /* 69 * System calls relating to files. 70 */ 71 72 #include <sys/cdefs.h> 73 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.126 2011/04/10 15:45:33 christos Exp $"); 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/filedesc.h> 78 #include <sys/ioctl.h> 79 #include <sys/file.h> 80 #include <sys/proc.h> 81 #include <sys/socketvar.h> 82 #include <sys/signalvar.h> 83 #include <sys/uio.h> 84 #include <sys/kernel.h> 85 #include <sys/stat.h> 86 #include <sys/kmem.h> 87 #include <sys/poll.h> 88 #include <sys/vnode.h> 89 #include <sys/mount.h> 90 #include <sys/syscallargs.h> 91 #include <sys/ktrace.h> 92 #include <sys/atomic.h> 93 #include <sys/disklabel.h> 94 95 #include <uvm/uvm_extern.h> 96 97 /* 98 * Read system call. 99 */ 100 /* ARGSUSED */ 101 int 102 sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval) 103 { 104 /* { 105 syscallarg(int) fd; 106 syscallarg(void *) buf; 107 syscallarg(size_t) nbyte; 108 } */ 109 file_t *fp; 110 int fd; 111 112 fd = SCARG(uap, fd); 113 114 if ((fp = fd_getfile(fd)) == NULL) 115 return (EBADF); 116 117 if ((fp->f_flag & FREAD) == 0) { 118 fd_putfile(fd); 119 return (EBADF); 120 } 121 122 /* dofileread() will unuse the descriptor for us */ 123 return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 124 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 125 } 126 127 int 128 dofileread(int fd, struct file *fp, void *buf, size_t nbyte, 129 off_t *offset, int flags, register_t *retval) 130 { 131 struct iovec aiov; 132 struct uio auio; 133 size_t cnt; 134 int error; 135 lwp_t *l; 136 137 l = curlwp; 138 139 aiov.iov_base = (void *)buf; 140 aiov.iov_len = nbyte; 141 auio.uio_iov = &aiov; 142 auio.uio_iovcnt = 1; 143 auio.uio_resid = nbyte; 144 auio.uio_rw = UIO_READ; 145 auio.uio_vmspace = l->l_proc->p_vmspace; 146 147 /* 148 * Reads return ssize_t because -1 is returned on error. Therefore 149 * we must restrict the length to SSIZE_MAX to avoid garbage return 150 * values. 151 */ 152 if (auio.uio_resid > SSIZE_MAX) { 153 error = EINVAL; 154 goto out; 155 } 156 157 cnt = auio.uio_resid; 158 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 159 if (error) 160 if (auio.uio_resid != cnt && (error == ERESTART || 161 error == EINTR || error == EWOULDBLOCK)) 162 error = 0; 163 cnt -= auio.uio_resid; 164 ktrgenio(fd, UIO_READ, buf, cnt, error); 165 *retval = cnt; 166 out: 167 fd_putfile(fd); 168 return (error); 169 } 170 171 /* 172 * Scatter read system call. 173 */ 174 int 175 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval) 176 { 177 /* { 178 syscallarg(int) fd; 179 syscallarg(const struct iovec *) iovp; 180 syscallarg(int) iovcnt; 181 } */ 182 183 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp), 184 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 185 } 186 187 int 188 do_filereadv(int fd, const struct iovec *iovp, int iovcnt, 189 off_t *offset, int flags, register_t *retval) 190 { 191 struct uio auio; 192 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 193 int i, error; 194 size_t cnt; 195 u_int iovlen; 196 struct file *fp; 197 struct iovec *ktriov = NULL; 198 199 if (iovcnt == 0) 200 return EINVAL; 201 202 if ((fp = fd_getfile(fd)) == NULL) 203 return EBADF; 204 205 if ((fp->f_flag & FREAD) == 0) { 206 fd_putfile(fd); 207 return EBADF; 208 } 209 210 if (offset == NULL) 211 offset = &fp->f_offset; 212 else { 213 struct vnode *vp = fp->f_data; 214 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 215 error = ESPIPE; 216 goto out; 217 } 218 /* 219 * Test that the device is seekable ? 220 * XXX This works because no file systems actually 221 * XXX take any action on the seek operation. 222 */ 223 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 224 if (error != 0) 225 goto out; 226 } 227 228 iovlen = iovcnt * sizeof(struct iovec); 229 if (flags & FOF_IOV_SYSSPACE) 230 iov = __UNCONST(iovp); 231 else { 232 iov = aiov; 233 if ((u_int)iovcnt > UIO_SMALLIOV) { 234 if ((u_int)iovcnt > IOV_MAX) { 235 error = EINVAL; 236 goto out; 237 } 238 iov = kmem_alloc(iovlen, KM_SLEEP); 239 if (iov == NULL) { 240 error = ENOMEM; 241 goto out; 242 } 243 needfree = iov; 244 } 245 error = copyin(iovp, iov, iovlen); 246 if (error) 247 goto done; 248 } 249 250 auio.uio_iov = iov; 251 auio.uio_iovcnt = iovcnt; 252 auio.uio_rw = UIO_READ; 253 auio.uio_vmspace = curproc->p_vmspace; 254 255 auio.uio_resid = 0; 256 for (i = 0; i < iovcnt; i++, iov++) { 257 auio.uio_resid += iov->iov_len; 258 /* 259 * Reads return ssize_t because -1 is returned on error. 260 * Therefore we must restrict the length to SSIZE_MAX to 261 * avoid garbage return values. 262 */ 263 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 264 error = EINVAL; 265 goto done; 266 } 267 } 268 269 /* 270 * if tracing, save a copy of iovec 271 */ 272 if (ktrpoint(KTR_GENIO)) { 273 ktriov = kmem_alloc(iovlen, KM_SLEEP); 274 if (ktriov != NULL) 275 memcpy(ktriov, auio.uio_iov, iovlen); 276 } 277 278 cnt = auio.uio_resid; 279 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 280 if (error) 281 if (auio.uio_resid != cnt && (error == ERESTART || 282 error == EINTR || error == EWOULDBLOCK)) 283 error = 0; 284 cnt -= auio.uio_resid; 285 *retval = cnt; 286 287 if (ktriov != NULL) { 288 ktrgeniov(fd, UIO_READ, ktriov, cnt, error); 289 kmem_free(ktriov, iovlen); 290 } 291 292 done: 293 if (needfree) 294 kmem_free(needfree, iovlen); 295 out: 296 fd_putfile(fd); 297 return (error); 298 } 299 300 /* 301 * Write system call 302 */ 303 int 304 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval) 305 { 306 /* { 307 syscallarg(int) fd; 308 syscallarg(const void *) buf; 309 syscallarg(size_t) nbyte; 310 } */ 311 file_t *fp; 312 int fd; 313 314 fd = SCARG(uap, fd); 315 316 if ((fp = fd_getfile(fd)) == NULL) 317 return (EBADF); 318 319 if ((fp->f_flag & FWRITE) == 0) { 320 fd_putfile(fd); 321 return (EBADF); 322 } 323 324 /* dofilewrite() will unuse the descriptor for us */ 325 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 326 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 327 } 328 329 int 330 dofilewrite(int fd, struct file *fp, const void *buf, 331 size_t nbyte, off_t *offset, int flags, register_t *retval) 332 { 333 struct iovec aiov; 334 struct uio auio; 335 size_t cnt; 336 int error; 337 338 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */ 339 aiov.iov_len = nbyte; 340 auio.uio_iov = &aiov; 341 auio.uio_iovcnt = 1; 342 auio.uio_resid = nbyte; 343 auio.uio_rw = UIO_WRITE; 344 auio.uio_vmspace = curproc->p_vmspace; 345 346 /* 347 * Writes return ssize_t because -1 is returned on error. Therefore 348 * we must restrict the length to SSIZE_MAX to avoid garbage return 349 * values. 350 */ 351 if (auio.uio_resid > SSIZE_MAX) { 352 error = EINVAL; 353 goto out; 354 } 355 356 cnt = auio.uio_resid; 357 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 358 if (error) { 359 if (auio.uio_resid != cnt && (error == ERESTART || 360 error == EINTR || error == EWOULDBLOCK)) 361 error = 0; 362 if (error == EPIPE) { 363 mutex_enter(proc_lock); 364 psignal(curproc, SIGPIPE); 365 mutex_exit(proc_lock); 366 } 367 } 368 cnt -= auio.uio_resid; 369 ktrgenio(fd, UIO_WRITE, buf, cnt, error); 370 *retval = cnt; 371 out: 372 fd_putfile(fd); 373 return (error); 374 } 375 376 /* 377 * Gather write system call 378 */ 379 int 380 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval) 381 { 382 /* { 383 syscallarg(int) fd; 384 syscallarg(const struct iovec *) iovp; 385 syscallarg(int) iovcnt; 386 } */ 387 388 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp), 389 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 390 } 391 392 int 393 do_filewritev(int fd, const struct iovec *iovp, int iovcnt, 394 off_t *offset, int flags, register_t *retval) 395 { 396 struct uio auio; 397 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 398 int i, error; 399 size_t cnt; 400 u_int iovlen; 401 struct file *fp; 402 struct iovec *ktriov = NULL; 403 404 if (iovcnt == 0) 405 return EINVAL; 406 407 if ((fp = fd_getfile(fd)) == NULL) 408 return EBADF; 409 410 if ((fp->f_flag & FWRITE) == 0) { 411 fd_putfile(fd); 412 return EBADF; 413 } 414 415 if (offset == NULL) 416 offset = &fp->f_offset; 417 else { 418 struct vnode *vp = fp->f_data; 419 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 420 error = ESPIPE; 421 goto out; 422 } 423 /* 424 * Test that the device is seekable ? 425 * XXX This works because no file systems actually 426 * XXX take any action on the seek operation. 427 */ 428 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 429 if (error != 0) 430 goto out; 431 } 432 433 iovlen = iovcnt * sizeof(struct iovec); 434 if (flags & FOF_IOV_SYSSPACE) 435 iov = __UNCONST(iovp); 436 else { 437 iov = aiov; 438 if ((u_int)iovcnt > UIO_SMALLIOV) { 439 if ((u_int)iovcnt > IOV_MAX) { 440 error = EINVAL; 441 goto out; 442 } 443 iov = kmem_alloc(iovlen, KM_SLEEP); 444 if (iov == NULL) { 445 error = ENOMEM; 446 goto out; 447 } 448 needfree = iov; 449 } 450 error = copyin(iovp, iov, iovlen); 451 if (error) 452 goto done; 453 } 454 455 auio.uio_iov = iov; 456 auio.uio_iovcnt = iovcnt; 457 auio.uio_rw = UIO_WRITE; 458 auio.uio_vmspace = curproc->p_vmspace; 459 460 auio.uio_resid = 0; 461 for (i = 0; i < iovcnt; i++, iov++) { 462 auio.uio_resid += iov->iov_len; 463 /* 464 * Writes return ssize_t because -1 is returned on error. 465 * Therefore we must restrict the length to SSIZE_MAX to 466 * avoid garbage return values. 467 */ 468 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 469 error = EINVAL; 470 goto done; 471 } 472 } 473 474 /* 475 * if tracing, save a copy of iovec 476 */ 477 if (ktrpoint(KTR_GENIO)) { 478 ktriov = kmem_alloc(iovlen, KM_SLEEP); 479 if (ktriov != NULL) 480 memcpy(ktriov, auio.uio_iov, iovlen); 481 } 482 483 cnt = auio.uio_resid; 484 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 485 if (error) { 486 if (auio.uio_resid != cnt && (error == ERESTART || 487 error == EINTR || error == EWOULDBLOCK)) 488 error = 0; 489 if (error == EPIPE) { 490 mutex_enter(proc_lock); 491 psignal(curproc, SIGPIPE); 492 mutex_exit(proc_lock); 493 } 494 } 495 cnt -= auio.uio_resid; 496 *retval = cnt; 497 498 if (ktriov != NULL) { 499 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error); 500 kmem_free(ktriov, iovlen); 501 } 502 503 done: 504 if (needfree) 505 kmem_free(needfree, iovlen); 506 out: 507 fd_putfile(fd); 508 return (error); 509 } 510 511 /* 512 * Ioctl system call 513 */ 514 /* ARGSUSED */ 515 int 516 sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval) 517 { 518 /* { 519 syscallarg(int) fd; 520 syscallarg(u_long) com; 521 syscallarg(void *) data; 522 } */ 523 struct file *fp; 524 proc_t *p; 525 u_long com; 526 int error; 527 size_t size, alloc_size; 528 void *data, *memp; 529 #define STK_PARAMS 128 530 u_long stkbuf[STK_PARAMS/sizeof(u_long)]; 531 532 memp = NULL; 533 alloc_size = 0; 534 error = 0; 535 p = l->l_proc; 536 537 if ((fp = fd_getfile(SCARG(uap, fd))) == NULL) 538 return (EBADF); 539 540 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 541 error = EBADF; 542 com = 0; 543 goto out; 544 } 545 546 switch (com = SCARG(uap, com)) { 547 case FIONCLEX: 548 case FIOCLEX: 549 fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX); 550 goto out; 551 } 552 553 /* 554 * Interpret high order word to find amount of data to be 555 * copied to/from the user's address space. 556 */ 557 size = IOCPARM_LEN(com); 558 alloc_size = size; 559 560 /* 561 * The disklabel is now padded to a multiple of 8 bytes however the old 562 * disklabel on 32bit platforms wasn't. This leaves a difference in 563 * size of 4 bytes between the two but are otherwise identical. 564 * To deal with this, we allocate enough space for the new disklabel 565 * but only copyin/out the smaller amount. 566 */ 567 if (IOCGROUP(com) == 'd') { 568 u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32); 569 switch (ncom) { 570 case DIOCGDINFO: 571 case DIOCWDINFO: 572 case DIOCSDINFO: 573 case DIOCGDEFLABEL: 574 com = ncom; 575 if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO)) 576 alloc_size = IOCPARM_LEN(DIOCGDINFO); 577 break; 578 } 579 } 580 if (size > IOCPARM_MAX) { 581 error = ENOTTY; 582 goto out; 583 } 584 memp = NULL; 585 if ((com >> IOCPARM_SHIFT) == 0) { 586 /* UNIX-style ioctl. */ 587 data = SCARG(uap, data); 588 } else { 589 if (alloc_size > sizeof(stkbuf)) { 590 memp = kmem_alloc(alloc_size, KM_SLEEP); 591 data = memp; 592 } else { 593 data = (void *)stkbuf; 594 } 595 if (com&IOC_IN) { 596 if (size) { 597 error = copyin(SCARG(uap, data), data, size); 598 if (error) { 599 goto out; 600 } 601 /* 602 * The data between size and alloc_size has 603 * not been overwritten. It shouldn't matter 604 * but let's clear that anyway. 605 */ 606 if (__predict_false(size < alloc_size)) { 607 memset((char *)data+size, 0, 608 alloc_size - size); 609 } 610 ktrgenio(SCARG(uap, fd), UIO_WRITE, 611 SCARG(uap, data), size, 0); 612 } else { 613 *(void **)data = SCARG(uap, data); 614 } 615 } else if ((com&IOC_OUT) && size) { 616 /* 617 * Zero the buffer so the user always 618 * gets back something deterministic. 619 */ 620 memset(data, 0, size); 621 } else if (com&IOC_VOID) { 622 *(void **)data = SCARG(uap, data); 623 } 624 } 625 626 switch (com) { 627 628 case FIONBIO: 629 /* XXX Code block is not atomic */ 630 if (*(int *)data != 0) 631 atomic_or_uint(&fp->f_flag, FNONBLOCK); 632 else 633 atomic_and_uint(&fp->f_flag, ~FNONBLOCK); 634 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data); 635 break; 636 637 case FIOASYNC: 638 /* XXX Code block is not atomic */ 639 if (*(int *)data != 0) 640 atomic_or_uint(&fp->f_flag, FASYNC); 641 else 642 atomic_and_uint(&fp->f_flag, ~FASYNC); 643 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data); 644 break; 645 646 default: 647 error = (*fp->f_ops->fo_ioctl)(fp, com, data); 648 /* 649 * Copy any data to user, size was 650 * already set and checked above. 651 */ 652 if (error == 0 && (com&IOC_OUT) && size) { 653 error = copyout(data, SCARG(uap, data), size); 654 ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data), 655 size, error); 656 } 657 break; 658 } 659 out: 660 if (memp) 661 kmem_free(memp, alloc_size); 662 fd_putfile(SCARG(uap, fd)); 663 switch (error) { 664 case -1: 665 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: " 666 "pid=%d comm=%s\n", 667 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "", 668 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com), 669 p->p_pid, p->p_comm); 670 /* FALLTHROUGH */ 671 case EPASSTHROUGH: 672 error = ENOTTY; 673 /* FALLTHROUGH */ 674 default: 675 return (error); 676 } 677 } 678