/*	$NetBSD: vfs_vnops.c,v 1.213 2020/06/11 22:21:05 ad Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.213 2020/06/11 22:21:05 ad Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
	    struct uvm_object **, int *);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
};
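/*
 * This fileops vector is attached to vnode-backed file descriptors at
 * open time.  The generic descriptor layer dispatches through it, so
 * that e.g. read(2) on such a descriptor reaches vn_read() below,
 * roughly as (sketch only):
 *
 *	error = (*fp->f_ops->fo_read)(fp, &fp->f_offset, &auio,
 *	    fp->f_cred, flags);
 */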
/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 */
int
vn_open(struct nameidata *ndp, int fmode, int cmode)
{
	struct vnode *vp;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	ndp->ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			ndp->ni_cnd.cn_flags |= FOLLOW;
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(ndp->ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	error = namei(ndp);
	if (error)
		goto out;

	vp = ndp->ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, ndp->ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		if (fmode & O_CREAT) {
			if (vp == NULL) {
				vput(ndp->ni_dvp);
			} else {
				VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
				if (ndp->ni_dvp == ndp->ni_vp)
					vrele(ndp->ni_dvp);
				else
					vput(ndp->ni_dvp);
				ndp->ni_dvp = NULL;
				vput(vp);
			}
		} else {
			vput(vp);
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	if (fmode & O_CREAT) {
		if (ndp->ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
			    &ndp->ni_cnd, &va);
			if (error) {
				vput(ndp->ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(ndp->ni_dvp);
		} else {
			VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (ndp->ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error)
		vput(vp);
out:
	pathbuf_stringcopy_put(ndp->ni_pathbuf, pathstring);
	return (error);
}
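/*
 * On success vn_open() returns with ndp->ni_vp held, opened and locked.
 * A minimal caller looks roughly like this (sketch only; see
 * vn_bdev_openpath() at the bottom of this file for a real example):
 *
 *	struct nameidata nd;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, pb);
 *	if ((error = vn_open(&nd, FREAD, 0)) != 0)
 *		return error;
 *	...use nd.ni_vp...
 *	VOP_UNLOCK(nd.ni_vp);
 *	error = vn_close(nd.ni_vp, FREAD, l->l_cred);
 */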
/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return (ETXTBSY);
	return (0);
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return (0);
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return (ETXTBSY);
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return (0);
}

/*
 * Vnode close call.
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return (error);
}

/*
 * Enforce the process's RLIMIT_FSIZE soft limit on writes to regular
 * files: if the write would extend the file beyond the limit, post
 * SIGXFSZ to the process and fail the write with EFBIG.
 */
static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}
/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return (error);
}

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return (EINVAL);
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return (error);

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return (error);
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;

		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}
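/*
 * Typical in-kernel use of vn_rdwr() above (sketch only; "vp" is a
 * referenced, unlocked vnode and "buf"/"resid" are the caller's):
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
 *	    UIO_SYSSPACE, 0, curlwp->l_cred, &resid, curlwp);
 *
 * With IO_NODELOCKED clear the vnode lock is taken and dropped here;
 * on return "resid" holds the number of bytes not transferred.
 */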
/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes the behaviour for count == 0 as
			 * follows: "Before any action ... is taken, and if
			 * nbyte is zero and the file is a regular file, the
			 * write() function ... in the absence of errors ...
			 * shall return zero and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

out:
	VOP_UNLOCK(vp);
	return (error);
}
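/*
 * Note the asymmetry above: vn_read() takes the vnode lock shared, so
 * concurrent readers can proceed, while vn_write() must take it
 * exclusive.  Both translate the descriptor's fcntl-style flags
 * (FNONBLOCK, FFSYNC/FDSYNC, FALTIO, FDIRECT) into IO_* flags for the
 * VOP layer.
 */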
/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return (error);
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return (error);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			VOP_UNLOCK(vp);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return (0);
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return (EINVAL);
			block = (daddr_t *)data;
			return (VOP_BMAP(vp, *block, NULL, block, NULL));
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return (EINVAL);
			ibn = (daddr_t)*(int32_t *)data;
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return (error);

	default:
		return (EPASSTHROUGH);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return (VOP_POLL(fp->f_vnode, events));
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return (VOP_KQFILTER(fp->f_vnode, kn));
}

/*
 * File table vnode mmap routine.
 */
static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
	if (vp->v_type != VCHR && (off_t)(off + size) < off) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;

		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}
	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * If the file is writable, add PROT_WRITE to maxprot
		 * only when the file is not immutable or append-only;
		 * otherwise, if PROT_WRITE was asked for, return EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always be written to (COW) */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i :
			    (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0
	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable.  Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
	    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

#if 0
	KASSERT(vrefcnt(vp) > 0 || (vp->v_iflag & VI_ONWORKLST) != 0);
#endif
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);
	if ((flags & LK_RETRY) != 0 && error == ENOENT)
		error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	KASSERT((flags & LK_RETRY) == 0 || (flags & LK_NOWAIT) != 0 ||
	    error == 0);

	return error;
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}
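/*
 * Sketch of a vn_extattr_get() call ("fileinfo" is a made-up attribute
 * name; "buf" is a caller-supplied buffer):
 *
 *	size_t len = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "fileinfo", &len, buf, curlwp);
 *
 * On success "len" is updated to the number of bytes actually read.
 */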
/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return (error);
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number.
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vrele(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);

	return 0;
}

/*
 * Look up the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct nameidata nd;
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW, pb);
	if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0)
		return error;

	vp = nd.ni_vp;
	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}
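/*
 * Sketch of opening a block device from a user-supplied path with
 * vn_bdev_openpath() above ("userpath" is illustrative):
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *
 *	error = pathbuf_copyin(userpath, &pb);
 *	if (error == 0) {
 *		error = vn_bdev_openpath(pb, &vp, curlwp);
 *		pathbuf_destroy(pb);
 *	}
 *
 * On success *vpp refers to an opened block device vnode whose write
 * count was bumped by vn_bdev_open().
 */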