/*	$NetBSD: vfs_vnops.c,v 1.214 2020/11/09 18:09:02 chs Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.214 2020/11/09 18:09:02 chs Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
	    struct uvm_object **, int *);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
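 *
 * Illustrative caller sketch (an addition for documentation, not part of
 * the original code; vn_bdev_openpath() at the end of this file is a real
 * in-tree caller):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, pb);
 *	if ((error = vn_open(&nd, FREAD, 0)) != 0)
 *		return error;
 *	vp = nd.ni_vp;			(returned locked and referenced)
 *	VOP_UNLOCK(vp);
 *	... use vp ...
 *	vn_close(vp, FREAD, l->l_cred);	(relocks, then drops the reference)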
 */
int
vn_open(struct nameidata *ndp, int fmode, int cmode)
{
	struct vnode *vp;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	ndp->ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			ndp->ni_cnd.cn_flags |= FOLLOW;
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(ndp->ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	error = namei(ndp);
	if (error)
		goto out;

	vp = ndp->ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, ndp->ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		if (fmode & O_CREAT) {
			if (vp == NULL) {
				vput(ndp->ni_dvp);
			} else {
				VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
				if (ndp->ni_dvp == ndp->ni_vp)
					vrele(ndp->ni_dvp);
				else
					vput(ndp->ni_dvp);
				ndp->ni_dvp = NULL;
				vput(vp);
			}
		} else {
			vput(vp);
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	if (fmode & O_CREAT) {
		if (ndp->ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
			    &ndp->ni_cnd, &va);
			if (error) {
				vput(ndp->ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(ndp->ni_dvp);
		} else {
			VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (ndp->ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error)
		vput(vp);
out:
	pathbuf_stringcopy_put(ndp->ni_pathbuf, pathstring);
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return (ETXTBSY);
	return (0);
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return (0);
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return (ETXTBSY);
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return (0);
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return (error);
}

/*
 * Check that an impending write will not exceed the process's
 * RLIMIT_FSIZE; if it would, raise SIGXFSZ and return EFBIG.
 */
static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
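 *
 * Illustrative sketch (an addition for documentation, not part of the
 * original code): read "len" bytes at offset "off" from an unlocked vnode
 * into a kernel buffer, failing with EIO on a short transfer (aresid is
 * NULL, so residue is treated as an error):
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, len, off, UIO_SYSSPACE,
 *	    0, l->l_cred, NULL, NULL);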
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return (error);
}

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return (EINVAL);
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return (error);

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return (error);
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
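 *
 * Not called directly: it is reached through fp->f_ops->fo_read.  A caller
 * sketch (assumed, mirroring the generic file read path):
 *
 *	error = (*fp->f_ops->fo_read)(fp, &fp->f_offset, &auio,
 *	    fp->f_cred, FOF_UPDATE_OFFSET);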
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes the behaviour for count = 0 as
			 * follows: "Before any action ... is taken, and if
			 * nbyte is zero and the file is a regular file, the
			 * write() function ... in the absence of errors ...
			 * shall return zero and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

out:
	VOP_UNLOCK(vp);
	return (error);
}

/*
 * File table vnode stat routine.
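 *
 * vn_stat() below expects the vnode to be locked; a minimal in-kernel
 * usage sketch (illustrative only, mirroring what vn_statfile() does):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = vn_stat(vp, &sb);
 *	VOP_UNLOCK(vp);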
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return (error);
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return (error);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			VOP_UNLOCK(vp);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return (0);
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return (EINVAL);
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return (EINVAL);
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return (error);

	default:
		return (EPASSTHROUGH);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return (VOP_POLL(fp->f_vnode, events));
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return (VOP_KQFILTER(fp->f_vnode, kn));
}

/*
 * File table vnode mmap routine: validate the request and hand back the
 * uvm_object, advice, flags and maximum protection for the mapping.
 */
static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
	if (vp->v_type != VCHR && (off_t)(off + size) < off) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * If the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable or append-only.
		 * Otherwise, if we have asked for PROT_WRITE, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings always write to a private copy. */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i :
			    (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0

	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable. Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
	    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

#if 0
	KASSERT(vrefcnt(vp) > 0 || (vp->v_iflag & VI_ONWORKLST) != 0);
#endif
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);
	if ((flags & LK_RETRY) != 0 && error == ENOENT)
		error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	KASSERT((flags & LK_RETRY) == 0 || (flags & LK_NOWAIT) != 0 ||
	    error == 0);

	return error;
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}

/*
 * XXX Failure mode if partially written?
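 *
 * Illustrative sketch (an addition for documentation; the attribute name
 * "example" and value "val" are hypothetical): set a system-namespace
 * attribute on an unlocked vnode:
 *
 *	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example", sizeof(val), &val, l);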
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return (error);
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vrele(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct nameidata nd;
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW, pb);
	if ((error = vn_open(&nd, FREAD | FWRITE, 0)) != 0)
		return error;

	vp = nd.ni_vp;
	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}