/*	$NetBSD: vfs_vnops.c,v 1.244 2024/12/07 02:27:38 riastradh Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.244 2024/12/07 02:27:38 riastradh Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/verified_exec.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>

#include <miscfs/fifofs/fifo.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm_device.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
	    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
static int vn_truncate(file_t *, off_t);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
	.fo_advlock = vn_advlock,
	.fo_fpathconf = vn_fpathconf,
	.fo_posix_fadvise = vn_posix_fadvise,
	.fo_truncate = vn_truncate,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode.  Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
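/*
 * Usage sketch (illustrative; not part of the original source): a
 * kernel caller that wants a readable vnode builds a pathbuf, opens
 * it, and later closes it; compare vn_bdev_openpath() below, which
 * follows the same pattern.  The path name is an assumption of the
 * example, and error handling for pathbuf_create() is elided.
 *
 *	struct pathbuf *pb = pathbuf_create("/some/path");
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
 *	pathbuf_destroy(pb);
 *	if (error == 0) {
 *		...vp is returned locked; use it...
 *		VOP_UNLOCK(vp);
 *		...later, drop the open reference...
 *		vn_close(vp, FREAD, curlwp->l_cred);
 *	}
 */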
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
    int nmode, int fmode, int cmode,
    struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return SET_ERROR(EINVAL);

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return SET_ERROR(ENOMEM);
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value.  So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
				vp = NULL;
			}
		} else {
			vput(vp);
			vp = NULL;
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
			    &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = SET_ERROR(EEXIST);
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block.  (Besides handle
		 * ni_dvp, anyway.)
		 */
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = SET_ERROR(EOPNOTSUPP);
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = SET_ERROR(EFTYPE);
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error) {
		vput(vp);
		vp = NULL;
	}
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = SET_ERROR(EOPNOTSUPP);
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
380 */ 381 if (vp->v_iflag & VI_TEXT) 382 return SET_ERROR(ETXTBSY); 383 return 0; 384 } 385 386 int 387 vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags) 388 { 389 int permbits = 0; 390 int error; 391 392 if (vp->v_type == VNON || vp->v_type == VBAD) 393 return SET_ERROR(ENXIO); 394 395 if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR) 396 return SET_ERROR(ENOTDIR); 397 398 if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG) 399 return SET_ERROR(EFTYPE); 400 401 if ((fflags & FREAD) != 0) { 402 permbits = VREAD; 403 } 404 if ((fflags & FEXEC) != 0) { 405 permbits |= VEXEC; 406 } 407 if ((fflags & (FWRITE | O_TRUNC)) != 0) { 408 permbits |= VWRITE; 409 if (vp->v_type == VDIR) { 410 error = SET_ERROR(EISDIR); 411 goto bad; 412 } 413 error = vn_writechk(vp); 414 if (error != 0) 415 goto bad; 416 } 417 error = VOP_ACCESS(vp, permbits, cred); 418 bad: 419 return error; 420 } 421 422 /* 423 * Mark a vnode as having executable mappings. 424 */ 425 void 426 vn_markexec(struct vnode *vp) 427 { 428 429 if ((vp->v_iflag & VI_EXECMAP) != 0) { 430 /* Safe unlocked, as long as caller holds a reference. */ 431 return; 432 } 433 434 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 435 mutex_enter(vp->v_interlock); 436 if ((vp->v_iflag & VI_EXECMAP) == 0) { 437 cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages); 438 vp->v_iflag |= VI_EXECMAP; 439 } 440 mutex_exit(vp->v_interlock); 441 rw_exit(vp->v_uobj.vmobjlock); 442 } 443 444 /* 445 * Mark a vnode as being the text of a process. 446 * Fail if the vnode is currently writable. 447 */ 448 int 449 vn_marktext(struct vnode *vp) 450 { 451 452 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) { 453 /* Safe unlocked, as long as caller holds a reference. */ 454 return 0; 455 } 456 457 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 458 mutex_enter(vp->v_interlock); 459 if (vp->v_writecount != 0) { 460 KASSERT((vp->v_iflag & VI_TEXT) == 0); 461 mutex_exit(vp->v_interlock); 462 rw_exit(vp->v_uobj.vmobjlock); 463 return SET_ERROR(ETXTBSY); 464 } 465 if ((vp->v_iflag & VI_EXECMAP) == 0) { 466 cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages); 467 } 468 vp->v_iflag |= (VI_TEXT | VI_EXECMAP); 469 mutex_exit(vp->v_interlock); 470 rw_exit(vp->v_uobj.vmobjlock); 471 return 0; 472 } 473 474 /* 475 * Vnode close call 476 * 477 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node. 478 */ 479 int 480 vn_close(struct vnode *vp, int flags, kauth_cred_t cred) 481 { 482 int error; 483 484 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 485 if (flags & FWRITE) { 486 mutex_enter(vp->v_interlock); 487 KASSERT(vp->v_writecount > 0); 488 vp->v_writecount--; 489 mutex_exit(vp->v_interlock); 490 } 491 error = VOP_CLOSE(vp, flags, cred); 492 vput(vp); 493 return error; 494 } 495 496 static int 497 enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag) 498 { 499 struct lwp *l = curlwp; 500 off_t testoff; 501 502 if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG) 503 return 0; 504 505 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 506 if (ioflag & IO_APPEND) 507 testoff = vp->v_size; 508 else 509 testoff = uio->uio_offset; 510 511 if (testoff + uio->uio_resid > 512 l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 513 mutex_enter(&proc_lock); 514 psignal(l->l_proc, SIGXFSZ); 515 mutex_exit(&proc_lock); 516 return SET_ERROR(EFBIG); 517 } 518 519 return 0; 520 } 521 522 /* 523 * Package up an I/O request on a vnode into a uio and do it. 
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = SET_ERROR(EIO);

out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return SET_ERROR(EINVAL);
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mutex_enter(&fp->f_lock);
	auio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes behaviour for count = 0 as
			 * following: "Before any action ... is taken,
			 * and if nbyte is zero and the file is a
			 * regular file, the write() function ... in
			 * the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

out:
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return SET_ERROR(EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0) {
				if (vp->v_type == VDIR)
					mutex_enter(&fp->f_lock);
				*(int *)data = vattr.va_size - fp->f_offset;
				if (vp->v_type == VDIR)
					mutex_exit(&fp->f_lock);
			}
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return SET_ERROR(EINVAL);
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return SET_ERROR(EINVAL);
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag, kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return SET_ERROR(EPASSTHROUGH);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return SET_ERROR(ENODEV);
	}
	if (vp->v_type != VCHR && off < 0) {
		return SET_ERROR(EINVAL);
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return SET_ERROR(EOVERFLOW);
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return SET_ERROR(EOVERFLOW);
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n",
		    vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
		    p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return SET_ERROR(EACCES);
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * if the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable, append-only.
		 * otherwise, if we have asked for PROT_WRITE, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return SET_ERROR(EPERM);
			}
		} else if (prot & PROT_WRITE) {
			return SET_ERROR(EACCES);
		}
	} else {
		/* MAP_PRIVATE mappings can always write to */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return SET_ERROR(EACCES);
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i : (i & ~VM_PROT_WRITE),
			    off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return SET_ERROR(EINVAL);
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0
	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable.  Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
	    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return SET_ERROR(EPERM);
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp, int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return SET_ERROR(ESPIPE);

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets. */
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	oldoff = fp->f_offset;
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/* Pass the proposed change to the file system to audit. */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success! */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl, int flags)
{
	struct vnode *const vp = fp->f_vnode;

	if (fl->l_whence == SEEK_CUR) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		fl->l_start += fp->f_offset;
		VOP_UNLOCK(vp);
	}

	return VOP_ADVLOCK(vp, id, op, fl, flags);
}

static int
vn_fpathconf(struct file *fp, int name, register_t *retval)
{
	struct vnode *const vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_PATHCONF(vp, name, retval);
	VOP_UNLOCK(vp);

	return error;
}

static int
vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
	const off_t OFF_MAX = __type_max(off_t);
	struct vnode *vp = fp->f_vnode;
	off_t endoffset;
	int error;

	if (offset < 0) {
		return SET_ERROR(EINVAL);
	}
	if (len == 0) {
		endoffset = OFF_MAX;
	} else if (len > 0 && (OFF_MAX - offset) >= len) {
		endoffset = offset + len;
	} else {
		return SET_ERROR(EINVAL);
	}

	CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
	CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
	CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);

	switch (advice) {
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_DONTNEED:
		if (vp->v_type != VREG && vp->v_type != VBLK)
			return 0;
		break;
	}

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		/*
		 * We ignore offset and size.  Must lock the file to
		 * do this, as f_advice is sub-word sized.
		 */
		mutex_enter(&fp->f_lock);
		fp->f_advice = (u_char)advice;
		mutex_exit(&fp->f_lock);
		error = 0;
		break;

	case POSIX_FADV_WILLNEED:
		error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset);
		break;

	case POSIX_FADV_DONTNEED:
		/*
		 * Align the region to page boundaries as VOP_PUTPAGES expects
		 * by shrinking it.  We shrink instead of expand because we
		 * do not want to deactivate cache outside of the requested
		 * region.  It means that if the specified region is smaller
		 * than PAGE_SIZE, we do nothing.
		 */
		if (offset <= trunc_page(OFF_MAX) &&
		    round_page(offset) < trunc_page(endoffset)) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			error = VOP_PUTPAGES(vp,
			    round_page(offset), trunc_page(endoffset),
			    PGO_DEACTIVATE | PGO_CLEANIT);
		} else {
			error = 0;
		}
		break;

	case POSIX_FADV_NOREUSE:
		/* Not implemented yet. */
		error = 0;
		break;
	default:
		error = SET_ERROR(EINVAL);
		break;
	}

	return error;
}

static int
vn_truncate(file_t *fp, off_t length)
{
	struct vattr vattr;
	struct vnode *vp;
	int error = 0;

	if (length < 0)
		return SET_ERROR(EINVAL);

	if ((fp->f_flag & FWRITE) == 0)
		return SET_ERROR(EINVAL);
	vp = fp->f_vnode;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = SET_ERROR(EISDIR);
	else if ((error = vn_writechk(vp)) == 0) {
		vattr_null(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	VOP_UNLOCK(vp);

	return error;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
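/*
 * The usual pattern (as used throughout this file): take the lock,
 * perform one or more vnode operations, release the lock.
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	error = VOP_GETATTR(vp, &va, cred);
 *	VOP_UNLOCK(vp);
 *
 * With LK_RETRY the call cannot fail; without it, a dying vnode may
 * yield ENOENT, which the caller must be prepared to handle (see the
 * KASSERTs below).
 */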
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}
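
/*
 * Usage sketch (illustrative; not part of the original source):
 * fetching a named attribute into a fixed-size kernel buffer, with
 * the vnode not yet locked.  The namespace and attribute name are
 * assumptions of the example.
 *
 *	char buf[64];
 *	size_t buflen = sizeof(buf);
 *	int error;
 *
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "myattr", &buflen, buf, curlwp);
 *	if (error == 0)
 *		...buflen now holds the number of bytes read...
 */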

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
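/*
 * Usage sketch (illustrative; not part of the original source); the
 * device path is an assumption of the example, and error handling
 * for pathbuf_create() is elided.
 *
 *	struct pathbuf *pb = pathbuf_create("/dev/wd0a");
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vn_bdev_openpath(pb, &vp, curlwp);
 *	pathbuf_destroy(pb);
 *	if (error == 0)
 *		...vp is an opened, unlocked block device vnode...
 */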
1584 */ 1585 return NOTE_WRITE | NOTE_ATTRIB; 1586 1587 case EVFILT_VNODE: 1588 return kn->kn_sfflags; 1589 1590 case EVFILT_WRITE: 1591 default: 1592 return 0; 1593 } 1594 } 1595 1596 void 1597 vn_knote_attach(struct vnode *vp, struct knote *kn) 1598 { 1599 struct vnode_klist *vk = vp->v_klist; 1600 long interest = 0; 1601 1602 /* 1603 * In the case of layered / stacked file systems, knotes 1604 * should only ever be associated with the base vnode. 1605 */ 1606 KASSERT(kn->kn_hook == vp); 1607 KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist); 1608 1609 /* 1610 * We maintain a bitmask of the kevents that there is interest in, 1611 * to minimize the impact of having watchers. It's silly to have 1612 * to traverse vn_klist every time a read or write happens simply 1613 * because there is someone interested in knowing when the file 1614 * is deleted, for example. 1615 */ 1616 1617 mutex_enter(vp->v_interlock); 1618 SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext); 1619 SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) { 1620 interest |= vn_knote_to_interest(kn); 1621 } 1622 vk->vk_interest = interest; 1623 mutex_exit(vp->v_interlock); 1624 } 1625 1626 void 1627 vn_knote_detach(struct vnode *vp, struct knote *kn) 1628 { 1629 struct vnode_klist *vk = vp->v_klist; 1630 long interest = 0; 1631 1632 /* See above. */ 1633 KASSERT(kn->kn_hook == vp); 1634 KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist); 1635 1636 /* 1637 * We special case removing the head of the list, because: 1638 * 1639 * 1. It's extremely likely that we're detaching the only 1640 * knote. 1641 * 1642 * 2. We're already traversing the whole list, so we don't 1643 * want to use the generic SLIST_REMOVE() which would 1644 * traverse it *again*. 1645 */ 1646 1647 mutex_enter(vp->v_interlock); 1648 if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) { 1649 SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext); 1650 SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) { 1651 interest |= vn_knote_to_interest(kn); 1652 } 1653 vk->vk_interest = interest; 1654 } else { 1655 struct knote *thiskn, *nextkn, *prevkn = NULL; 1656 1657 SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) { 1658 if (thiskn == kn) { 1659 KASSERT(kn != NULL); 1660 KASSERT(prevkn != NULL); 1661 SLIST_REMOVE_AFTER(prevkn, kn_selnext); 1662 kn = NULL; 1663 } else { 1664 interest |= vn_knote_to_interest(thiskn); 1665 prevkn = thiskn; 1666 } 1667 } 1668 vk->vk_interest = interest; 1669 } 1670 mutex_exit(vp->v_interlock); 1671 } 1672