/*	$NetBSD: vfs_vnops.c,v 1.235 2022/08/06 21:21:10 riastradh Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.235 2022/08/06 21:21:10 riastradh Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
	    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*.
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a vnode in *ret_vp, or if that is NULL,
 * a file descriptor number in ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode.  Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
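/*
 * Illustrative sketch only (not a caller in this file; error handling
 * elided): a kernel consumer that wants a locked vnode for a path might
 * do roughly the following, and remains responsible for destroying the
 * pathbuf afterwards:
 *
 *	struct pathbuf *pb = pathbuf_create("/some/path");
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
 *	if (error == 0) {
 *		...use vp, which is returned locked...
 *		VOP_UNLOCK(vp);
 *		vn_close(vp, FREAD, curlwp->l_cred);
 *	}
 *	pathbuf_destroy(pb);
 *
 * This mirrors what vn_bdev_openpath() below does for block devices.
 */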
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
    int nmode, int fmode, int cmode,
    struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value.  So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
			}
		} else {
			vput(vp);
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
			    &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block.  (Besides handling
		 * ni_dvp, anyway.)
		 */
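		/*
		 * Added commentary: with NONEXCLHACK, namei() can
		 * return an existing ni_vp with ni_dvp == NULL, so
		 * there is no parent to release and (as asserted
		 * below) no O_EXCL to honor; treat it like a plain
		 * open of an existing file.
		 */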
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error)
		vput(vp);
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return ETXTBSY;
	return 0;
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}
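/*
 * Added commentary: VI_TEXT and an elevated v_writecount are mutually
 * exclusive.  vn_marktext() below fails with ETXTBSY while writers
 * exist, and vn_writechk() above fails with ETXTBSY once VI_TEXT is
 * set, so a file cannot simultaneously be executing and open for
 * writing.
 */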
/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return 0;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return ETXTBSY;
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return 0;
}

/*
 * Vnode close call.
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked vnode.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}
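/*
 * Illustrative sketch only (not a caller in this file): reading the
 * first bytes of a vnode from kernel space with vn_rdwr(), letting it
 * take the vnode lock itself:
 *
 *	char buf[512];
 *	size_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
 *	    UIO_SYSSPACE, 0, curlwp->l_cred, &resid, curlwp);
 *
 * On success, sizeof(buf) - resid bytes were read.
 */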
int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return EINVAL;
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes the behaviour for count = 0 as
			 * follows: "Before any action ... is taken, and if
			 * nbyte is zero and the file is a regular file, the
			 * write() function ... in the absence of errors ...
			 * shall return zero and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

out:
	VOP_UNLOCK(vp);
	return error;
}
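/*
 * Added commentary: the vn_read()/vn_write() routines above are not
 * called directly; they are reached through the vnops fileops table,
 * roughly as (*fp->f_ops->fo_read)(fp, &fp->f_offset, uio, cred,
 * FOF_UPDATE_OFFSET) from the generic read(2)/write(2) paths (an
 * assumption about the caller; see sys_generic.c).
 */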
/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return EBADF;
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}
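/*
 * Added commentary: note that st_blocks above is derived from va_bytes
 * and is reported in S_BLKSIZE (512-byte) units, independent of the
 * file system's preferred block size reported in st_blksize.
 */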
/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0)
				*(int *)data = vattr.va_size - fp->f_offset;
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return EINVAL;
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return EINVAL;
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return EPASSTHROUGH;
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return EOVERFLOW;
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}
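	/*
	 * Added commentary: maxprot starts out as VM_PROT_EXECUTE and
	 * accumulates VM_PROT_READ/VM_PROT_WRITE below based on how the
	 * descriptor was opened; it is the upper bound that later
	 * protection changes on the mapping (e.g. via mprotect(2)) may
	 * not exceed.
	 */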
	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * If the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable or append-only.
		 * Otherwise, if PROT_WRITE was asked for, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always be written to */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i :
			    (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}
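	/*
	 * Added commentary: the VV_MAPPED/VI_WRMAP flags above are
	 * deliberately tested unlocked first (safe while we hold a
	 * reference), so the vnode lock is only taken on the first
	 * mapping of a given kind; repeat mmaps stay cheap.
	 */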
#if NVERIEXEC > 0

	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable.  Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
	    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return ESPIPE;

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets. */
	oldoff = fp->f_offset;
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = EINVAL;
		goto out;
	}

	/* Pass the proposed change to the file system to audit. */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success! */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}

/*
 * Check that the vnode is still valid, and if so
 * acquire the requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}
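/*
 * Added commentary: per the assertions above, LK_RETRY guarantees that
 * vn_lock() succeeds (possibly on a vnode that has since been
 * reclaimed).  Without LK_RETRY, callers must be prepared for ENOENT
 * when the vnode has been reclaimed, and with LK_NOWAIT for EBUSY when
 * the lock is contended.
 */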
/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number.
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}
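/*
 * Illustrative sketch only (not a caller in this file): given a block
 * device number 'dev', open it for kernel use and later close it:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vn_bdev_open(dev, &vp, curlwp);
 *	if (error == 0) {
 *		...do I/O on vp, which is returned unlocked...
 *		vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
 *	}
 */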
/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int error;

	error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
	if (error != 0)
		return error;

	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}

static long
vn_knote_to_interest(const struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
		/*
		 * Writing to the file or changing its attributes can
		 * set the file size, which impacts the readability
		 * filter.
		 *
		 * (No need to set NOTE_EXTEND here; it's only ever
		 * sent with other hints; see vnode_if.c.)
		 */
		return NOTE_WRITE | NOTE_ATTRIB;

	case EVFILT_VNODE:
		return kn->kn_sfflags;

	case EVFILT_WRITE:
	default:
		return 0;
	}
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/*
	 * In the case of layered / stacked file systems, knotes
	 * should only ever be associated with the base vnode.
	 */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We maintain a bitmask of the kevents that there is interest in,
	 * to minimize the impact of having watchers.  It's silly to have
	 * to traverse vn_klist every time a read or write happens simply
	 * because there is someone interested in knowing when the file
	 * is deleted, for example.
	 */

	mutex_enter(vp->v_interlock);
	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
		interest |= vn_knote_to_interest(kn);
	}
	vk->vk_interest = interest;
	mutex_exit(vp->v_interlock);
}

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/* See above. */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We special-case removing the head of the list, because:
	 *
	 * 1. It's extremely likely that we're detaching the only
	 *    knote.
	 *
	 * 2. We're already traversing the whole list, so we don't
	 *    want to use the generic SLIST_REMOVE() which would
	 *    traverse it *again*.
	 */

	mutex_enter(vp->v_interlock);
	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
			interest |= vn_knote_to_interest(kn);
		}
		vk->vk_interest = interest;
	} else {
		struct knote *thiskn, *nextkn, *prevkn = NULL;

		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
			if (thiskn == kn) {
				KASSERT(kn != NULL);
				KASSERT(prevkn != NULL);
				SLIST_REMOVE_AFTER(prevkn, kn_selnext);
				kn = NULL;
			} else {
				interest |= vn_knote_to_interest(thiskn);
				prevkn = thiskn;
			}
		}
		vk->vk_interest = interest;
	}
	mutex_exit(vp->v_interlock);
}