1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $ 40 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.118 2008/09/17 21:44:18 dillon Exp $ 41 */ 42 43 /* 44 * External virtual filesystem routines 45 */ 46 #include "opt_ddb.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/buf.h> 51 #include <sys/conf.h> 52 #include <sys/dirent.h> 53 #include <sys/domain.h> 54 #include <sys/eventhandler.h> 55 #include <sys/fcntl.h> 56 #include <sys/file.h> 57 #include <sys/kernel.h> 58 #include <sys/kthread.h> 59 #include <sys/malloc.h> 60 #include <sys/mbuf.h> 61 #include <sys/mount.h> 62 #include <sys/proc.h> 63 #include <sys/reboot.h> 64 #include <sys/socket.h> 65 #include <sys/stat.h> 66 #include <sys/sysctl.h> 67 #include <sys/syslog.h> 68 #include <sys/unistd.h> 69 #include <sys/vmmeter.h> 70 #include <sys/vnode.h> 71 72 #include <machine/limits.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_object.h> 76 #include <vm/vm_extern.h> 77 #include <vm/vm_kern.h> 78 #include <vm/pmap.h> 79 #include <vm/vm_map.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_pager.h> 82 #include <vm/vnode_pager.h> 83 #include <vm/vm_zone.h> 84 85 #include <sys/buf2.h> 86 #include <sys/thread2.h> 87 #include <sys/sysref2.h> 88 89 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 90 91 int numvnodes; 92 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); 93 int vfs_fastdev = 1; 94 SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, ""); 95 96 enum vtype iftovt_tab[16] = { 97 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 98 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 99 }; 100 int vttoif_tab[9] = { 101 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 102 S_IFSOCK, S_IFIFO, S_IFMT, 103 }; 104 105 static int reassignbufcalls; 106 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, 107 &reassignbufcalls, 0, ""); 108 static int reassignbufloops; 109 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, 110 &reassignbufloops, 0, ""); 111 static int reassignbufsortgood; 112 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, 113 &reassignbufsortgood, 0, ""); 114 static int reassignbufsortbad; 115 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, 116 &reassignbufsortbad, 0, ""); 117 static int reassignbufmethod = 1; 118 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, 119 &reassignbufmethod, 0, ""); 120 121 int nfs_mount_type = -1; 122 static struct lwkt_token spechash_token; 123 struct nfs_public nfs_pub; /* publicly exported FS */ 124 125 int desiredvnodes; 126 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 127 &desiredvnodes, 0, "Maximum number of vnodes"); 128 129 static void vfs_free_addrlist (struct netexport *nep); 130 static int vfs_free_netcred (struct radix_node *rn, void *w); 131 static int vfs_hang_addrlist (struct mount *mp, struct netexport *nep, 132 const struct export_args *argp); 133 134 extern int dev_ref_debug; 135 136 /* 137 * Red black tree functions 138 */ 139 static int rb_buf_compare(struct buf *b1, struct buf *b2); 140 RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, off_t, b_loffset); 141 RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, off_t, b_loffset); 142 143 static int 144 rb_buf_compare(struct buf *b1, struct buf *b2) 145 { 146 if (b1->b_loffset < b2->b_loffset) 147 return(-1); 148 if (b1->b_loffset > b2->b_loffset) 149 return(1); 150 return(0); 151 } 152 153 /* 154 * Returns non-zero if the vnode is a candidate for lazy msyncing. 155 */ 156 static __inline int 157 vshouldmsync(struct vnode *vp) 158 { 159 if (vp->v_auxrefs != 0 || vp->v_sysref.refcnt > 0) 160 return (0); /* other holders */ 161 if (vp->v_object && 162 (vp->v_object->ref_count || vp->v_object->resident_page_count)) { 163 return (0); 164 } 165 return (1); 166 } 167 168 /* 169 * Initialize the vnode management data structures. 170 * 171 * Called from vfsinit() 172 */ 173 void 174 vfs_subr_init(void) 175 { 176 /* 177 * Desiredvnodes is kern.maxvnodes. We want to scale it 178 * according to available system memory but we may also have 179 * to limit it based on available KVM, which is capped on 32 bit 180 * systems. 181 */ 182 desiredvnodes = min(maxproc + vmstats.v_page_count / 4, 183 KvaSize / (20 * 184 (sizeof(struct vm_object) + sizeof(struct vnode)))); 185 186 lwkt_token_init(&spechash_token); 187 } 188 189 /* 190 * Knob to control the precision of file timestamps: 191 * 192 * 0 = seconds only; nanoseconds zeroed. 193 * 1 = seconds and nanoseconds, accurate within 1/HZ. 194 * 2 = seconds and nanoseconds, truncated to microseconds. 195 * >=3 = seconds and nanoseconds, maximum precision. 196 */ 197 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 198 199 static int timestamp_precision = TSP_SEC; 200 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 201 ×tamp_precision, 0, ""); 202 203 /* 204 * Get a current timestamp. 205 * 206 * MPSAFE 207 */ 208 void 209 vfs_timestamp(struct timespec *tsp) 210 { 211 struct timeval tv; 212 213 switch (timestamp_precision) { 214 case TSP_SEC: 215 tsp->tv_sec = time_second; 216 tsp->tv_nsec = 0; 217 break; 218 case TSP_HZ: 219 getnanotime(tsp); 220 break; 221 case TSP_USEC: 222 microtime(&tv); 223 TIMEVAL_TO_TIMESPEC(&tv, tsp); 224 break; 225 case TSP_NSEC: 226 default: 227 nanotime(tsp); 228 break; 229 } 230 } 231 232 /* 233 * Set vnode attributes to VNOVAL 234 */ 235 void 236 vattr_null(struct vattr *vap) 237 { 238 vap->va_type = VNON; 239 vap->va_size = VNOVAL; 240 vap->va_bytes = VNOVAL; 241 vap->va_mode = VNOVAL; 242 vap->va_nlink = VNOVAL; 243 vap->va_uid = VNOVAL; 244 vap->va_gid = VNOVAL; 245 vap->va_fsid = VNOVAL; 246 vap->va_fileid = VNOVAL; 247 vap->va_blocksize = VNOVAL; 248 vap->va_rmajor = VNOVAL; 249 vap->va_rminor = VNOVAL; 250 vap->va_atime.tv_sec = VNOVAL; 251 vap->va_atime.tv_nsec = VNOVAL; 252 vap->va_mtime.tv_sec = VNOVAL; 253 vap->va_mtime.tv_nsec = VNOVAL; 254 vap->va_ctime.tv_sec = VNOVAL; 255 vap->va_ctime.tv_nsec = VNOVAL; 256 vap->va_flags = VNOVAL; 257 vap->va_gen = VNOVAL; 258 vap->va_vaflags = 0; 259 vap->va_fsmid = VNOVAL; 260 /* va_*_uuid fields are only valid if related flags are set */ 261 } 262 263 /* 264 * Flush out and invalidate all buffers associated with a vnode. 265 * 266 * vp must be locked. 267 */ 268 static int vinvalbuf_bp(struct buf *bp, void *data); 269 270 struct vinvalbuf_bp_info { 271 struct vnode *vp; 272 int slptimeo; 273 int lkflags; 274 int flags; 275 }; 276 277 void 278 vupdatefsmid(struct vnode *vp) 279 { 280 atomic_set_int(&vp->v_flag, VFSMID); 281 } 282 283 int 284 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 285 { 286 struct vinvalbuf_bp_info info; 287 vm_object_t object; 288 lwkt_tokref vlock; 289 int error; 290 291 lwkt_gettoken(&vlock, &vp->v_token); 292 293 /* 294 * If we are being asked to save, call fsync to ensure that the inode 295 * is updated. 296 */ 297 if (flags & V_SAVE) { 298 error = bio_track_wait(&vp->v_track_write, slpflag, slptimeo); 299 if (error) 300 goto done; 301 if (!RB_EMPTY(&vp->v_rbdirty_tree)) { 302 if ((error = VOP_FSYNC(vp, MNT_WAIT)) != 0) 303 goto done; 304 305 /* 306 * Dirty bufs may be left or generated via races 307 * in circumstances where vinvalbuf() is called on 308 * a vnode not undergoing reclamation. Only 309 * panic if we are trying to reclaim the vnode. 310 */ 311 if ((vp->v_flag & VRECLAIMED) && 312 (bio_track_active(&vp->v_track_write) || 313 !RB_EMPTY(&vp->v_rbdirty_tree))) { 314 panic("vinvalbuf: dirty bufs"); 315 } 316 } 317 } 318 info.slptimeo = slptimeo; 319 info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; 320 if (slpflag & PCATCH) 321 info.lkflags |= LK_PCATCH; 322 info.flags = flags; 323 info.vp = vp; 324 325 /* 326 * Flush the buffer cache until nothing is left. 327 */ 328 while (!RB_EMPTY(&vp->v_rbclean_tree) || 329 !RB_EMPTY(&vp->v_rbdirty_tree)) { 330 error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL, 331 vinvalbuf_bp, &info); 332 if (error == 0) { 333 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 334 vinvalbuf_bp, &info); 335 } 336 } 337 338 /* 339 * Wait for I/O completion. We may block in the pip code so we have 340 * to re-check. 341 */ 342 do { 343 bio_track_wait(&vp->v_track_write, 0, 0); 344 if ((object = vp->v_object) != NULL) { 345 while (object->paging_in_progress) 346 vm_object_pip_sleep(object, "vnvlbx"); 347 } 348 } while (bio_track_active(&vp->v_track_write)); 349 350 /* 351 * Destroy the copy in the VM cache, too. 352 */ 353 if ((object = vp->v_object) != NULL) { 354 vm_object_page_remove(object, 0, 0, 355 (flags & V_SAVE) ? TRUE : FALSE); 356 } 357 358 if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree)) 359 panic("vinvalbuf: flush failed"); 360 if (!RB_EMPTY(&vp->v_rbhash_tree)) 361 panic("vinvalbuf: flush failed, buffers still present"); 362 error = 0; 363 done: 364 lwkt_reltoken(&vlock); 365 return (error); 366 } 367 368 static int 369 vinvalbuf_bp(struct buf *bp, void *data) 370 { 371 struct vinvalbuf_bp_info *info = data; 372 int error; 373 374 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 375 error = BUF_TIMELOCK(bp, info->lkflags, 376 "vinvalbuf", info->slptimeo); 377 if (error == 0) { 378 BUF_UNLOCK(bp); 379 error = ENOLCK; 380 } 381 if (error == ENOLCK) 382 return(0); 383 return (-error); 384 } 385 386 KKASSERT(bp->b_vp == info->vp); 387 388 /* 389 * XXX Since there are no node locks for NFS, I 390 * believe there is a slight chance that a delayed 391 * write will occur while sleeping just above, so 392 * check for it. Note that vfs_bio_awrite expects 393 * buffers to reside on a queue, while bwrite() and 394 * brelse() do not. 395 * 396 * NOTE: NO B_LOCKED CHECK. Also no buf_checkwrite() 397 * check. This code will write out the buffer, period. 398 */ 399 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 400 (info->flags & V_SAVE)) { 401 if (bp->b_vp == info->vp) { 402 if (bp->b_flags & B_CLUSTEROK) { 403 vfs_bio_awrite(bp); 404 } else { 405 bremfree(bp); 406 bp->b_flags |= B_ASYNC; 407 bwrite(bp); 408 } 409 } else { 410 bremfree(bp); 411 bwrite(bp); 412 } 413 } else if (info->flags & V_SAVE) { 414 /* 415 * Cannot set B_NOCACHE on a clean buffer as this will 416 * destroy the VM backing store which might actually 417 * be dirty (and unsynchronized). 418 */ 419 bremfree(bp); 420 bp->b_flags |= (B_INVAL | B_RELBUF); 421 bp->b_flags &= ~B_ASYNC; 422 brelse(bp); 423 } else { 424 bremfree(bp); 425 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 426 bp->b_flags &= ~B_ASYNC; 427 brelse(bp); 428 } 429 return(0); 430 } 431 432 /* 433 * Truncate a file's buffer and pages to a specified length. This 434 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 435 * sync activity. 436 * 437 * The vnode must be locked. 438 */ 439 static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data); 440 static int vtruncbuf_bp_trunc(struct buf *bp, void *data); 441 static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data); 442 static int vtruncbuf_bp_metasync(struct buf *bp, void *data); 443 444 int 445 vtruncbuf(struct vnode *vp, off_t length, int blksize) 446 { 447 off_t truncloffset; 448 const char *filename; 449 lwkt_tokref vlock; 450 int count; 451 452 /* 453 * Round up to the *next* block, then destroy the buffers in question. 454 * Since we are only removing some of the buffers we must rely on the 455 * scan count to determine whether a loop is necessary. 456 */ 457 if ((count = (int)(length % blksize)) != 0) 458 truncloffset = length + (blksize - count); 459 else 460 truncloffset = length; 461 462 lwkt_gettoken(&vlock, &vp->v_token); 463 do { 464 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, 465 vtruncbuf_bp_trunc_cmp, 466 vtruncbuf_bp_trunc, &truncloffset); 467 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 468 vtruncbuf_bp_trunc_cmp, 469 vtruncbuf_bp_trunc, &truncloffset); 470 } while(count); 471 472 /* 473 * For safety, fsync any remaining metadata if the file is not being 474 * truncated to 0. Since the metadata does not represent the entire 475 * dirty list we have to rely on the hit count to ensure that we get 476 * all of it. 477 */ 478 if (length > 0) { 479 do { 480 count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 481 vtruncbuf_bp_metasync_cmp, 482 vtruncbuf_bp_metasync, vp); 483 } while (count); 484 } 485 486 /* 487 * Clean out any left over VM backing store. 488 * 489 * It is possible to have in-progress I/O from buffers that were 490 * not part of the truncation. This should not happen if we 491 * are truncating to 0-length. 492 */ 493 vnode_pager_setsize(vp, length); 494 bio_track_wait(&vp->v_track_write, 0, 0); 495 496 filename = TAILQ_FIRST(&vp->v_namecache) ? 497 TAILQ_FIRST(&vp->v_namecache)->nc_name : "?"; 498 499 /* 500 * Make sure no buffers were instantiated while we were trying 501 * to clean out the remaining VM pages. This could occur due 502 * to busy dirty VM pages being flushed out to disk. 503 */ 504 do { 505 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, 506 vtruncbuf_bp_trunc_cmp, 507 vtruncbuf_bp_trunc, &truncloffset); 508 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 509 vtruncbuf_bp_trunc_cmp, 510 vtruncbuf_bp_trunc, &truncloffset); 511 if (count) { 512 kprintf("Warning: vtruncbuf(): Had to re-clean %d " 513 "left over buffers in %s\n", count, filename); 514 } 515 } while(count); 516 517 lwkt_reltoken(&vlock); 518 519 return (0); 520 } 521 522 /* 523 * The callback buffer is beyond the new file EOF and must be destroyed. 524 * Note that the compare function must conform to the RB_SCAN's requirements. 525 */ 526 static 527 int 528 vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data) 529 { 530 if (bp->b_loffset >= *(off_t *)data) 531 return(0); 532 return(-1); 533 } 534 535 static 536 int 537 vtruncbuf_bp_trunc(struct buf *bp, void *data) 538 { 539 /* 540 * Do not try to use a buffer we cannot immediately lock, but sleep 541 * anyway to prevent a livelock. The code will loop until all buffers 542 * can be acted upon. 543 */ 544 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 545 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0) 546 BUF_UNLOCK(bp); 547 } else { 548 bremfree(bp); 549 bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE); 550 bp->b_flags &= ~B_ASYNC; 551 brelse(bp); 552 } 553 return(1); 554 } 555 556 /* 557 * Fsync all meta-data after truncating a file to be non-zero. Only metadata 558 * blocks (with a negative loffset) are scanned. 559 * Note that the compare function must conform to the RB_SCAN's requirements. 560 */ 561 static int 562 vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data) 563 { 564 if (bp->b_loffset < 0) 565 return(0); 566 return(1); 567 } 568 569 static int 570 vtruncbuf_bp_metasync(struct buf *bp, void *data) 571 { 572 struct vnode *vp = data; 573 574 if (bp->b_flags & B_DELWRI) { 575 /* 576 * Do not try to use a buffer we cannot immediately lock, 577 * but sleep anyway to prevent a livelock. The code will 578 * loop until all buffers can be acted upon. 579 */ 580 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 581 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0) 582 BUF_UNLOCK(bp); 583 } else { 584 bremfree(bp); 585 if (bp->b_vp == vp) { 586 bp->b_flags |= B_ASYNC; 587 } else { 588 bp->b_flags &= ~B_ASYNC; 589 } 590 bwrite(bp); 591 } 592 return(1); 593 } else { 594 return(0); 595 } 596 } 597 598 /* 599 * vfsync - implements a multipass fsync on a file which understands 600 * dependancies and meta-data. The passed vnode must be locked. The 601 * waitfor argument may be MNT_WAIT or MNT_NOWAIT, or MNT_LAZY. 602 * 603 * When fsyncing data asynchronously just do one consolidated pass starting 604 * with the most negative block number. This may not get all the data due 605 * to dependancies. 606 * 607 * When fsyncing data synchronously do a data pass, then a metadata pass, 608 * then do additional data+metadata passes to try to get all the data out. 609 */ 610 static int vfsync_wait_output(struct vnode *vp, 611 int (*waitoutput)(struct vnode *, struct thread *)); 612 static int vfsync_data_only_cmp(struct buf *bp, void *data); 613 static int vfsync_meta_only_cmp(struct buf *bp, void *data); 614 static int vfsync_lazy_range_cmp(struct buf *bp, void *data); 615 static int vfsync_bp(struct buf *bp, void *data); 616 617 struct vfsync_info { 618 struct vnode *vp; 619 int synchronous; 620 int syncdeps; 621 int lazycount; 622 int lazylimit; 623 int skippedbufs; 624 int (*checkdef)(struct buf *); 625 }; 626 627 int 628 vfsync(struct vnode *vp, int waitfor, int passes, 629 int (*checkdef)(struct buf *), 630 int (*waitoutput)(struct vnode *, struct thread *)) 631 { 632 struct vfsync_info info; 633 lwkt_tokref vlock; 634 int error; 635 636 bzero(&info, sizeof(info)); 637 info.vp = vp; 638 if ((info.checkdef = checkdef) == NULL) 639 info.syncdeps = 1; 640 641 lwkt_gettoken(&vlock, &vp->v_token); 642 643 switch(waitfor) { 644 case MNT_LAZY: 645 /* 646 * Lazy (filesystem syncer typ) Asynchronous plus limit the 647 * number of data (not meta) pages we try to flush to 1MB. 648 * A non-zero return means that lazy limit was reached. 649 */ 650 info.lazylimit = 1024 * 1024; 651 info.syncdeps = 1; 652 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 653 vfsync_lazy_range_cmp, vfsync_bp, &info); 654 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 655 vfsync_meta_only_cmp, vfsync_bp, &info); 656 if (error == 0) 657 vp->v_lazyw = 0; 658 else if (!RB_EMPTY(&vp->v_rbdirty_tree)) 659 vn_syncer_add_to_worklist(vp, 1); 660 error = 0; 661 break; 662 case MNT_NOWAIT: 663 /* 664 * Asynchronous. Do a data-only pass and a meta-only pass. 665 */ 666 info.syncdeps = 1; 667 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp, 668 vfsync_bp, &info); 669 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp, 670 vfsync_bp, &info); 671 error = 0; 672 break; 673 default: 674 /* 675 * Synchronous. Do a data-only pass, then a meta-data+data 676 * pass, then additional integrated passes to try to get 677 * all the dependancies flushed. 678 */ 679 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp, 680 vfsync_bp, &info); 681 error = vfsync_wait_output(vp, waitoutput); 682 if (error == 0) { 683 info.skippedbufs = 0; 684 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 685 vfsync_bp, &info); 686 error = vfsync_wait_output(vp, waitoutput); 687 if (info.skippedbufs) 688 kprintf("Warning: vfsync skipped %d dirty bufs in pass2!\n", info.skippedbufs); 689 } 690 while (error == 0 && passes > 0 && 691 !RB_EMPTY(&vp->v_rbdirty_tree) 692 ) { 693 if (--passes == 0) { 694 info.synchronous = 1; 695 info.syncdeps = 1; 696 } 697 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 698 vfsync_bp, &info); 699 if (error < 0) 700 error = -error; 701 info.syncdeps = 1; 702 if (error == 0) 703 error = vfsync_wait_output(vp, waitoutput); 704 } 705 break; 706 } 707 lwkt_reltoken(&vlock); 708 return(error); 709 } 710 711 static int 712 vfsync_wait_output(struct vnode *vp, 713 int (*waitoutput)(struct vnode *, struct thread *)) 714 { 715 int error; 716 717 error = bio_track_wait(&vp->v_track_write, 0, 0); 718 if (waitoutput) 719 error = waitoutput(vp, curthread); 720 return(error); 721 } 722 723 static int 724 vfsync_data_only_cmp(struct buf *bp, void *data) 725 { 726 if (bp->b_loffset < 0) 727 return(-1); 728 return(0); 729 } 730 731 static int 732 vfsync_meta_only_cmp(struct buf *bp, void *data) 733 { 734 if (bp->b_loffset < 0) 735 return(0); 736 return(1); 737 } 738 739 static int 740 vfsync_lazy_range_cmp(struct buf *bp, void *data) 741 { 742 struct vfsync_info *info = data; 743 if (bp->b_loffset < info->vp->v_lazyw) 744 return(-1); 745 return(0); 746 } 747 748 static int 749 vfsync_bp(struct buf *bp, void *data) 750 { 751 struct vfsync_info *info = data; 752 struct vnode *vp = info->vp; 753 int error; 754 755 /* 756 * if syncdeps is not set we do not try to write buffers which have 757 * dependancies. 758 */ 759 if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp)) 760 return(0); 761 762 /* 763 * Ignore buffers that we cannot immediately lock. XXX 764 */ 765 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 766 kprintf("Warning: vfsync_bp skipping dirty buffer %p\n", bp); 767 ++info->skippedbufs; 768 return(0); 769 } 770 if ((bp->b_flags & B_DELWRI) == 0) 771 panic("vfsync_bp: buffer not dirty"); 772 if (vp != bp->b_vp) 773 panic("vfsync_bp: buffer vp mismatch"); 774 775 /* 776 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer 777 * has been written but an additional handshake with the device 778 * is required before we can dispose of the buffer. We have no idea 779 * how to do this so we have to skip these buffers. 780 */ 781 if (bp->b_flags & B_NEEDCOMMIT) { 782 BUF_UNLOCK(bp); 783 return(0); 784 } 785 786 /* 787 * Ask bioops if it is ok to sync 788 */ 789 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) { 790 bremfree(bp); 791 brelse(bp); 792 return(0); 793 } 794 795 if (info->synchronous) { 796 /* 797 * Synchronous flushing. An error may be returned. 798 */ 799 bremfree(bp); 800 error = bwrite(bp); 801 } else { 802 /* 803 * Asynchronous flushing. A negative return value simply 804 * stops the scan and is not considered an error. We use 805 * this to support limited MNT_LAZY flushes. 806 */ 807 vp->v_lazyw = bp->b_loffset; 808 if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) { 809 info->lazycount += vfs_bio_awrite(bp); 810 } else { 811 info->lazycount += bp->b_bufsize; 812 bremfree(bp); 813 bawrite(bp); 814 } 815 if (info->lazylimit && info->lazycount >= info->lazylimit) 816 error = 1; 817 else 818 error = 0; 819 } 820 return(-error); 821 } 822 823 /* 824 * Associate a buffer with a vnode. 825 * 826 * MPSAFE 827 */ 828 int 829 bgetvp(struct vnode *vp, struct buf *bp) 830 { 831 lwkt_tokref vlock; 832 833 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 834 KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0); 835 836 /* 837 * Insert onto list for new vnode. 838 */ 839 lwkt_gettoken(&vlock, &vp->v_token); 840 if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) { 841 lwkt_reltoken(&vlock); 842 return (EEXIST); 843 } 844 bp->b_vp = vp; 845 bp->b_flags |= B_HASHED; 846 bp->b_flags |= B_VNCLEAN; 847 if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) 848 panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp); 849 vhold(vp); 850 lwkt_reltoken(&vlock); 851 return(0); 852 } 853 854 /* 855 * Disassociate a buffer from a vnode. 856 */ 857 void 858 brelvp(struct buf *bp) 859 { 860 struct vnode *vp; 861 lwkt_tokref vlock; 862 863 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 864 865 /* 866 * Delete from old vnode list, if on one. 867 */ 868 vp = bp->b_vp; 869 lwkt_gettoken(&vlock, &vp->v_token); 870 if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN)) { 871 if (bp->b_flags & B_VNDIRTY) 872 buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp); 873 else 874 buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp); 875 bp->b_flags &= ~(B_VNDIRTY | B_VNCLEAN); 876 } 877 if (bp->b_flags & B_HASHED) { 878 buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp); 879 bp->b_flags &= ~B_HASHED; 880 } 881 if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) { 882 vp->v_flag &= ~VONWORKLST; 883 LIST_REMOVE(vp, v_synclist); 884 } 885 bp->b_vp = NULL; 886 lwkt_reltoken(&vlock); 887 888 vdrop(vp); 889 } 890 891 /* 892 * Reassign the buffer to the proper clean/dirty list based on B_DELWRI. 893 * This routine is called when the state of the B_DELWRI bit is changed. 894 * 895 * MPSAFE 896 */ 897 void 898 reassignbuf(struct buf *bp) 899 { 900 struct vnode *vp = bp->b_vp; 901 lwkt_tokref vlock; 902 int delay; 903 904 KKASSERT(vp != NULL); 905 ++reassignbufcalls; 906 907 /* 908 * B_PAGING flagged buffers cannot be reassigned because their vp 909 * is not fully linked in. 910 */ 911 if (bp->b_flags & B_PAGING) 912 panic("cannot reassign paging buffer"); 913 914 lwkt_gettoken(&vlock, &vp->v_token); 915 if (bp->b_flags & B_DELWRI) { 916 /* 917 * Move to the dirty list, add the vnode to the worklist 918 */ 919 if (bp->b_flags & B_VNCLEAN) { 920 buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp); 921 bp->b_flags &= ~B_VNCLEAN; 922 } 923 if ((bp->b_flags & B_VNDIRTY) == 0) { 924 if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) { 925 panic("reassignbuf: dup lblk vp %p bp %p", 926 vp, bp); 927 } 928 bp->b_flags |= B_VNDIRTY; 929 } 930 if ((vp->v_flag & VONWORKLST) == 0) { 931 switch (vp->v_type) { 932 case VDIR: 933 delay = dirdelay; 934 break; 935 case VCHR: 936 case VBLK: 937 if (vp->v_rdev && 938 vp->v_rdev->si_mountpoint != NULL) { 939 delay = metadelay; 940 break; 941 } 942 /* fall through */ 943 default: 944 delay = filedelay; 945 } 946 vn_syncer_add_to_worklist(vp, delay); 947 } 948 } else { 949 /* 950 * Move to the clean list, remove the vnode from the worklist 951 * if no dirty blocks remain. 952 */ 953 if (bp->b_flags & B_VNDIRTY) { 954 buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp); 955 bp->b_flags &= ~B_VNDIRTY; 956 } 957 if ((bp->b_flags & B_VNCLEAN) == 0) { 958 if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) { 959 panic("reassignbuf: dup lblk vp %p bp %p", 960 vp, bp); 961 } 962 bp->b_flags |= B_VNCLEAN; 963 } 964 if ((vp->v_flag & VONWORKLST) && 965 RB_EMPTY(&vp->v_rbdirty_tree)) { 966 vp->v_flag &= ~VONWORKLST; 967 LIST_REMOVE(vp, v_synclist); 968 } 969 } 970 lwkt_reltoken(&vlock); 971 } 972 973 /* 974 * Create a vnode for a block device. 975 * Used for mounting the root file system. 976 */ 977 int 978 bdevvp(cdev_t dev, struct vnode **vpp) 979 { 980 struct vnode *vp; 981 struct vnode *nvp; 982 int error; 983 984 if (dev == NULL) { 985 *vpp = NULLVP; 986 return (ENXIO); 987 } 988 error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops_p, &nvp, 0, 0); 989 if (error) { 990 *vpp = NULLVP; 991 return (error); 992 } 993 vp = nvp; 994 vp->v_type = VCHR; 995 vp->v_umajor = dev->si_umajor; 996 vp->v_uminor = dev->si_uminor; 997 vx_unlock(vp); 998 *vpp = vp; 999 return (0); 1000 } 1001 1002 int 1003 v_associate_rdev(struct vnode *vp, cdev_t dev) 1004 { 1005 lwkt_tokref ilock; 1006 1007 if (dev == NULL) 1008 return(ENXIO); 1009 if (dev_is_good(dev) == 0) 1010 return(ENXIO); 1011 KKASSERT(vp->v_rdev == NULL); 1012 if (dev_ref_debug) 1013 kprintf("Z1"); 1014 vp->v_rdev = reference_dev(dev); 1015 lwkt_gettoken(&ilock, &spechash_token); 1016 SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_cdevnext); 1017 lwkt_reltoken(&ilock); 1018 return(0); 1019 } 1020 1021 void 1022 v_release_rdev(struct vnode *vp) 1023 { 1024 lwkt_tokref ilock; 1025 cdev_t dev; 1026 1027 if ((dev = vp->v_rdev) != NULL) { 1028 lwkt_gettoken(&ilock, &spechash_token); 1029 SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_cdevnext); 1030 vp->v_rdev = NULL; 1031 release_dev(dev); 1032 lwkt_reltoken(&ilock); 1033 } 1034 } 1035 1036 /* 1037 * Add a vnode to the alias list hung off the cdev_t. We only associate 1038 * the device number with the vnode. The actual device is not associated 1039 * until the vnode is opened (usually in spec_open()), and will be 1040 * disassociated on last close. 1041 */ 1042 void 1043 addaliasu(struct vnode *nvp, int x, int y) 1044 { 1045 if (nvp->v_type != VBLK && nvp->v_type != VCHR) 1046 panic("addaliasu on non-special vnode"); 1047 nvp->v_umajor = x; 1048 nvp->v_uminor = y; 1049 } 1050 1051 /* 1052 * Simple call that a filesystem can make to try to get rid of a 1053 * vnode. It will fail if anyone is referencing the vnode (including 1054 * the caller). 1055 * 1056 * The filesystem can check whether its in-memory inode structure still 1057 * references the vp on return. 1058 */ 1059 void 1060 vclean_unlocked(struct vnode *vp) 1061 { 1062 vx_get(vp); 1063 if (sysref_isactive(&vp->v_sysref) == 0) 1064 vgone_vxlocked(vp); 1065 vx_put(vp); 1066 } 1067 1068 /* 1069 * Disassociate a vnode from its underlying filesystem. 1070 * 1071 * The vnode must be VX locked and referenced. In all normal situations 1072 * there are no active references. If vclean_vxlocked() is called while 1073 * there are active references, the vnode is being ripped out and we have 1074 * to call VOP_CLOSE() as appropriate before we can reclaim it. 1075 */ 1076 void 1077 vclean_vxlocked(struct vnode *vp, int flags) 1078 { 1079 int active; 1080 int n; 1081 vm_object_t object; 1082 1083 /* 1084 * If the vnode has already been reclaimed we have nothing to do. 1085 */ 1086 if (vp->v_flag & VRECLAIMED) 1087 return; 1088 vp->v_flag |= VRECLAIMED; 1089 1090 /* 1091 * Scrap the vfs cache 1092 */ 1093 while (cache_inval_vp(vp, 0) != 0) { 1094 kprintf("Warning: vnode %p clean/cache_resolution race detected\n", vp); 1095 tsleep(vp, 0, "vclninv", 2); 1096 } 1097 1098 /* 1099 * Check to see if the vnode is in use. If so we have to reference it 1100 * before we clean it out so that its count cannot fall to zero and 1101 * generate a race against ourselves to recycle it. 1102 */ 1103 active = sysref_isactive(&vp->v_sysref); 1104 1105 /* 1106 * Clean out any buffers associated with the vnode and destroy its 1107 * object, if it has one. 1108 */ 1109 vinvalbuf(vp, V_SAVE, 0, 0); 1110 1111 /* 1112 * If purging an active vnode (typically during a forced unmount 1113 * or reboot), it must be closed and deactivated before being 1114 * reclaimed. This isn't really all that safe, but what can 1115 * we do? XXX. 1116 * 1117 * Note that neither of these routines unlocks the vnode. 1118 */ 1119 if (active && (flags & DOCLOSE)) { 1120 while ((n = vp->v_opencount) != 0) { 1121 if (vp->v_writecount) 1122 VOP_CLOSE(vp, FWRITE|FNONBLOCK); 1123 else 1124 VOP_CLOSE(vp, FNONBLOCK); 1125 if (vp->v_opencount == n) { 1126 kprintf("Warning: unable to force-close" 1127 " vnode %p\n", vp); 1128 break; 1129 } 1130 } 1131 } 1132 1133 /* 1134 * If the vnode has not been deactivated, deactivated it. Deactivation 1135 * can create new buffers and VM pages so we have to call vinvalbuf() 1136 * again to make sure they all get flushed. 1137 * 1138 * This can occur if a file with a link count of 0 needs to be 1139 * truncated. 1140 */ 1141 if ((vp->v_flag & VINACTIVE) == 0) { 1142 vp->v_flag |= VINACTIVE; 1143 VOP_INACTIVE(vp); 1144 vinvalbuf(vp, V_SAVE, 0, 0); 1145 } 1146 1147 /* 1148 * If the vnode has an object, destroy it. 1149 */ 1150 if ((object = vp->v_object) != NULL) { 1151 if (object->ref_count == 0) { 1152 if ((object->flags & OBJ_DEAD) == 0) 1153 vm_object_terminate(object); 1154 } else { 1155 vm_pager_deallocate(object); 1156 } 1157 vp->v_flag &= ~VOBJBUF; 1158 } 1159 KKASSERT((vp->v_flag & VOBJBUF) == 0); 1160 1161 /* 1162 * Reclaim the vnode. 1163 */ 1164 if (VOP_RECLAIM(vp)) 1165 panic("vclean: cannot reclaim"); 1166 1167 /* 1168 * Done with purge, notify sleepers of the grim news. 1169 */ 1170 vp->v_ops = &dead_vnode_vops_p; 1171 vn_pollgone(vp); 1172 vp->v_tag = VT_NON; 1173 1174 /* 1175 * If we are destroying an active vnode, reactivate it now that 1176 * we have reassociated it with deadfs. This prevents the system 1177 * from crashing on the vnode due to it being unexpectedly marked 1178 * as inactive or reclaimed. 1179 */ 1180 if (active && (flags & DOCLOSE)) { 1181 vp->v_flag &= ~(VINACTIVE|VRECLAIMED); 1182 } 1183 } 1184 1185 /* 1186 * Eliminate all activity associated with the requested vnode 1187 * and with all vnodes aliased to the requested vnode. 1188 * 1189 * The vnode must be referenced but should not be locked. 1190 */ 1191 int 1192 vrevoke(struct vnode *vp, struct ucred *cred) 1193 { 1194 struct vnode *vq; 1195 struct vnode *vqn; 1196 lwkt_tokref ilock; 1197 cdev_t dev; 1198 int error; 1199 1200 /* 1201 * If the vnode has a device association, scrap all vnodes associated 1202 * with the device. Don't let the device disappear on us while we 1203 * are scrapping the vnodes. 1204 * 1205 * The passed vp will probably show up in the list, do not VX lock 1206 * it twice! 1207 * 1208 * Releasing the vnode's rdev here can mess up specfs's call to 1209 * device close, so don't do it. The vnode has been disassociated 1210 * and the device will be closed after the last ref on the related 1211 * fp goes away (if not still open by e.g. the kernel). 1212 */ 1213 if (vp->v_type != VCHR) { 1214 error = fdrevoke(vp, DTYPE_VNODE, cred); 1215 return (error); 1216 } 1217 if ((dev = vp->v_rdev) == NULL) { 1218 if ((dev = get_dev(vp->v_umajor, vp->v_uminor)) == NULL) 1219 return(0); 1220 } 1221 reference_dev(dev); 1222 lwkt_gettoken(&ilock, &spechash_token); 1223 1224 vqn = SLIST_FIRST(&dev->si_hlist); 1225 if (vqn) 1226 vref(vqn); 1227 while ((vq = vqn) != NULL) { 1228 vqn = SLIST_NEXT(vqn, v_cdevnext); 1229 if (vqn) 1230 vref(vqn); 1231 fdrevoke(vq, DTYPE_VNODE, cred); 1232 /*v_release_rdev(vq);*/ 1233 vrele(vq); 1234 } 1235 lwkt_reltoken(&ilock); 1236 dev_drevoke(dev); 1237 release_dev(dev); 1238 return (0); 1239 } 1240 1241 /* 1242 * This is called when the object underlying a vnode is being destroyed, 1243 * such as in a remove(). Try to recycle the vnode immediately if the 1244 * only active reference is our reference. 1245 * 1246 * Directory vnodes in the namecache with children cannot be immediately 1247 * recycled because numerous VOP_N*() ops require them to be stable. 1248 */ 1249 int 1250 vrecycle(struct vnode *vp) 1251 { 1252 if (vp->v_sysref.refcnt <= 1) { 1253 if (cache_inval_vp_nonblock(vp)) 1254 return(0); 1255 vgone_vxlocked(vp); 1256 return (1); 1257 } 1258 return (0); 1259 } 1260 1261 /* 1262 * Return the maximum I/O size allowed for strategy calls on VP. 1263 * 1264 * If vp is VCHR or VBLK we dive the device, otherwise we use 1265 * the vp's mount info. 1266 */ 1267 int 1268 vmaxiosize(struct vnode *vp) 1269 { 1270 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1271 return(vp->v_rdev->si_iosize_max); 1272 } else { 1273 return(vp->v_mount->mnt_iosize_max); 1274 } 1275 } 1276 1277 /* 1278 * Eliminate all activity associated with a vnode in preparation for reuse. 1279 * 1280 * The vnode must be VX locked and refd and will remain VX locked and refd 1281 * on return. This routine may be called with the vnode in any state, as 1282 * long as it is VX locked. The vnode will be cleaned out and marked 1283 * VRECLAIMED but will not actually be reused until all existing refs and 1284 * holds go away. 1285 * 1286 * NOTE: This routine may be called on a vnode which has not yet been 1287 * already been deactivated (VOP_INACTIVE), or on a vnode which has 1288 * already been reclaimed. 1289 * 1290 * This routine is not responsible for placing us back on the freelist. 1291 * Instead, it happens automatically when the caller releases the VX lock 1292 * (assuming there aren't any other references). 1293 */ 1294 1295 void 1296 vgone_vxlocked(struct vnode *vp) 1297 { 1298 /* 1299 * assert that the VX lock is held. This is an absolute requirement 1300 * now for vgone_vxlocked() to be called. 1301 */ 1302 KKASSERT(vp->v_lock.lk_exclusivecount == 1); 1303 1304 /* 1305 * Clean out the filesystem specific data and set the VRECLAIMED 1306 * bit. Also deactivate the vnode if necessary. 1307 */ 1308 vclean_vxlocked(vp, DOCLOSE); 1309 1310 /* 1311 * Delete from old mount point vnode list, if on one. 1312 */ 1313 if (vp->v_mount != NULL) 1314 insmntque(vp, NULL); 1315 1316 /* 1317 * If special device, remove it from special device alias list 1318 * if it is on one. This should normally only occur if a vnode is 1319 * being revoked as the device should otherwise have been released 1320 * naturally. 1321 */ 1322 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) { 1323 v_release_rdev(vp); 1324 } 1325 1326 /* 1327 * Set us to VBAD 1328 */ 1329 vp->v_type = VBAD; 1330 } 1331 1332 /* 1333 * Lookup a vnode by device number. 1334 * 1335 * Returns non-zero and *vpp set to a vref'd vnode on success. 1336 * Returns zero on failure. 1337 */ 1338 int 1339 vfinddev(cdev_t dev, enum vtype type, struct vnode **vpp) 1340 { 1341 lwkt_tokref ilock; 1342 struct vnode *vp; 1343 1344 lwkt_gettoken(&ilock, &spechash_token); 1345 SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) { 1346 if (type == vp->v_type) { 1347 *vpp = vp; 1348 vref(vp); 1349 lwkt_reltoken(&ilock); 1350 return (1); 1351 } 1352 } 1353 lwkt_reltoken(&ilock); 1354 return (0); 1355 } 1356 1357 /* 1358 * Calculate the total number of references to a special device. This 1359 * routine may only be called for VBLK and VCHR vnodes since v_rdev is 1360 * an overloaded field. Since udev2dev can now return NULL, we have 1361 * to check for a NULL v_rdev. 1362 */ 1363 int 1364 count_dev(cdev_t dev) 1365 { 1366 lwkt_tokref ilock; 1367 struct vnode *vp; 1368 int count = 0; 1369 1370 if (SLIST_FIRST(&dev->si_hlist)) { 1371 lwkt_gettoken(&ilock, &spechash_token); 1372 SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) { 1373 if (vp->v_sysref.refcnt > 0) 1374 count += vp->v_sysref.refcnt; 1375 } 1376 lwkt_reltoken(&ilock); 1377 } 1378 return(count); 1379 } 1380 1381 int 1382 count_udev(int x, int y) 1383 { 1384 cdev_t dev; 1385 1386 if ((dev = get_dev(x, y)) == NULL) 1387 return(0); 1388 return(count_dev(dev)); 1389 } 1390 1391 int 1392 vcount(struct vnode *vp) 1393 { 1394 if (vp->v_rdev == NULL) 1395 return(0); 1396 return(count_dev(vp->v_rdev)); 1397 } 1398 1399 /* 1400 * Initialize VMIO for a vnode. This routine MUST be called before a 1401 * VFS can issue buffer cache ops on a vnode. It is typically called 1402 * when a vnode is initialized from its inode. 1403 */ 1404 int 1405 vinitvmio(struct vnode *vp, off_t filesize) 1406 { 1407 vm_object_t object; 1408 int error = 0; 1409 1410 retry: 1411 if ((object = vp->v_object) == NULL) { 1412 object = vnode_pager_alloc(vp, filesize, 0, 0); 1413 /* 1414 * Dereference the reference we just created. This assumes 1415 * that the object is associated with the vp. 1416 */ 1417 object->ref_count--; 1418 vrele(vp); 1419 } else { 1420 if (object->flags & OBJ_DEAD) { 1421 vn_unlock(vp); 1422 vm_object_dead_sleep(object, "vodead"); 1423 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1424 goto retry; 1425 } 1426 } 1427 KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object")); 1428 vp->v_flag |= VOBJBUF; 1429 return (error); 1430 } 1431 1432 1433 /* 1434 * Print out a description of a vnode. 1435 */ 1436 static char *typename[] = 1437 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 1438 1439 void 1440 vprint(char *label, struct vnode *vp) 1441 { 1442 char buf[96]; 1443 1444 if (label != NULL) 1445 kprintf("%s: %p: ", label, (void *)vp); 1446 else 1447 kprintf("%p: ", (void *)vp); 1448 kprintf("type %s, sysrefs %d, writecount %d, holdcnt %d,", 1449 typename[vp->v_type], 1450 vp->v_sysref.refcnt, vp->v_writecount, vp->v_auxrefs); 1451 buf[0] = '\0'; 1452 if (vp->v_flag & VROOT) 1453 strcat(buf, "|VROOT"); 1454 if (vp->v_flag & VPFSROOT) 1455 strcat(buf, "|VPFSROOT"); 1456 if (vp->v_flag & VTEXT) 1457 strcat(buf, "|VTEXT"); 1458 if (vp->v_flag & VSYSTEM) 1459 strcat(buf, "|VSYSTEM"); 1460 if (vp->v_flag & VFREE) 1461 strcat(buf, "|VFREE"); 1462 if (vp->v_flag & VOBJBUF) 1463 strcat(buf, "|VOBJBUF"); 1464 if (buf[0] != '\0') 1465 kprintf(" flags (%s)", &buf[1]); 1466 if (vp->v_data == NULL) { 1467 kprintf("\n"); 1468 } else { 1469 kprintf("\n\t"); 1470 VOP_PRINT(vp); 1471 } 1472 } 1473 1474 #ifdef DDB 1475 #include <ddb/ddb.h> 1476 1477 static int db_show_locked_vnodes(struct mount *mp, void *data); 1478 1479 /* 1480 * List all of the locked vnodes in the system. 1481 * Called when debugging the kernel. 1482 */ 1483 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 1484 { 1485 kprintf("Locked vnodes\n"); 1486 mountlist_scan(db_show_locked_vnodes, NULL, 1487 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 1488 } 1489 1490 static int 1491 db_show_locked_vnodes(struct mount *mp, void *data __unused) 1492 { 1493 struct vnode *vp; 1494 1495 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 1496 if (vn_islocked(vp)) 1497 vprint(NULL, vp); 1498 } 1499 return(0); 1500 } 1501 #endif 1502 1503 /* 1504 * Top level filesystem related information gathering. 1505 */ 1506 static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS); 1507 1508 static int 1509 vfs_sysctl(SYSCTL_HANDLER_ARGS) 1510 { 1511 int *name = (int *)arg1 - 1; /* XXX */ 1512 u_int namelen = arg2 + 1; /* XXX */ 1513 struct vfsconf *vfsp; 1514 int maxtypenum; 1515 1516 #if 1 || defined(COMPAT_PRELITE2) 1517 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 1518 if (namelen == 1) 1519 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 1520 #endif 1521 1522 #ifdef notyet 1523 /* all sysctl names at this level are at least name and field */ 1524 if (namelen < 2) 1525 return (ENOTDIR); /* overloaded */ 1526 if (name[0] != VFS_GENERIC) { 1527 vfsp = vfsconf_find_by_typenum(name[0]); 1528 if (vfsp == NULL) 1529 return (EOPNOTSUPP); 1530 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 1531 oldp, oldlenp, newp, newlen, p)); 1532 } 1533 #endif 1534 switch (name[1]) { 1535 case VFS_MAXTYPENUM: 1536 if (namelen != 2) 1537 return (ENOTDIR); 1538 maxtypenum = vfsconf_get_maxtypenum(); 1539 return (SYSCTL_OUT(req, &maxtypenum, sizeof(maxtypenum))); 1540 case VFS_CONF: 1541 if (namelen != 3) 1542 return (ENOTDIR); /* overloaded */ 1543 vfsp = vfsconf_find_by_typenum(name[2]); 1544 if (vfsp == NULL) 1545 return (EOPNOTSUPP); 1546 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 1547 } 1548 return (EOPNOTSUPP); 1549 } 1550 1551 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 1552 "Generic filesystem"); 1553 1554 #if 1 || defined(COMPAT_PRELITE2) 1555 1556 static int 1557 sysctl_ovfs_conf_iter(struct vfsconf *vfsp, void *data) 1558 { 1559 int error; 1560 struct ovfsconf ovfs; 1561 struct sysctl_req *req = (struct sysctl_req*) data; 1562 1563 bzero(&ovfs, sizeof(ovfs)); 1564 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 1565 strcpy(ovfs.vfc_name, vfsp->vfc_name); 1566 ovfs.vfc_index = vfsp->vfc_typenum; 1567 ovfs.vfc_refcount = vfsp->vfc_refcount; 1568 ovfs.vfc_flags = vfsp->vfc_flags; 1569 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 1570 if (error) 1571 return error; /* abort iteration with error code */ 1572 else 1573 return 0; /* continue iterating with next element */ 1574 } 1575 1576 static int 1577 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 1578 { 1579 return vfsconf_each(sysctl_ovfs_conf_iter, (void*)req); 1580 } 1581 1582 #endif /* 1 || COMPAT_PRELITE2 */ 1583 1584 /* 1585 * Check to see if a filesystem is mounted on a block device. 1586 */ 1587 int 1588 vfs_mountedon(struct vnode *vp) 1589 { 1590 cdev_t dev; 1591 1592 if ((dev = vp->v_rdev) == NULL) { 1593 if (vp->v_type != VBLK) 1594 dev = get_dev(vp->v_uminor, vp->v_umajor); 1595 } 1596 if (dev != NULL && dev->si_mountpoint) 1597 return (EBUSY); 1598 return (0); 1599 } 1600 1601 /* 1602 * Unmount all filesystems. The list is traversed in reverse order 1603 * of mounting to avoid dependencies. 1604 */ 1605 1606 static int vfs_umountall_callback(struct mount *mp, void *data); 1607 1608 void 1609 vfs_unmountall(void) 1610 { 1611 int count; 1612 1613 do { 1614 count = mountlist_scan(vfs_umountall_callback, 1615 NULL, MNTSCAN_REVERSE|MNTSCAN_NOBUSY); 1616 } while (count); 1617 } 1618 1619 static 1620 int 1621 vfs_umountall_callback(struct mount *mp, void *data) 1622 { 1623 int error; 1624 1625 error = dounmount(mp, MNT_FORCE); 1626 if (error) { 1627 mountlist_remove(mp); 1628 kprintf("unmount of filesystem mounted from %s failed (", 1629 mp->mnt_stat.f_mntfromname); 1630 if (error == EBUSY) 1631 kprintf("BUSY)\n"); 1632 else 1633 kprintf("%d)\n", error); 1634 } 1635 return(1); 1636 } 1637 1638 /* 1639 * Build hash lists of net addresses and hang them off the mount point. 1640 * Called by ufs_mount() to set up the lists of export addresses. 1641 */ 1642 static int 1643 vfs_hang_addrlist(struct mount *mp, struct netexport *nep, 1644 const struct export_args *argp) 1645 { 1646 struct netcred *np; 1647 struct radix_node_head *rnh; 1648 int i; 1649 struct radix_node *rn; 1650 struct sockaddr *saddr, *smask = 0; 1651 struct domain *dom; 1652 int error; 1653 1654 if (argp->ex_addrlen == 0) { 1655 if (mp->mnt_flag & MNT_DEFEXPORTED) 1656 return (EPERM); 1657 np = &nep->ne_defexported; 1658 np->netc_exflags = argp->ex_flags; 1659 np->netc_anon = argp->ex_anon; 1660 np->netc_anon.cr_ref = 1; 1661 mp->mnt_flag |= MNT_DEFEXPORTED; 1662 return (0); 1663 } 1664 1665 if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN) 1666 return (EINVAL); 1667 if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN) 1668 return (EINVAL); 1669 1670 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 1671 np = (struct netcred *) kmalloc(i, M_NETADDR, M_WAITOK | M_ZERO); 1672 saddr = (struct sockaddr *) (np + 1); 1673 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 1674 goto out; 1675 if (saddr->sa_len > argp->ex_addrlen) 1676 saddr->sa_len = argp->ex_addrlen; 1677 if (argp->ex_masklen) { 1678 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); 1679 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen); 1680 if (error) 1681 goto out; 1682 if (smask->sa_len > argp->ex_masklen) 1683 smask->sa_len = argp->ex_masklen; 1684 } 1685 i = saddr->sa_family; 1686 if ((rnh = nep->ne_rtable[i]) == 0) { 1687 /* 1688 * Seems silly to initialize every AF when most are not used, 1689 * do so on demand here 1690 */ 1691 SLIST_FOREACH(dom, &domains, dom_next) 1692 if (dom->dom_family == i && dom->dom_rtattach) { 1693 dom->dom_rtattach((void **) &nep->ne_rtable[i], 1694 dom->dom_rtoffset); 1695 break; 1696 } 1697 if ((rnh = nep->ne_rtable[i]) == 0) { 1698 error = ENOBUFS; 1699 goto out; 1700 } 1701 } 1702 rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh, 1703 np->netc_rnodes); 1704 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 1705 error = EPERM; 1706 goto out; 1707 } 1708 np->netc_exflags = argp->ex_flags; 1709 np->netc_anon = argp->ex_anon; 1710 np->netc_anon.cr_ref = 1; 1711 return (0); 1712 out: 1713 kfree(np, M_NETADDR); 1714 return (error); 1715 } 1716 1717 /* ARGSUSED */ 1718 static int 1719 vfs_free_netcred(struct radix_node *rn, void *w) 1720 { 1721 struct radix_node_head *rnh = (struct radix_node_head *) w; 1722 1723 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 1724 kfree((caddr_t) rn, M_NETADDR); 1725 return (0); 1726 } 1727 1728 /* 1729 * Free the net address hash lists that are hanging off the mount points. 1730 */ 1731 static void 1732 vfs_free_addrlist(struct netexport *nep) 1733 { 1734 int i; 1735 struct radix_node_head *rnh; 1736 1737 for (i = 0; i <= AF_MAX; i++) 1738 if ((rnh = nep->ne_rtable[i])) { 1739 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 1740 (caddr_t) rnh); 1741 kfree((caddr_t) rnh, M_RTABLE); 1742 nep->ne_rtable[i] = 0; 1743 } 1744 } 1745 1746 int 1747 vfs_export(struct mount *mp, struct netexport *nep, 1748 const struct export_args *argp) 1749 { 1750 int error; 1751 1752 if (argp->ex_flags & MNT_DELEXPORT) { 1753 if (mp->mnt_flag & MNT_EXPUBLIC) { 1754 vfs_setpublicfs(NULL, NULL, NULL); 1755 mp->mnt_flag &= ~MNT_EXPUBLIC; 1756 } 1757 vfs_free_addrlist(nep); 1758 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 1759 } 1760 if (argp->ex_flags & MNT_EXPORTED) { 1761 if (argp->ex_flags & MNT_EXPUBLIC) { 1762 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 1763 return (error); 1764 mp->mnt_flag |= MNT_EXPUBLIC; 1765 } 1766 if ((error = vfs_hang_addrlist(mp, nep, argp))) 1767 return (error); 1768 mp->mnt_flag |= MNT_EXPORTED; 1769 } 1770 return (0); 1771 } 1772 1773 1774 /* 1775 * Set the publicly exported filesystem (WebNFS). Currently, only 1776 * one public filesystem is possible in the spec (RFC 2054 and 2055) 1777 */ 1778 int 1779 vfs_setpublicfs(struct mount *mp, struct netexport *nep, 1780 const struct export_args *argp) 1781 { 1782 int error; 1783 struct vnode *rvp; 1784 char *cp; 1785 1786 /* 1787 * mp == NULL -> invalidate the current info, the FS is 1788 * no longer exported. May be called from either vfs_export 1789 * or unmount, so check if it hasn't already been done. 1790 */ 1791 if (mp == NULL) { 1792 if (nfs_pub.np_valid) { 1793 nfs_pub.np_valid = 0; 1794 if (nfs_pub.np_index != NULL) { 1795 FREE(nfs_pub.np_index, M_TEMP); 1796 nfs_pub.np_index = NULL; 1797 } 1798 } 1799 return (0); 1800 } 1801 1802 /* 1803 * Only one allowed at a time. 1804 */ 1805 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 1806 return (EBUSY); 1807 1808 /* 1809 * Get real filehandle for root of exported FS. 1810 */ 1811 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 1812 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 1813 1814 if ((error = VFS_ROOT(mp, &rvp))) 1815 return (error); 1816 1817 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 1818 return (error); 1819 1820 vput(rvp); 1821 1822 /* 1823 * If an indexfile was specified, pull it in. 1824 */ 1825 if (argp->ex_indexfile != NULL) { 1826 int namelen; 1827 1828 error = vn_get_namelen(rvp, &namelen); 1829 if (error) 1830 return (error); 1831 MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP, 1832 M_WAITOK); 1833 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 1834 namelen, NULL); 1835 if (!error) { 1836 /* 1837 * Check for illegal filenames. 1838 */ 1839 for (cp = nfs_pub.np_index; *cp; cp++) { 1840 if (*cp == '/') { 1841 error = EINVAL; 1842 break; 1843 } 1844 } 1845 } 1846 if (error) { 1847 FREE(nfs_pub.np_index, M_TEMP); 1848 return (error); 1849 } 1850 } 1851 1852 nfs_pub.np_mount = mp; 1853 nfs_pub.np_valid = 1; 1854 return (0); 1855 } 1856 1857 struct netcred * 1858 vfs_export_lookup(struct mount *mp, struct netexport *nep, 1859 struct sockaddr *nam) 1860 { 1861 struct netcred *np; 1862 struct radix_node_head *rnh; 1863 struct sockaddr *saddr; 1864 1865 np = NULL; 1866 if (mp->mnt_flag & MNT_EXPORTED) { 1867 /* 1868 * Lookup in the export list first. 1869 */ 1870 if (nam != NULL) { 1871 saddr = nam; 1872 rnh = nep->ne_rtable[saddr->sa_family]; 1873 if (rnh != NULL) { 1874 np = (struct netcred *) 1875 (*rnh->rnh_matchaddr)((char *)saddr, 1876 rnh); 1877 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 1878 np = NULL; 1879 } 1880 } 1881 /* 1882 * If no address match, use the default if it exists. 1883 */ 1884 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 1885 np = &nep->ne_defexported; 1886 } 1887 return (np); 1888 } 1889 1890 /* 1891 * perform msync on all vnodes under a mount point. The mount point must 1892 * be locked. This code is also responsible for lazy-freeing unreferenced 1893 * vnodes whos VM objects no longer contain pages. 1894 * 1895 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state. 1896 * 1897 * NOTE: XXX VOP_PUTPAGES and friends requires that the vnode be locked, 1898 * but vnode_pager_putpages() doesn't lock the vnode. We have to do it 1899 * way up in this high level function. 1900 */ 1901 static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data); 1902 static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data); 1903 1904 void 1905 vfs_msync(struct mount *mp, int flags) 1906 { 1907 int vmsc_flags; 1908 1909 vmsc_flags = VMSC_GETVP; 1910 if (flags != MNT_WAIT) 1911 vmsc_flags |= VMSC_NOWAIT; 1912 vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2, 1913 (void *)(intptr_t)flags); 1914 } 1915 1916 /* 1917 * scan1 is a fast pre-check. There could be hundreds of thousands of 1918 * vnodes, we cannot afford to do anything heavy weight until we have a 1919 * fairly good indication that there is work to do. 1920 */ 1921 static 1922 int 1923 vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data) 1924 { 1925 int flags = (int)(intptr_t)data; 1926 1927 if ((vp->v_flag & VRECLAIMED) == 0) { 1928 if (vshouldmsync(vp)) 1929 return(0); /* call scan2 */ 1930 if ((mp->mnt_flag & MNT_RDONLY) == 0 && 1931 (vp->v_flag & VOBJDIRTY) && 1932 (flags == MNT_WAIT || vn_islocked(vp) == 0)) { 1933 return(0); /* call scan2 */ 1934 } 1935 } 1936 1937 /* 1938 * do not call scan2, continue the loop 1939 */ 1940 return(-1); 1941 } 1942 1943 /* 1944 * This callback is handed a locked vnode. 1945 */ 1946 static 1947 int 1948 vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data) 1949 { 1950 vm_object_t obj; 1951 int flags = (int)(intptr_t)data; 1952 1953 if (vp->v_flag & VRECLAIMED) 1954 return(0); 1955 1956 if ((mp->mnt_flag & MNT_RDONLY) == 0 && (vp->v_flag & VOBJDIRTY)) { 1957 if ((obj = vp->v_object) != NULL) { 1958 vm_object_page_clean(obj, 0, 0, 1959 flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); 1960 } 1961 } 1962 return(0); 1963 } 1964 1965 /* 1966 * Record a process's interest in events which might happen to 1967 * a vnode. Because poll uses the historic select-style interface 1968 * internally, this routine serves as both the ``check for any 1969 * pending events'' and the ``record my interest in future events'' 1970 * functions. (These are done together, while the lock is held, 1971 * to avoid race conditions.) 1972 */ 1973 int 1974 vn_pollrecord(struct vnode *vp, int events) 1975 { 1976 lwkt_tokref vlock; 1977 1978 KKASSERT(curthread->td_proc != NULL); 1979 1980 lwkt_gettoken(&vlock, &vp->v_token); 1981 if (vp->v_pollinfo.vpi_revents & events) { 1982 /* 1983 * This leaves events we are not interested 1984 * in available for the other process which 1985 * which presumably had requested them 1986 * (otherwise they would never have been 1987 * recorded). 1988 */ 1989 events &= vp->v_pollinfo.vpi_revents; 1990 vp->v_pollinfo.vpi_revents &= ~events; 1991 1992 lwkt_reltoken(&vlock); 1993 return events; 1994 } 1995 vp->v_pollinfo.vpi_events |= events; 1996 selrecord(curthread, &vp->v_pollinfo.vpi_selinfo); 1997 lwkt_reltoken(&vlock); 1998 return 0; 1999 } 2000 2001 /* 2002 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2003 * it is possible for us to miss an event due to race conditions, but 2004 * that condition is expected to be rare, so for the moment it is the 2005 * preferred interface. 2006 */ 2007 void 2008 vn_pollevent(struct vnode *vp, int events) 2009 { 2010 lwkt_tokref vlock; 2011 2012 lwkt_gettoken(&vlock, &vp->v_token); 2013 if (vp->v_pollinfo.vpi_events & events) { 2014 /* 2015 * We clear vpi_events so that we don't 2016 * call selwakeup() twice if two events are 2017 * posted before the polling process(es) is 2018 * awakened. This also ensures that we take at 2019 * most one selwakeup() if the polling process 2020 * is no longer interested. However, it does 2021 * mean that only one event can be noticed at 2022 * a time. (Perhaps we should only clear those 2023 * event bits which we note?) XXX 2024 */ 2025 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ 2026 vp->v_pollinfo.vpi_revents |= events; 2027 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2028 } 2029 lwkt_reltoken(&vlock); 2030 } 2031 2032 /* 2033 * Wake up anyone polling on vp because it is being revoked. 2034 * This depends on dead_poll() returning POLLHUP for correct 2035 * behavior. 2036 */ 2037 void 2038 vn_pollgone(struct vnode *vp) 2039 { 2040 lwkt_tokref vlock; 2041 2042 lwkt_gettoken(&vlock, &vp->v_token); 2043 if (vp->v_pollinfo.vpi_events) { 2044 vp->v_pollinfo.vpi_events = 0; 2045 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2046 } 2047 lwkt_reltoken(&vlock); 2048 } 2049 2050 /* 2051 * extract the cdev_t from a VBLK or VCHR. The vnode must have been opened 2052 * (or v_rdev might be NULL). 2053 */ 2054 cdev_t 2055 vn_todev(struct vnode *vp) 2056 { 2057 if (vp->v_type != VBLK && vp->v_type != VCHR) 2058 return (NULL); 2059 KKASSERT(vp->v_rdev != NULL); 2060 return (vp->v_rdev); 2061 } 2062 2063 /* 2064 * Check if vnode represents a disk device. The vnode does not need to be 2065 * opened. 2066 * 2067 * MPALMOSTSAFE 2068 */ 2069 int 2070 vn_isdisk(struct vnode *vp, int *errp) 2071 { 2072 cdev_t dev; 2073 2074 if (vp->v_type != VCHR) { 2075 if (errp != NULL) 2076 *errp = ENOTBLK; 2077 return (0); 2078 } 2079 2080 if ((dev = vp->v_rdev) == NULL) { 2081 get_mplock(); 2082 dev = get_dev(vp->v_umajor, vp->v_uminor); 2083 rel_mplock(); 2084 } 2085 2086 if (dev == NULL) { 2087 if (errp != NULL) 2088 *errp = ENXIO; 2089 return (0); 2090 } 2091 if (dev_is_good(dev) == 0) { 2092 if (errp != NULL) 2093 *errp = ENXIO; 2094 return (0); 2095 } 2096 if ((dev_dflags(dev) & D_DISK) == 0) { 2097 if (errp != NULL) 2098 *errp = ENOTBLK; 2099 return (0); 2100 } 2101 if (errp != NULL) 2102 *errp = 0; 2103 return (1); 2104 } 2105 2106 int 2107 vn_get_namelen(struct vnode *vp, int *namelen) 2108 { 2109 int error; 2110 register_t retval[2]; 2111 2112 error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval); 2113 if (error) 2114 return (error); 2115 *namelen = (int)retval[0]; 2116 return (0); 2117 } 2118 2119 int 2120 vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type, 2121 uint16_t d_namlen, const char *d_name) 2122 { 2123 struct dirent *dp; 2124 size_t len; 2125 2126 len = _DIRENT_RECLEN(d_namlen); 2127 if (len > uio->uio_resid) 2128 return(1); 2129 2130 dp = kmalloc(len, M_TEMP, M_WAITOK | M_ZERO); 2131 2132 dp->d_ino = d_ino; 2133 dp->d_namlen = d_namlen; 2134 dp->d_type = d_type; 2135 bcopy(d_name, dp->d_name, d_namlen); 2136 2137 *error = uiomove((caddr_t)dp, len, uio); 2138 2139 kfree(dp, M_TEMP); 2140 2141 return(0); 2142 } 2143 2144 void 2145 vn_mark_atime(struct vnode *vp, struct thread *td) 2146 { 2147 struct proc *p = td->td_proc; 2148 struct ucred *cred = p ? p->p_ucred : proc0.p_ucred; 2149 2150 if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { 2151 VOP_MARKATIME(vp, cred); 2152 } 2153 } 2154