/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.19 2003/09/01 00:35:29 hmp Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque (struct vnode *vp, struct mount *mp);
static void	vclean (struct vnode *vp, int flags, struct thread *td);
static unsigned long	numvnodes;
static void	vlruvp(struct vnode *vp);
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
static int nameileafonly = 0;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);	/* mounted fs */
struct lwkt_token mountlist_token;
struct lwkt_token mntvnode_token;
int	nfs_mount_type = -1;
static struct lwkt_token mntid_token;
static struct lwkt_token vnode_free_list_token;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
	"VFS data synchronization delay");
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
	"File synchronization delay");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
	"Directory synchronization delay");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
	"VFS metadata synchronization delay");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
    "Number of times the vnlru process ran without success");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				       struct export_args *argp);

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + vmstats.v_page_count / 4;
	minvnodes = desiredvnodes / 4;
	lwkt_inittoken(&mntvnode_token);
	lwkt_inittoken(&mntid_token);
	lwkt_inittoken(&spechash_token);
	TAILQ_INIT(&vnode_free_list);
	lwkt_inittoken(&vnode_free_list_token);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
		&syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags, struct lwkt_token *interlkp,
	struct thread *td)
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			lwkt_reltoken(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
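		 *
		 * For example, vnlru_proc() below calls vfs_busy() with
		 * LK_NOWAIT while holding mountlist_token and simply skips
		 * any mount that is in the middle of being unmounted.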
		 */
		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
		if (interlkp) {
			lwkt_gettoken(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp, struct thread *td)
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
	struct thread *td = curthread;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, td);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) (void);
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	lwkt_gettoken(&mountlist_token);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			lwkt_reltoken(&mountlist_token);
			return (mp);
		}
	}
	lwkt_reltoken(&mountlist_token);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.
We end up with unique val[0]'s for the 344 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 345 * 346 * Keep in mind that several mounts may be running in parallel. Starting 347 * the search one past where the previous search terminated is both a 348 * micro-optimization and a defense against returning the same fsid to 349 * different mounts. 350 */ 351 void 352 vfs_getnewfsid(mp) 353 struct mount *mp; 354 { 355 static u_int16_t mntid_base; 356 fsid_t tfsid; 357 int mtype; 358 359 lwkt_gettoken(&mntid_token); 360 mtype = mp->mnt_vfc->vfc_typenum; 361 tfsid.val[1] = mtype; 362 mtype = (mtype & 0xFF) << 24; 363 for (;;) { 364 tfsid.val[0] = makeudev(255, 365 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 366 mntid_base++; 367 if (vfs_getvfs(&tfsid) == NULL) 368 break; 369 } 370 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 371 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 372 lwkt_reltoken(&mntid_token); 373 } 374 375 /* 376 * Knob to control the precision of file timestamps: 377 * 378 * 0 = seconds only; nanoseconds zeroed. 379 * 1 = seconds and nanoseconds, accurate within 1/HZ. 380 * 2 = seconds and nanoseconds, truncated to microseconds. 381 * >=3 = seconds and nanoseconds, maximum precision. 382 */ 383 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 384 385 static int timestamp_precision = TSP_SEC; 386 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 387 ×tamp_precision, 0, ""); 388 389 /* 390 * Get a current timestamp. 391 */ 392 void 393 vfs_timestamp(tsp) 394 struct timespec *tsp; 395 { 396 struct timeval tv; 397 398 switch (timestamp_precision) { 399 case TSP_SEC: 400 tsp->tv_sec = time_second; 401 tsp->tv_nsec = 0; 402 break; 403 case TSP_HZ: 404 getnanotime(tsp); 405 break; 406 case TSP_USEC: 407 microtime(&tv); 408 TIMEVAL_TO_TIMESPEC(&tv, tsp); 409 break; 410 case TSP_NSEC: 411 default: 412 nanotime(tsp); 413 break; 414 } 415 } 416 417 /* 418 * Set vnode attributes to VNOVAL 419 */ 420 void 421 vattr_null(vap) 422 struct vattr *vap; 423 { 424 425 vap->va_type = VNON; 426 vap->va_size = VNOVAL; 427 vap->va_bytes = VNOVAL; 428 vap->va_mode = VNOVAL; 429 vap->va_nlink = VNOVAL; 430 vap->va_uid = VNOVAL; 431 vap->va_gid = VNOVAL; 432 vap->va_fsid = VNOVAL; 433 vap->va_fileid = VNOVAL; 434 vap->va_blocksize = VNOVAL; 435 vap->va_rdev = VNOVAL; 436 vap->va_atime.tv_sec = VNOVAL; 437 vap->va_atime.tv_nsec = VNOVAL; 438 vap->va_mtime.tv_sec = VNOVAL; 439 vap->va_mtime.tv_nsec = VNOVAL; 440 vap->va_ctime.tv_sec = VNOVAL; 441 vap->va_ctime.tv_nsec = VNOVAL; 442 vap->va_flags = VNOVAL; 443 vap->va_gen = VNOVAL; 444 vap->va_vaflags = 0; 445 } 446 447 /* 448 * This routine is called when we have too many vnodes. It attempts 449 * to free <count> vnodes and will potentially free vnodes that still 450 * have VM backing store (VM backing store is typically the cause 451 * of a vnode blowout so we want to do this). Therefore, this operation 452 * is not considered cheap. 453 * 454 * A number of conditions may prevent a vnode from being reclaimed. 455 * the buffer cache may have references on the vnode, a directory 456 * vnode may still have references due to the namei cache representing 457 * underlying files, or the vnode may be in active use. It is not 458 * desireable to reuse such vnodes. These conditions may cause the 459 * number of vnodes to reach some minimum value regardless of what 460 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 
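 *
 * Rough illustration of the trigger computed below (numbers are
 * hypothetical): with vmstats.v_page_count at 1000000 pages and
 * desiredvnodes (kern.maxvnodes) at 100000, trigger is
 * 1000000 * 2 / 100000 = 20, so vnodes whose VM object still holds
 * 20 or more resident pages are skipped by this pass.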
461 */ 462 static int 463 vlrureclaim(struct mount *mp) 464 { 465 struct vnode *vp; 466 int done; 467 int trigger; 468 int usevnodes; 469 int count; 470 int gen; 471 472 /* 473 * Calculate the trigger point, don't allow user 474 * screwups to blow us up. This prevents us from 475 * recycling vnodes with lots of resident pages. We 476 * aren't trying to free memory, we are trying to 477 * free vnodes. 478 */ 479 usevnodes = desiredvnodes; 480 if (usevnodes <= 0) 481 usevnodes = 1; 482 trigger = vmstats.v_page_count * 2 / usevnodes; 483 484 done = 0; 485 gen = lwkt_gettoken(&mntvnode_token); 486 count = mp->mnt_nvnodelistsize / 10 + 1; 487 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { 488 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 489 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 490 491 if (vp->v_type != VNON && 492 vp->v_type != VBAD && 493 VMIGHTFREE(vp) && /* critical path opt */ 494 (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) 495 ) { 496 lwkt_gettoken(&vp->v_interlock); 497 if (lwkt_gentoken(&mntvnode_token, &gen) == 0) { 498 if (VMIGHTFREE(vp)) { 499 vgonel(vp, curthread); 500 done++; 501 } else { 502 lwkt_reltoken(&vp->v_interlock); 503 } 504 } else { 505 lwkt_reltoken(&vp->v_interlock); 506 } 507 } 508 --count; 509 } 510 lwkt_reltoken(&mntvnode_token); 511 return done; 512 } 513 514 /* 515 * Attempt to recycle vnodes in a context that is always safe to block. 516 * Calling vlrurecycle() from the bowels of file system code has some 517 * interesting deadlock problems. 518 */ 519 static struct thread *vnlruthread; 520 static int vnlruproc_sig; 521 522 static void 523 vnlru_proc(void) 524 { 525 struct mount *mp, *nmp; 526 int s; 527 int done; 528 struct thread *td = curthread; 529 530 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td, 531 SHUTDOWN_PRI_FIRST); 532 533 s = splbio(); 534 for (;;) { 535 kproc_suspend_loop(); 536 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { 537 vnlruproc_sig = 0; 538 wakeup(&vnlruproc_sig); 539 tsleep(td, 0, "vlruwt", hz); 540 continue; 541 } 542 done = 0; 543 lwkt_gettoken(&mountlist_token); 544 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 545 if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td)) { 546 nmp = TAILQ_NEXT(mp, mnt_list); 547 continue; 548 } 549 done += vlrureclaim(mp); 550 lwkt_gettoken(&mountlist_token); 551 nmp = TAILQ_NEXT(mp, mnt_list); 552 vfs_unbusy(mp, td); 553 } 554 lwkt_reltoken(&mountlist_token); 555 if (done == 0) { 556 vnlru_nowhere++; 557 tsleep(td, 0, "vlrup", hz * 3); 558 } 559 } 560 splx(s); 561 } 562 563 static struct kproc_desc vnlru_kp = { 564 "vnlru", 565 vnlru_proc, 566 &vnlruthread 567 }; 568 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) 569 570 /* 571 * Routines having to do with the management of the vnode table. 572 */ 573 extern vop_t **dead_vnodeop_p; 574 575 /* 576 * Return the next vnode from the free list. 577 */ 578 int 579 getnewvnode(tag, mp, vops, vpp) 580 enum vtagtype tag; 581 struct mount *mp; 582 vop_t **vops; 583 struct vnode **vpp; 584 { 585 int s; 586 int gen; 587 int vgen; 588 struct thread *td = curthread; /* XXX */ 589 struct vnode *vp = NULL; 590 vm_object_t object; 591 592 s = splbio(); 593 594 /* 595 * Try to reuse vnodes if we hit the max. This situation only 596 * occurs in certain large-memory (2G+) situations. We cannot 597 * attempt to directly reclaim vnodes due to nasty recursion 598 * problems. 
599 */ 600 while (numvnodes - freevnodes > desiredvnodes) { 601 if (vnlruproc_sig == 0) { 602 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 603 wakeup(vnlruthread); 604 } 605 tsleep(&vnlruproc_sig, 0, "vlruwk", hz); 606 } 607 608 609 /* 610 * Attempt to reuse a vnode already on the free list, allocating 611 * a new vnode if we can't find one or if we have not reached a 612 * good minimum for good LRU performance. 613 */ 614 gen = lwkt_gettoken(&vnode_free_list_token); 615 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { 616 int count; 617 618 for (count = 0; count < freevnodes; count++) { 619 vp = TAILQ_FIRST(&vnode_free_list); 620 if (vp == NULL || vp->v_usecount) 621 panic("getnewvnode: free vnode isn't"); 622 623 /* 624 * Get the vnode's interlock, then re-obtain 625 * vnode_free_list_token in case we lost it. If we 626 * did lose it while getting the vnode interlock, 627 * even if we got it back again, then retry. 628 */ 629 vgen = lwkt_gettoken(&vp->v_interlock); 630 if (lwkt_gentoken(&vnode_free_list_token, &gen) != 0) { 631 --count; 632 lwkt_reltoken(&vp->v_interlock); 633 vp = NULL; 634 continue; 635 } 636 637 /* 638 * Whew! We have both tokens. Since we didn't lose 639 * the free list VFREE had better still be set. But 640 * we aren't out of the woods yet. We have to get 641 * the object (may block). If the vnode is not 642 * suitable then move it to the end of the list 643 * if we can. If we can't move it to the end of the 644 * list retry again. 645 */ 646 if ((VOP_GETVOBJECT(vp, &object) == 0 && 647 (object->resident_page_count || object->ref_count)) 648 ) { 649 if (lwkt_gentoken(&vp->v_interlock, &vgen) == 0 && 650 lwkt_gentoken(&vnode_free_list_token, &gen) == 0 651 ) { 652 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 653 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 654 } else { 655 --count; 656 } 657 lwkt_reltoken(&vp->v_interlock); 658 vp = NULL; 659 continue; 660 } 661 662 /* 663 * Still not out of the woods. VOBJECT might have 664 * blocked, if we did not retain our tokens we have 665 * to retry. 666 */ 667 if (lwkt_gentoken(&vp->v_interlock, &vgen) != 0 || 668 lwkt_gentoken(&vnode_free_list_token, &gen) != 0) { 669 --count; 670 vp = NULL; 671 continue; 672 } 673 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 674 KKASSERT(vp->v_flag & VFREE); 675 676 if (LIST_FIRST(&vp->v_cache_src)) { 677 /* 678 * note: nameileafonly sysctl is temporary, 679 * for debugging only, and will eventually be 680 * removed. 681 */ 682 if (nameileafonly > 0) { 683 /* 684 * Do not reuse namei-cached directory 685 * vnodes that have cached 686 * subdirectories. 687 */ 688 if (cache_leaf_test(vp) < 0) { 689 lwkt_reltoken(&vp->v_interlock); 690 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 691 vp = NULL; 692 continue; 693 } 694 } else if (nameileafonly < 0 || 695 vmiodirenable == 0) { 696 /* 697 * Do not reuse namei-cached directory 698 * vnodes if nameileafonly is -1 or 699 * if VMIO backing for directories is 700 * turned off (otherwise we reuse them 701 * too quickly). 
702 */ 703 lwkt_reltoken(&vp->v_interlock); 704 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 705 vp = NULL; 706 continue; 707 } 708 } 709 break; 710 } 711 } 712 713 if (vp) { 714 vp->v_flag |= VDOOMED; 715 vp->v_flag &= ~VFREE; 716 freevnodes--; 717 lwkt_reltoken(&vnode_free_list_token); 718 cache_purge(vp); /* YYY may block */ 719 vp->v_lease = NULL; 720 if (vp->v_type != VBAD) { 721 vgonel(vp, td); 722 } else { 723 lwkt_reltoken(&vp->v_interlock); 724 } 725 726 #ifdef INVARIANTS 727 { 728 int s; 729 730 if (vp->v_data) 731 panic("cleaned vnode isn't"); 732 s = splbio(); 733 if (vp->v_numoutput) 734 panic("Clean vnode has pending I/O's"); 735 splx(s); 736 } 737 #endif 738 vp->v_flag = 0; 739 vp->v_lastw = 0; 740 vp->v_lasta = 0; 741 vp->v_cstart = 0; 742 vp->v_clen = 0; 743 vp->v_socket = 0; 744 vp->v_writecount = 0; /* XXX */ 745 } else { 746 lwkt_reltoken(&vnode_free_list_token); 747 vp = (struct vnode *) zalloc(vnode_zone); 748 bzero((char *) vp, sizeof *vp); 749 lwkt_inittoken(&vp->v_interlock); 750 vp->v_dd = vp; 751 cache_purge(vp); 752 LIST_INIT(&vp->v_cache_src); 753 TAILQ_INIT(&vp->v_cache_dst); 754 numvnodes++; 755 } 756 757 TAILQ_INIT(&vp->v_cleanblkhd); 758 TAILQ_INIT(&vp->v_dirtyblkhd); 759 vp->v_type = VNON; 760 vp->v_tag = tag; 761 vp->v_op = vops; 762 insmntque(vp, mp); 763 *vpp = vp; 764 vp->v_usecount = 1; 765 vp->v_data = 0; 766 splx(s); 767 768 vfs_object_create(vp, td); 769 return (0); 770 } 771 772 /* 773 * Move a vnode from one mount queue to another. 774 */ 775 static void 776 insmntque(vp, mp) 777 struct vnode *vp; 778 struct mount *mp; 779 { 780 781 lwkt_gettoken(&mntvnode_token); 782 /* 783 * Delete from old mount point vnode list, if on one. 784 */ 785 if (vp->v_mount != NULL) { 786 KASSERT(vp->v_mount->mnt_nvnodelistsize > 0, 787 ("bad mount point vnode list size")); 788 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); 789 vp->v_mount->mnt_nvnodelistsize--; 790 } 791 /* 792 * Insert into list of vnodes for the new mount point, if available. 793 */ 794 if ((vp->v_mount = mp) == NULL) { 795 lwkt_reltoken(&mntvnode_token); 796 return; 797 } 798 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 799 mp->mnt_nvnodelistsize++; 800 lwkt_reltoken(&mntvnode_token); 801 } 802 803 /* 804 * Update outstanding I/O count and do wakeup if requested. 805 */ 806 void 807 vwakeup(bp) 808 struct buf *bp; 809 { 810 struct vnode *vp; 811 812 bp->b_flags &= ~B_WRITEINPROG; 813 if ((vp = bp->b_vp)) { 814 vp->v_numoutput--; 815 if (vp->v_numoutput < 0) 816 panic("vwakeup: neg numoutput"); 817 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { 818 vp->v_flag &= ~VBWAIT; 819 wakeup((caddr_t) &vp->v_numoutput); 820 } 821 } 822 } 823 824 /* 825 * Flush out and invalidate all buffers associated with a vnode. 826 * Called with the underlying object locked. 
827 */ 828 int 829 vinvalbuf(struct vnode *vp, int flags, struct thread *td, 830 int slpflag, int slptimeo) 831 { 832 struct buf *bp; 833 struct buf *nbp, *blist; 834 int s, error; 835 vm_object_t object; 836 837 if (flags & V_SAVE) { 838 s = splbio(); 839 while (vp->v_numoutput) { 840 vp->v_flag |= VBWAIT; 841 error = tsleep((caddr_t)&vp->v_numoutput, 842 slpflag, "vinvlbuf", slptimeo); 843 if (error) { 844 splx(s); 845 return (error); 846 } 847 } 848 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 849 splx(s); 850 if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0) 851 return (error); 852 s = splbio(); 853 if (vp->v_numoutput > 0 || 854 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 855 panic("vinvalbuf: dirty bufs"); 856 } 857 splx(s); 858 } 859 s = splbio(); 860 for (;;) { 861 blist = TAILQ_FIRST(&vp->v_cleanblkhd); 862 if (!blist) 863 blist = TAILQ_FIRST(&vp->v_dirtyblkhd); 864 if (!blist) 865 break; 866 867 for (bp = blist; bp; bp = nbp) { 868 nbp = TAILQ_NEXT(bp, b_vnbufs); 869 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 870 error = BUF_TIMELOCK(bp, 871 LK_EXCLUSIVE | LK_SLEEPFAIL, 872 "vinvalbuf", slpflag, slptimeo); 873 if (error == ENOLCK) 874 break; 875 splx(s); 876 return (error); 877 } 878 /* 879 * XXX Since there are no node locks for NFS, I 880 * believe there is a slight chance that a delayed 881 * write will occur while sleeping just above, so 882 * check for it. Note that vfs_bio_awrite expects 883 * buffers to reside on a queue, while VOP_BWRITE and 884 * brelse do not. 885 */ 886 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 887 (flags & V_SAVE)) { 888 889 if (bp->b_vp == vp) { 890 if (bp->b_flags & B_CLUSTEROK) { 891 BUF_UNLOCK(bp); 892 vfs_bio_awrite(bp); 893 } else { 894 bremfree(bp); 895 bp->b_flags |= B_ASYNC; 896 VOP_BWRITE(bp->b_vp, bp); 897 } 898 } else { 899 bremfree(bp); 900 (void) VOP_BWRITE(bp->b_vp, bp); 901 } 902 break; 903 } 904 bremfree(bp); 905 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 906 bp->b_flags &= ~B_ASYNC; 907 brelse(bp); 908 } 909 } 910 911 /* 912 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 913 * have write I/O in-progress but if there is a VM object then the 914 * VM object can also have read-I/O in-progress. 915 */ 916 do { 917 while (vp->v_numoutput > 0) { 918 vp->v_flag |= VBWAIT; 919 tsleep(&vp->v_numoutput, 0, "vnvlbv", 0); 920 } 921 if (VOP_GETVOBJECT(vp, &object) == 0) { 922 while (object->paging_in_progress) 923 vm_object_pip_sleep(object, "vnvlbx"); 924 } 925 } while (vp->v_numoutput > 0); 926 927 splx(s); 928 929 /* 930 * Destroy the copy in the VM cache, too. 931 */ 932 lwkt_gettoken(&vp->v_interlock); 933 if (VOP_GETVOBJECT(vp, &object) == 0) { 934 vm_object_page_remove(object, 0, 0, 935 (flags & V_SAVE) ? TRUE : FALSE); 936 } 937 lwkt_reltoken(&vp->v_interlock); 938 939 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) 940 panic("vinvalbuf: flush failed"); 941 return (0); 942 } 943 944 /* 945 * Truncate a file's buffer and pages to a specified length. This 946 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 947 * sync activity. 948 */ 949 int 950 vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize) 951 { 952 struct buf *bp; 953 struct buf *nbp; 954 int s, anyfreed; 955 int trunclbn; 956 957 /* 958 * Round up to the *next* lbn. 
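	 *
	 * Illustration of the rounding below (hypothetical numbers): with a
	 * blksize of 8192, truncating to length 1 yields
	 * trunclbn = (1 + 8191) / 8192 = 1, so the buffer for lbn 0 is kept
	 * and buffers from lbn 1 upward are invalidated; truncating to
	 * length 0 yields trunclbn = 0 and every buffer is released.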
959 */ 960 trunclbn = (length + blksize - 1) / blksize; 961 962 s = splbio(); 963 restart: 964 anyfreed = 1; 965 for (;anyfreed;) { 966 anyfreed = 0; 967 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 968 nbp = TAILQ_NEXT(bp, b_vnbufs); 969 if (bp->b_lblkno >= trunclbn) { 970 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 971 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 972 goto restart; 973 } else { 974 bremfree(bp); 975 bp->b_flags |= (B_INVAL | B_RELBUF); 976 bp->b_flags &= ~B_ASYNC; 977 brelse(bp); 978 anyfreed = 1; 979 } 980 if (nbp && 981 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 982 (nbp->b_vp != vp) || 983 (nbp->b_flags & B_DELWRI))) { 984 goto restart; 985 } 986 } 987 } 988 989 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 990 nbp = TAILQ_NEXT(bp, b_vnbufs); 991 if (bp->b_lblkno >= trunclbn) { 992 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 993 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 994 goto restart; 995 } else { 996 bremfree(bp); 997 bp->b_flags |= (B_INVAL | B_RELBUF); 998 bp->b_flags &= ~B_ASYNC; 999 brelse(bp); 1000 anyfreed = 1; 1001 } 1002 if (nbp && 1003 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1004 (nbp->b_vp != vp) || 1005 (nbp->b_flags & B_DELWRI) == 0)) { 1006 goto restart; 1007 } 1008 } 1009 } 1010 } 1011 1012 if (length > 0) { 1013 restartsync: 1014 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1015 nbp = TAILQ_NEXT(bp, b_vnbufs); 1016 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { 1017 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1018 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1019 goto restart; 1020 } else { 1021 bremfree(bp); 1022 if (bp->b_vp == vp) { 1023 bp->b_flags |= B_ASYNC; 1024 } else { 1025 bp->b_flags &= ~B_ASYNC; 1026 } 1027 VOP_BWRITE(bp->b_vp, bp); 1028 } 1029 goto restartsync; 1030 } 1031 1032 } 1033 } 1034 1035 while (vp->v_numoutput > 0) { 1036 vp->v_flag |= VBWAIT; 1037 tsleep(&vp->v_numoutput, 0, "vbtrunc", 0); 1038 } 1039 1040 splx(s); 1041 1042 vnode_pager_setsize(vp, length); 1043 1044 return (0); 1045 } 1046 1047 /* 1048 * Associate a buffer with a vnode. 1049 */ 1050 void 1051 bgetvp(vp, bp) 1052 struct vnode *vp; 1053 struct buf *bp; 1054 { 1055 int s; 1056 1057 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 1058 1059 vhold(vp); 1060 bp->b_vp = vp; 1061 bp->b_dev = vn_todev(vp); 1062 /* 1063 * Insert onto list for new vnode. 1064 */ 1065 s = splbio(); 1066 bp->b_xflags |= BX_VNCLEAN; 1067 bp->b_xflags &= ~BX_VNDIRTY; 1068 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); 1069 splx(s); 1070 } 1071 1072 /* 1073 * Disassociate a buffer from a vnode. 1074 */ 1075 void 1076 brelvp(bp) 1077 struct buf *bp; 1078 { 1079 struct vnode *vp; 1080 struct buflists *listheadp; 1081 int s; 1082 1083 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1084 1085 /* 1086 * Delete from old vnode list, if on one. 1087 */ 1088 vp = bp->b_vp; 1089 s = splbio(); 1090 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1091 if (bp->b_xflags & BX_VNDIRTY) 1092 listheadp = &vp->v_dirtyblkhd; 1093 else 1094 listheadp = &vp->v_cleanblkhd; 1095 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1096 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1097 } 1098 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 1099 vp->v_flag &= ~VONWORKLST; 1100 LIST_REMOVE(vp, v_synclist); 1101 } 1102 splx(s); 1103 bp->b_vp = (struct vnode *) 0; 1104 vdrop(vp); 1105 } 1106 1107 /* 1108 * The workitem queue. 
1109 * 1110 * It is useful to delay writes of file data and filesystem metadata 1111 * for tens of seconds so that quickly created and deleted files need 1112 * not waste disk bandwidth being created and removed. To realize this, 1113 * we append vnodes to a "workitem" queue. When running with a soft 1114 * updates implementation, most pending metadata dependencies should 1115 * not wait for more than a few seconds. Thus, mounted on block devices 1116 * are delayed only about a half the time that file data is delayed. 1117 * Similarly, directory updates are more critical, so are only delayed 1118 * about a third the time that file data is delayed. Thus, there are 1119 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 1120 * one each second (driven off the filesystem syncer process). The 1121 * syncer_delayno variable indicates the next queue that is to be processed. 1122 * Items that need to be processed soon are placed in this queue: 1123 * 1124 * syncer_workitem_pending[syncer_delayno] 1125 * 1126 * A delay of fifteen seconds is done by placing the request fifteen 1127 * entries later in the queue: 1128 * 1129 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 1130 * 1131 */ 1132 1133 /* 1134 * Add an item to the syncer work queue. 1135 */ 1136 static void 1137 vn_syncer_add_to_worklist(struct vnode *vp, int delay) 1138 { 1139 int s, slot; 1140 1141 s = splbio(); 1142 1143 if (vp->v_flag & VONWORKLST) { 1144 LIST_REMOVE(vp, v_synclist); 1145 } 1146 1147 if (delay > syncer_maxdelay - 2) 1148 delay = syncer_maxdelay - 2; 1149 slot = (syncer_delayno + delay) & syncer_mask; 1150 1151 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); 1152 vp->v_flag |= VONWORKLST; 1153 splx(s); 1154 } 1155 1156 struct thread *updatethread; 1157 static void sched_sync (void); 1158 static struct kproc_desc up_kp = { 1159 "syncer", 1160 sched_sync, 1161 &updatethread 1162 }; 1163 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) 1164 1165 /* 1166 * System filesystem synchronizer daemon. 1167 */ 1168 void 1169 sched_sync(void) 1170 { 1171 struct synclist *slp; 1172 struct vnode *vp; 1173 long starttime; 1174 int s; 1175 struct thread *td = curthread; 1176 1177 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td, 1178 SHUTDOWN_PRI_LAST); 1179 1180 for (;;) { 1181 kproc_suspend_loop(); 1182 1183 starttime = time_second; 1184 1185 /* 1186 * Push files whose dirty time has expired. Be careful 1187 * of interrupt race on slp queue. 1188 */ 1189 s = splbio(); 1190 slp = &syncer_workitem_pending[syncer_delayno]; 1191 syncer_delayno += 1; 1192 if (syncer_delayno == syncer_maxdelay) 1193 syncer_delayno = 0; 1194 splx(s); 1195 1196 while ((vp = LIST_FIRST(slp)) != NULL) { 1197 if (VOP_ISLOCKED(vp, NULL) == 0) { 1198 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1199 (void) VOP_FSYNC(vp, MNT_LAZY, td); 1200 VOP_UNLOCK(vp, 0, td); 1201 } 1202 s = splbio(); 1203 if (LIST_FIRST(slp) == vp) { 1204 /* 1205 * Note: v_tag VT_VFS vps can remain on the 1206 * worklist too with no dirty blocks, but 1207 * since sync_fsync() moves it to a different 1208 * slot we are safe. 1209 */ 1210 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && 1211 !vn_isdisk(vp, NULL)) 1212 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); 1213 /* 1214 * Put us back on the worklist. The worklist 1215 * routine will remove us from our current 1216 * position and then add us back in at a later 1217 * position. 
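				 *
				 * The vn_syncer_add_to_worklist() call below
				 * re-hashes the vnode to slot
				 * (syncer_delayno + syncdelay) & syncer_mask,
				 * so it comes up for another fsync attempt
				 * roughly syncdelay seconds later.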
1218 */ 1219 vn_syncer_add_to_worklist(vp, syncdelay); 1220 } 1221 splx(s); 1222 } 1223 1224 /* 1225 * Do soft update processing. 1226 */ 1227 if (bioops.io_sync) 1228 (*bioops.io_sync)(NULL); 1229 1230 /* 1231 * The variable rushjob allows the kernel to speed up the 1232 * processing of the filesystem syncer process. A rushjob 1233 * value of N tells the filesystem syncer to process the next 1234 * N seconds worth of work on its queue ASAP. Currently rushjob 1235 * is used by the soft update code to speed up the filesystem 1236 * syncer process when the incore state is getting so far 1237 * ahead of the disk that the kernel memory pool is being 1238 * threatened with exhaustion. 1239 */ 1240 if (rushjob > 0) { 1241 rushjob -= 1; 1242 continue; 1243 } 1244 /* 1245 * If it has taken us less than a second to process the 1246 * current work, then wait. Otherwise start right over 1247 * again. We can still lose time if any single round 1248 * takes more than two seconds, but it does not really 1249 * matter as we are just trying to generally pace the 1250 * filesystem activity. 1251 */ 1252 if (time_second == starttime) 1253 tsleep(&lbolt, 0, "syncer", 0); 1254 } 1255 } 1256 1257 /* 1258 * Request the syncer daemon to speed up its work. 1259 * We never push it to speed up more than half of its 1260 * normal turn time, otherwise it could take over the cpu. 1261 * 1262 * YYY wchan field protected by the BGL. 1263 */ 1264 int 1265 speedup_syncer() 1266 { 1267 crit_enter(); 1268 if (updatethread->td_wchan == &lbolt) { /* YYY */ 1269 unsleep(updatethread); 1270 lwkt_schedule(updatethread); 1271 } 1272 crit_exit(); 1273 if (rushjob < syncdelay / 2) { 1274 rushjob += 1; 1275 stat_rush_requests += 1; 1276 return (1); 1277 } 1278 return(0); 1279 } 1280 1281 /* 1282 * Associate a p-buffer with a vnode. 1283 * 1284 * Also sets B_PAGING flag to indicate that vnode is not fully associated 1285 * with the buffer. i.e. the bp has not been linked into the vnode or 1286 * ref-counted. 1287 */ 1288 void 1289 pbgetvp(vp, bp) 1290 struct vnode *vp; 1291 struct buf *bp; 1292 { 1293 1294 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); 1295 1296 bp->b_vp = vp; 1297 bp->b_flags |= B_PAGING; 1298 bp->b_dev = vn_todev(vp); 1299 } 1300 1301 /* 1302 * Disassociate a p-buffer from a vnode. 1303 */ 1304 void 1305 pbrelvp(bp) 1306 struct buf *bp; 1307 { 1308 1309 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); 1310 1311 /* XXX REMOVE ME */ 1312 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { 1313 panic( 1314 "relpbuf(): b_vp was probably reassignbuf()d %p %x", 1315 bp, 1316 (int)bp->b_flags 1317 ); 1318 } 1319 bp->b_vp = (struct vnode *) 0; 1320 bp->b_flags &= ~B_PAGING; 1321 } 1322 1323 void 1324 pbreassignbuf(bp, newvp) 1325 struct buf *bp; 1326 struct vnode *newvp; 1327 { 1328 if ((bp->b_flags & B_PAGING) == 0) { 1329 panic( 1330 "pbreassignbuf() on non phys bp %p", 1331 bp 1332 ); 1333 } 1334 bp->b_vp = newvp; 1335 } 1336 1337 /* 1338 * Reassign a buffer from one vnode to another. 1339 * Used to assign file specific control information 1340 * (indirect blocks) to the vnode to which they belong. 1341 */ 1342 void 1343 reassignbuf(bp, newvp) 1344 struct buf *bp; 1345 struct vnode *newvp; 1346 { 1347 struct buflists *listheadp; 1348 int delay; 1349 int s; 1350 1351 if (newvp == NULL) { 1352 printf("reassignbuf: NULL"); 1353 return; 1354 } 1355 ++reassignbufcalls; 1356 1357 /* 1358 * B_PAGING flagged buffers cannot be reassigned because their vp 1359 * is not fully linked in. 
1360 */ 1361 if (bp->b_flags & B_PAGING) 1362 panic("cannot reassign paging buffer"); 1363 1364 s = splbio(); 1365 /* 1366 * Delete from old vnode list, if on one. 1367 */ 1368 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1369 if (bp->b_xflags & BX_VNDIRTY) 1370 listheadp = &bp->b_vp->v_dirtyblkhd; 1371 else 1372 listheadp = &bp->b_vp->v_cleanblkhd; 1373 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1374 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1375 if (bp->b_vp != newvp) { 1376 vdrop(bp->b_vp); 1377 bp->b_vp = NULL; /* for clarification */ 1378 } 1379 } 1380 /* 1381 * If dirty, put on list of dirty buffers; otherwise insert onto list 1382 * of clean buffers. 1383 */ 1384 if (bp->b_flags & B_DELWRI) { 1385 struct buf *tbp; 1386 1387 listheadp = &newvp->v_dirtyblkhd; 1388 if ((newvp->v_flag & VONWORKLST) == 0) { 1389 switch (newvp->v_type) { 1390 case VDIR: 1391 delay = dirdelay; 1392 break; 1393 case VCHR: 1394 case VBLK: 1395 if (newvp->v_specmountpoint != NULL) { 1396 delay = metadelay; 1397 break; 1398 } 1399 /* fall through */ 1400 default: 1401 delay = filedelay; 1402 } 1403 vn_syncer_add_to_worklist(newvp, delay); 1404 } 1405 bp->b_xflags |= BX_VNDIRTY; 1406 tbp = TAILQ_FIRST(listheadp); 1407 if (tbp == NULL || 1408 bp->b_lblkno == 0 || 1409 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || 1410 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { 1411 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); 1412 ++reassignbufsortgood; 1413 } else if (bp->b_lblkno < 0) { 1414 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); 1415 ++reassignbufsortgood; 1416 } else if (reassignbufmethod == 1) { 1417 /* 1418 * New sorting algorithm, only handle sequential case, 1419 * otherwise append to end (but before metadata) 1420 */ 1421 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && 1422 (tbp->b_xflags & BX_VNDIRTY)) { 1423 /* 1424 * Found the best place to insert the buffer 1425 */ 1426 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1427 ++reassignbufsortgood; 1428 } else { 1429 /* 1430 * Missed, append to end, but before meta-data. 1431 * We know that the head buffer in the list is 1432 * not meta-data due to prior conditionals. 1433 * 1434 * Indirect effects: NFS second stage write 1435 * tends to wind up here, giving maximum 1436 * distance between the unstable write and the 1437 * commit rpc. 1438 */ 1439 tbp = TAILQ_LAST(listheadp, buflists); 1440 while (tbp && tbp->b_lblkno < 0) 1441 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); 1442 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1443 ++reassignbufsortbad; 1444 } 1445 } else { 1446 /* 1447 * Old sorting algorithm, scan queue and insert 1448 */ 1449 struct buf *ttbp; 1450 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && 1451 (ttbp->b_lblkno < bp->b_lblkno)) { 1452 ++reassignbufloops; 1453 tbp = ttbp; 1454 } 1455 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1456 } 1457 } else { 1458 bp->b_xflags |= BX_VNCLEAN; 1459 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); 1460 if ((newvp->v_flag & VONWORKLST) && 1461 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { 1462 newvp->v_flag &= ~VONWORKLST; 1463 LIST_REMOVE(newvp, v_synclist); 1464 } 1465 } 1466 if (bp->b_vp != newvp) { 1467 bp->b_vp = newvp; 1468 vhold(bp->b_vp); 1469 } 1470 splx(s); 1471 } 1472 1473 /* 1474 * Create a vnode for a block device. 1475 * Used for mounting the root file system. 
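 *
 * Illustrative use (the exact caller is filesystem specific): the
 * root mount path typically calls bdevvp(rootdev, &rootvp) to obtain
 * a VBLK vnode for the root device before the filesystem mount code
 * runs.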
1476 */ 1477 int 1478 bdevvp(dev, vpp) 1479 dev_t dev; 1480 struct vnode **vpp; 1481 { 1482 struct vnode *vp; 1483 struct vnode *nvp; 1484 int error; 1485 1486 if (dev == NODEV) { 1487 *vpp = NULLVP; 1488 return (ENXIO); 1489 } 1490 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); 1491 if (error) { 1492 *vpp = NULLVP; 1493 return (error); 1494 } 1495 vp = nvp; 1496 vp->v_type = VBLK; 1497 addalias(vp, dev); 1498 *vpp = vp; 1499 return (0); 1500 } 1501 1502 /* 1503 * Add a vnode to the alias list hung off the dev_t. 1504 * 1505 * The reason for this gunk is that multiple vnodes can reference 1506 * the same physical device, so checking vp->v_usecount to see 1507 * how many users there are is inadequate; the v_usecount for 1508 * the vnodes need to be accumulated. vcount() does that. 1509 */ 1510 void 1511 addaliasu(struct vnode *nvp, udev_t nvp_rdev) 1512 { 1513 dev_t dev; 1514 1515 if (nvp->v_type != VBLK && nvp->v_type != VCHR) 1516 panic("addaliasu on non-special vnode"); 1517 dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0); 1518 if (dev != NODEV) { 1519 nvp->v_rdev = dev; 1520 addalias(nvp, dev); 1521 } else 1522 nvp->v_rdev = NULL; 1523 } 1524 1525 void 1526 addalias(struct vnode *nvp, dev_t dev) 1527 { 1528 1529 if (nvp->v_type != VBLK && nvp->v_type != VCHR) 1530 panic("addalias on non-special vnode"); 1531 1532 nvp->v_rdev = dev; 1533 lwkt_gettoken(&spechash_token); 1534 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); 1535 lwkt_reltoken(&spechash_token); 1536 } 1537 1538 /* 1539 * Grab a particular vnode from the free list, increment its 1540 * reference count and lock it. The vnode lock bit is set if the 1541 * vnode is being eliminated in vgone. The process is awakened 1542 * when the transition is completed, and an error returned to 1543 * indicate that the vnode is no longer usable (possibly having 1544 * been changed to a new file system type). 1545 */ 1546 int 1547 vget(vp, flags, td) 1548 struct vnode *vp; 1549 int flags; 1550 struct thread *td; 1551 { 1552 int error; 1553 1554 /* 1555 * If the vnode is in the process of being cleaned out for 1556 * another use, we wait for the cleaning to finish and then 1557 * return failure. Cleaning is determined by checking that 1558 * the VXLOCK flag is set. 1559 */ 1560 if ((flags & LK_INTERLOCK) == 0) { 1561 lwkt_gettoken(&vp->v_interlock); 1562 } 1563 if (vp->v_flag & VXLOCK) { 1564 if (vp->v_vxproc == curproc) { 1565 #if 0 1566 /* this can now occur in normal operation */ 1567 log(LOG_INFO, "VXLOCK interlock avoided\n"); 1568 #endif 1569 } else { 1570 vp->v_flag |= VXWANT; 1571 lwkt_reltoken(&vp->v_interlock); 1572 tsleep((caddr_t)vp, 0, "vget", 0); 1573 return (ENOENT); 1574 } 1575 } 1576 1577 vp->v_usecount++; 1578 1579 if (VSHOULDBUSY(vp)) 1580 vbusy(vp); 1581 if (flags & LK_TYPE_MASK) { 1582 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { 1583 /* 1584 * must expand vrele here because we do not want 1585 * to call VOP_INACTIVE if the reference count 1586 * drops back to zero since it was never really 1587 * active. We must remove it from the free list 1588 * before sleeping so that multiple processes do 1589 * not try to recycle it. 
1590 */ 1591 lwkt_gettoken(&vp->v_interlock); 1592 vp->v_usecount--; 1593 if (VSHOULDFREE(vp)) 1594 vfree(vp); 1595 else 1596 vlruvp(vp); 1597 lwkt_reltoken(&vp->v_interlock); 1598 } 1599 return (error); 1600 } 1601 lwkt_reltoken(&vp->v_interlock); 1602 return (0); 1603 } 1604 1605 void 1606 vref(struct vnode *vp) 1607 { 1608 lwkt_gettoken(&vp->v_interlock); 1609 vp->v_usecount++; 1610 lwkt_reltoken(&vp->v_interlock); 1611 } 1612 1613 /* 1614 * Vnode put/release. 1615 * If count drops to zero, call inactive routine and return to freelist. 1616 */ 1617 void 1618 vrele(struct vnode *vp) 1619 { 1620 struct thread *td = curthread; /* XXX */ 1621 1622 KASSERT(vp != NULL, ("vrele: null vp")); 1623 1624 lwkt_gettoken(&vp->v_interlock); 1625 1626 if (vp->v_usecount > 1) { 1627 1628 vp->v_usecount--; 1629 lwkt_reltoken(&vp->v_interlock); 1630 1631 return; 1632 } 1633 1634 if (vp->v_usecount == 1) { 1635 vp->v_usecount--; 1636 /* 1637 * We must call VOP_INACTIVE with the node locked. 1638 * If we are doing a vpu, the node is already locked, 1639 * but, in the case of vrele, we must explicitly lock 1640 * the vnode before calling VOP_INACTIVE 1641 */ 1642 1643 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) 1644 VOP_INACTIVE(vp, td); 1645 if (VSHOULDFREE(vp)) 1646 vfree(vp); 1647 else 1648 vlruvp(vp); 1649 } else { 1650 #ifdef DIAGNOSTIC 1651 vprint("vrele: negative ref count", vp); 1652 lwkt_reltoken(&vp->v_interlock); 1653 #endif 1654 panic("vrele: negative ref cnt"); 1655 } 1656 } 1657 1658 void 1659 vput(struct vnode *vp) 1660 { 1661 struct thread *td = curthread; /* XXX */ 1662 1663 KASSERT(vp != NULL, ("vput: null vp")); 1664 1665 lwkt_gettoken(&vp->v_interlock); 1666 1667 if (vp->v_usecount > 1) { 1668 vp->v_usecount--; 1669 VOP_UNLOCK(vp, LK_INTERLOCK, td); 1670 return; 1671 } 1672 1673 if (vp->v_usecount == 1) { 1674 vp->v_usecount--; 1675 /* 1676 * We must call VOP_INACTIVE with the node locked. 1677 * If we are doing a vpu, the node is already locked, 1678 * so we just need to release the vnode mutex. 1679 */ 1680 lwkt_reltoken(&vp->v_interlock); 1681 VOP_INACTIVE(vp, td); 1682 if (VSHOULDFREE(vp)) 1683 vfree(vp); 1684 else 1685 vlruvp(vp); 1686 } else { 1687 #ifdef DIAGNOSTIC 1688 vprint("vput: negative ref count", vp); 1689 #endif 1690 panic("vput: negative ref cnt"); 1691 } 1692 } 1693 1694 /* 1695 * Somebody doesn't want the vnode recycled. 1696 */ 1697 void 1698 vhold(vp) 1699 struct vnode *vp; 1700 { 1701 int s; 1702 1703 s = splbio(); 1704 vp->v_holdcnt++; 1705 if (VSHOULDBUSY(vp)) 1706 vbusy(vp); 1707 splx(s); 1708 } 1709 1710 /* 1711 * One less who cares about this vnode. 1712 */ 1713 void 1714 vdrop(vp) 1715 struct vnode *vp; 1716 { 1717 int s; 1718 1719 s = splbio(); 1720 if (vp->v_holdcnt <= 0) 1721 panic("vdrop: holdcnt"); 1722 vp->v_holdcnt--; 1723 if (VSHOULDFREE(vp)) 1724 vfree(vp); 1725 splx(s); 1726 } 1727 1728 /* 1729 * Remove any vnodes in the vnode table belonging to mount point mp. 1730 * 1731 * If FORCECLOSE is not specified, there should not be any active ones, 1732 * return error if any are found (nb: this is a user error, not a 1733 * system error). If FORCECLOSE is specified, detach any active vnodes 1734 * that are found. 1735 * 1736 * If WRITECLOSE is set, only flush out regular file vnodes open for 1737 * writing. 1738 * 1739 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. 1740 * 1741 * `rootrefs' specifies the base reference count for the root vnode 1742 * of this filesystem. 
The root vnode is considered busy if its 1743 * v_usecount exceeds this value. On a successful return, vflush() 1744 * will call vrele() on the root vnode exactly rootrefs times. 1745 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 1746 * be zero. 1747 */ 1748 #ifdef DIAGNOSTIC 1749 static int busyprt = 0; /* print out busy vnodes */ 1750 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 1751 #endif 1752 1753 int 1754 vflush(mp, rootrefs, flags) 1755 struct mount *mp; 1756 int rootrefs; 1757 int flags; 1758 { 1759 struct thread *td = curthread; /* XXX */ 1760 struct vnode *vp, *nvp, *rootvp = NULL; 1761 struct vattr vattr; 1762 int busy = 0, error; 1763 1764 if (rootrefs > 0) { 1765 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 1766 ("vflush: bad args")); 1767 /* 1768 * Get the filesystem root vnode. We can vput() it 1769 * immediately, since with rootrefs > 0, it won't go away. 1770 */ 1771 if ((error = VFS_ROOT(mp, &rootvp)) != 0) 1772 return (error); 1773 vput(rootvp); 1774 } 1775 lwkt_gettoken(&mntvnode_token); 1776 loop: 1777 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { 1778 /* 1779 * Make sure this vnode wasn't reclaimed in getnewvnode(). 1780 * Start over if it has (it won't be on the list anymore). 1781 */ 1782 if (vp->v_mount != mp) 1783 goto loop; 1784 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 1785 1786 lwkt_gettoken(&vp->v_interlock); 1787 /* 1788 * Skip over a vnodes marked VSYSTEM. 1789 */ 1790 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1791 lwkt_reltoken(&vp->v_interlock); 1792 continue; 1793 } 1794 /* 1795 * If WRITECLOSE is set, flush out unlinked but still open 1796 * files (even if open only for reading) and regular file 1797 * vnodes open for writing. 1798 */ 1799 if ((flags & WRITECLOSE) && 1800 (vp->v_type == VNON || 1801 (VOP_GETATTR(vp, &vattr, td) == 0 && 1802 vattr.va_nlink > 0)) && 1803 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1804 lwkt_reltoken(&vp->v_interlock); 1805 continue; 1806 } 1807 1808 /* 1809 * With v_usecount == 0, all we need to do is clear out the 1810 * vnode data structures and we are done. 1811 */ 1812 if (vp->v_usecount == 0) { 1813 lwkt_reltoken(&mntvnode_token); 1814 vgonel(vp, td); 1815 lwkt_gettoken(&mntvnode_token); 1816 continue; 1817 } 1818 1819 /* 1820 * If FORCECLOSE is set, forcibly close the vnode. For block 1821 * or character devices, revert to an anonymous device. For 1822 * all other files, just kill them. 1823 */ 1824 if (flags & FORCECLOSE) { 1825 lwkt_reltoken(&mntvnode_token); 1826 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1827 vgonel(vp, td); 1828 } else { 1829 vclean(vp, 0, td); 1830 vp->v_op = spec_vnodeop_p; 1831 insmntque(vp, (struct mount *) 0); 1832 } 1833 lwkt_gettoken(&mntvnode_token); 1834 continue; 1835 } 1836 #ifdef DIAGNOSTIC 1837 if (busyprt) 1838 vprint("vflush: busy vnode", vp); 1839 #endif 1840 lwkt_reltoken(&vp->v_interlock); 1841 busy++; 1842 } 1843 lwkt_reltoken(&mntvnode_token); 1844 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 1845 /* 1846 * If just the root vnode is busy, and if its refcount 1847 * is equal to `rootrefs', then go ahead and kill it. 
1848 */ 1849 lwkt_gettoken(&rootvp->v_interlock); 1850 KASSERT(busy > 0, ("vflush: not busy")); 1851 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); 1852 if (busy == 1 && rootvp->v_usecount == rootrefs) { 1853 vgonel(rootvp, td); 1854 busy = 0; 1855 } else 1856 lwkt_reltoken(&rootvp->v_interlock); 1857 } 1858 if (busy) 1859 return (EBUSY); 1860 for (; rootrefs > 0; rootrefs--) 1861 vrele(rootvp); 1862 return (0); 1863 } 1864 1865 /* 1866 * We do not want to recycle the vnode too quickly. 1867 * 1868 * XXX we can't move vp's around the nvnodelist without really screwing 1869 * up the efficiency of filesystem SYNC and friends. This code is 1870 * disabled until we fix the syncing code's scanning algorithm. 1871 */ 1872 static void 1873 vlruvp(struct vnode *vp) 1874 { 1875 #if 0 1876 struct mount *mp; 1877 1878 if ((mp = vp->v_mount) != NULL) { 1879 lwkt_gettoken(&mntvnode_token); 1880 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1881 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1882 lwkt_reltoken(&mntvnode_token); 1883 } 1884 #endif 1885 } 1886 1887 /* 1888 * Disassociate the underlying file system from a vnode. 1889 */ 1890 static void 1891 vclean(struct vnode *vp, int flags, struct thread *td) 1892 { 1893 int active; 1894 1895 /* 1896 * Check to see if the vnode is in use. If so we have to reference it 1897 * before we clean it out so that its count cannot fall to zero and 1898 * generate a race against ourselves to recycle it. 1899 */ 1900 if ((active = vp->v_usecount)) 1901 vp->v_usecount++; 1902 1903 /* 1904 * Prevent the vnode from being recycled or brought into use while we 1905 * clean it out. 1906 */ 1907 if (vp->v_flag & VXLOCK) 1908 panic("vclean: deadlock"); 1909 vp->v_flag |= VXLOCK; 1910 vp->v_vxproc = curproc; 1911 /* 1912 * Even if the count is zero, the VOP_INACTIVE routine may still 1913 * have the object locked while it cleans it out. The VOP_LOCK 1914 * ensures that the VOP_INACTIVE routine is done with its work. 1915 * For active vnodes, it ensures that no other activity can 1916 * occur while the underlying object is being cleaned out. 1917 */ 1918 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); 1919 1920 /* 1921 * Clean out any buffers associated with the vnode. 1922 */ 1923 vinvalbuf(vp, V_SAVE, td, 0, 0); 1924 1925 VOP_DESTROYVOBJECT(vp); 1926 1927 /* 1928 * If purging an active vnode, it must be closed and 1929 * deactivated before being reclaimed. Note that the 1930 * VOP_INACTIVE will unlock the vnode. 1931 */ 1932 if (active) { 1933 if (flags & DOCLOSE) 1934 VOP_CLOSE(vp, FNONBLOCK, td); 1935 VOP_INACTIVE(vp, td); 1936 } else { 1937 /* 1938 * Any other processes trying to obtain this lock must first 1939 * wait for VXLOCK to clear, then call the new lock operation. 1940 */ 1941 VOP_UNLOCK(vp, 0, td); 1942 } 1943 /* 1944 * Reclaim the vnode. 1945 */ 1946 if (VOP_RECLAIM(vp, td)) 1947 panic("vclean: cannot reclaim"); 1948 1949 if (active) { 1950 /* 1951 * Inline copy of vrele() since VOP_INACTIVE 1952 * has already been called. 1953 */ 1954 lwkt_gettoken(&vp->v_interlock); 1955 if (--vp->v_usecount <= 0) { 1956 #ifdef DIAGNOSTIC 1957 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1958 vprint("vclean: bad ref count", vp); 1959 panic("vclean: ref cnt"); 1960 } 1961 #endif 1962 vfree(vp); 1963 } 1964 lwkt_reltoken(&vp->v_interlock); 1965 } 1966 1967 cache_purge(vp); 1968 vp->v_vnlock = NULL; 1969 1970 if (VSHOULDFREE(vp)) 1971 vfree(vp); 1972 1973 /* 1974 * Done with purge, notify sleepers of the grim news. 
1975 */ 1976 vp->v_op = dead_vnodeop_p; 1977 vn_pollgone(vp); 1978 vp->v_tag = VT_NON; 1979 vp->v_flag &= ~VXLOCK; 1980 vp->v_vxproc = NULL; 1981 if (vp->v_flag & VXWANT) { 1982 vp->v_flag &= ~VXWANT; 1983 wakeup((caddr_t) vp); 1984 } 1985 } 1986 1987 /* 1988 * Eliminate all activity associated with the requested vnode 1989 * and with all vnodes aliased to the requested vnode. 1990 */ 1991 int 1992 vop_revoke(ap) 1993 struct vop_revoke_args /* { 1994 struct vnode *a_vp; 1995 int a_flags; 1996 } */ *ap; 1997 { 1998 struct vnode *vp, *vq; 1999 dev_t dev; 2000 2001 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 2002 2003 vp = ap->a_vp; 2004 /* 2005 * If a vgone (or vclean) is already in progress, 2006 * wait until it is done and return. 2007 */ 2008 if (vp->v_flag & VXLOCK) { 2009 vp->v_flag |= VXWANT; 2010 lwkt_reltoken(&vp->v_interlock); 2011 tsleep((caddr_t)vp, 0, "vop_revokeall", 0); 2012 return (0); 2013 } 2014 dev = vp->v_rdev; 2015 for (;;) { 2016 lwkt_gettoken(&spechash_token); 2017 vq = SLIST_FIRST(&dev->si_hlist); 2018 lwkt_reltoken(&spechash_token); 2019 if (!vq) 2020 break; 2021 vgone(vq); 2022 } 2023 return (0); 2024 } 2025 2026 /* 2027 * Recycle an unused vnode to the front of the free list. 2028 * Release the passed interlock if the vnode will be recycled. 2029 */ 2030 int 2031 vrecycle(struct vnode *vp, struct lwkt_token *inter_lkp, struct thread *td) 2032 { 2033 lwkt_gettoken(&vp->v_interlock); 2034 if (vp->v_usecount == 0) { 2035 if (inter_lkp) { 2036 lwkt_reltoken(inter_lkp); 2037 } 2038 vgonel(vp, td); 2039 return (1); 2040 } 2041 lwkt_reltoken(&vp->v_interlock); 2042 return (0); 2043 } 2044 2045 /* 2046 * Eliminate all activity associated with a vnode 2047 * in preparation for reuse. 2048 */ 2049 void 2050 vgone(struct vnode *vp) 2051 { 2052 struct thread *td = curthread; /* XXX */ 2053 2054 lwkt_gettoken(&vp->v_interlock); 2055 vgonel(vp, td); 2056 } 2057 2058 /* 2059 * vgone, with the vp interlock held. 2060 */ 2061 void 2062 vgonel(struct vnode *vp, struct thread *td) 2063 { 2064 int s; 2065 2066 /* 2067 * If a vgone (or vclean) is already in progress, 2068 * wait until it is done and return. 2069 */ 2070 if (vp->v_flag & VXLOCK) { 2071 vp->v_flag |= VXWANT; 2072 lwkt_reltoken(&vp->v_interlock); 2073 tsleep((caddr_t)vp, 0, "vgone", 0); 2074 return; 2075 } 2076 2077 /* 2078 * Clean out the filesystem specific data. 2079 */ 2080 vclean(vp, DOCLOSE, td); 2081 lwkt_gettoken(&vp->v_interlock); 2082 2083 /* 2084 * Delete from old mount point vnode list, if on one. 2085 */ 2086 if (vp->v_mount != NULL) 2087 insmntque(vp, (struct mount *)0); 2088 /* 2089 * If special device, remove it from special device alias list 2090 * if it is on one. 2091 */ 2092 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) { 2093 lwkt_gettoken(&spechash_token); 2094 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext); 2095 freedev(vp->v_rdev); 2096 lwkt_reltoken(&spechash_token); 2097 vp->v_rdev = NULL; 2098 } 2099 2100 /* 2101 * If it is on the freelist and not already at the head, 2102 * move it to the head of the list. The test of the 2103 * VDOOMED flag and the reference count of zero is because 2104 * it will be removed from the free list by getnewvnode, 2105 * but will not have its reference count incremented until 2106 * after calling vgone. If the reference count were 2107 * incremented first, vgone would (incorrectly) try to 2108 * close the previous instance of the underlying object. 
2109 */ 2110 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 2111 s = splbio(); 2112 lwkt_gettoken(&vnode_free_list_token); 2113 if (vp->v_flag & VFREE) 2114 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2115 else 2116 freevnodes++; 2117 vp->v_flag |= VFREE; 2118 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2119 lwkt_reltoken(&vnode_free_list_token); 2120 splx(s); 2121 } 2122 2123 vp->v_type = VBAD; 2124 lwkt_reltoken(&vp->v_interlock); 2125 } 2126 2127 /* 2128 * Lookup a vnode by device number. 2129 */ 2130 int 2131 vfinddev(dev, type, vpp) 2132 dev_t dev; 2133 enum vtype type; 2134 struct vnode **vpp; 2135 { 2136 struct vnode *vp; 2137 2138 lwkt_gettoken(&spechash_token); 2139 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2140 if (type == vp->v_type) { 2141 *vpp = vp; 2142 lwkt_reltoken(&spechash_token); 2143 return (1); 2144 } 2145 } 2146 lwkt_reltoken(&spechash_token); 2147 return (0); 2148 } 2149 2150 /* 2151 * Calculate the total number of references to a special device. 2152 */ 2153 int 2154 vcount(vp) 2155 struct vnode *vp; 2156 { 2157 struct vnode *vq; 2158 int count; 2159 2160 count = 0; 2161 lwkt_gettoken(&spechash_token); 2162 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext) 2163 count += vq->v_usecount; 2164 lwkt_reltoken(&spechash_token); 2165 return (count); 2166 } 2167 2168 /* 2169 * Same as above, but using the dev_t as argument 2170 */ 2171 2172 int 2173 count_dev(dev) 2174 dev_t dev; 2175 { 2176 struct vnode *vp; 2177 2178 vp = SLIST_FIRST(&dev->si_hlist); 2179 if (vp == NULL) 2180 return (0); 2181 return(vcount(vp)); 2182 } 2183 2184 /* 2185 * Print out a description of a vnode. 2186 */ 2187 static char *typename[] = 2188 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2189 2190 void 2191 vprint(label, vp) 2192 char *label; 2193 struct vnode *vp; 2194 { 2195 char buf[96]; 2196 2197 if (label != NULL) 2198 printf("%s: %p: ", label, (void *)vp); 2199 else 2200 printf("%p: ", (void *)vp); 2201 printf("type %s, usecount %d, writecount %d, refcount %d,", 2202 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2203 vp->v_holdcnt); 2204 buf[0] = '\0'; 2205 if (vp->v_flag & VROOT) 2206 strcat(buf, "|VROOT"); 2207 if (vp->v_flag & VTEXT) 2208 strcat(buf, "|VTEXT"); 2209 if (vp->v_flag & VSYSTEM) 2210 strcat(buf, "|VSYSTEM"); 2211 if (vp->v_flag & VXLOCK) 2212 strcat(buf, "|VXLOCK"); 2213 if (vp->v_flag & VXWANT) 2214 strcat(buf, "|VXWANT"); 2215 if (vp->v_flag & VBWAIT) 2216 strcat(buf, "|VBWAIT"); 2217 if (vp->v_flag & VDOOMED) 2218 strcat(buf, "|VDOOMED"); 2219 if (vp->v_flag & VFREE) 2220 strcat(buf, "|VFREE"); 2221 if (vp->v_flag & VOBJBUF) 2222 strcat(buf, "|VOBJBUF"); 2223 if (buf[0] != '\0') 2224 printf(" flags (%s)", &buf[1]); 2225 if (vp->v_data == NULL) { 2226 printf("\n"); 2227 } else { 2228 printf("\n\t"); 2229 VOP_PRINT(vp); 2230 } 2231 } 2232 2233 #ifdef DDB 2234 #include <ddb/ddb.h> 2235 /* 2236 * List all of the locked vnodes in the system. 2237 * Called when debugging the kernel. 
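 * Invoked from the DDB prompt as ``show lockedvnodes''.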
2238 */ 2239 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2240 { 2241 struct thread *td = curthread; /* XXX */ 2242 struct mount *mp, *nmp; 2243 struct vnode *vp; 2244 2245 printf("Locked vnodes\n"); 2246 lwkt_gettoken(&mountlist_token); 2247 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2248 if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td)) { 2249 nmp = TAILQ_NEXT(mp, mnt_list); 2250 continue; 2251 } 2252 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2253 if (VOP_ISLOCKED(vp, NULL)) 2254 vprint((char *)0, vp); 2255 } 2256 lwkt_gettoken(&mountlist_token); 2257 nmp = TAILQ_NEXT(mp, mnt_list); 2258 vfs_unbusy(mp, td); 2259 } 2260 lwkt_reltoken(&mountlist_token); 2261 } 2262 #endif 2263 2264 /* 2265 * Top level filesystem related information gathering. 2266 */ 2267 static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS); 2268 2269 static int 2270 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2271 { 2272 int *name = (int *)arg1 - 1; /* XXX */ 2273 u_int namelen = arg2 + 1; /* XXX */ 2274 struct vfsconf *vfsp; 2275 2276 #if 1 || defined(COMPAT_PRELITE2) 2277 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2278 if (namelen == 1) 2279 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2280 #endif 2281 2282 #ifdef notyet 2283 /* all sysctl names at this level are at least name and field */ 2284 if (namelen < 2) 2285 return (ENOTDIR); /* overloaded */ 2286 if (name[0] != VFS_GENERIC) { 2287 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2288 if (vfsp->vfc_typenum == name[0]) 2289 break; 2290 if (vfsp == NULL) 2291 return (EOPNOTSUPP); 2292 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2293 oldp, oldlenp, newp, newlen, p)); 2294 } 2295 #endif 2296 switch (name[1]) { 2297 case VFS_MAXTYPENUM: 2298 if (namelen != 2) 2299 return (ENOTDIR); 2300 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2301 case VFS_CONF: 2302 if (namelen != 3) 2303 return (ENOTDIR); /* overloaded */ 2304 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2305 if (vfsp->vfc_typenum == name[2]) 2306 break; 2307 if (vfsp == NULL) 2308 return (EOPNOTSUPP); 2309 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2310 } 2311 return (EOPNOTSUPP); 2312 } 2313 2314 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2315 "Generic filesystem"); 2316 2317 #if 1 || defined(COMPAT_PRELITE2) 2318 2319 static int 2320 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2321 { 2322 int error; 2323 struct vfsconf *vfsp; 2324 struct ovfsconf ovfs; 2325 2326 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2327 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2328 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2329 ovfs.vfc_index = vfsp->vfc_typenum; 2330 ovfs.vfc_refcount = vfsp->vfc_refcount; 2331 ovfs.vfc_flags = vfsp->vfc_flags; 2332 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2333 if (error) 2334 return error; 2335 } 2336 return 0; 2337 } 2338 2339 #endif /* 1 || COMPAT_PRELITE2 */ 2340 2341 #if 0 2342 #define KINFO_VNODESLOP 10 2343 /* 2344 * Dump vnode list (via sysctl). 2345 * Copyout address of vnode followed by vnode. 
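 * The output is a sequence of (struct vnode *, struct vnode) pairs;
 * when no old buffer is supplied, only a size estimate based on
 * numvnodes plus some slop is returned.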
2346 */ 2347 /* ARGSUSED */ 2348 static int 2349 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2350 { 2351 struct proc *p = curproc; /* XXX */ 2352 struct mount *mp, *nmp; 2353 struct vnode *nvp, *vp; 2354 int error; 2355 2356 #define VPTRSZ sizeof (struct vnode *) 2357 #define VNODESZ sizeof (struct vnode) 2358 2359 req->lock = 0; 2360 if (!req->oldptr) /* Make an estimate */ 2361 return (SYSCTL_OUT(req, 0, 2362 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2363 2364 lwkt_gettoken(&mountlist_token); 2365 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2366 if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, p)) { 2367 nmp = TAILQ_NEXT(mp, mnt_list); 2368 continue; 2369 } 2370 again: 2371 lwkt_gettoken(&mntvnode_token); 2372 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 2373 vp != NULL; 2374 vp = nvp) { 2375 /* 2376 * Check that the vp is still associated with 2377 * this filesystem. RACE: could have been 2378 * recycled onto the same filesystem. 2379 */ 2380 if (vp->v_mount != mp) { 2381 lwkt_reltoken(&mntvnode_token); 2382 goto again; 2383 } 2384 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2385 lwkt_reltoken(&mntvnode_token); 2386 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2387 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2388 return (error); 2389 lwkt_gettoken(&mntvnode_token); 2390 } 2391 lwkt_reltoken(&mntvnode_token); 2392 lwkt_gettoken(&mountlist_token); 2393 nmp = TAILQ_NEXT(mp, mnt_list); 2394 vfs_unbusy(mp, p); 2395 } 2396 lwkt_reltoken(&mountlist_token); 2397 2398 return (0); 2399 } 2400 #endif 2401 2402 /* 2403 * XXX 2404 * Exporting the vnode list on large systems causes them to crash. 2405 * Exporting the vnode list on medium systems causes sysctl to coredump. 2406 */ 2407 #if 0 2408 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2409 0, 0, sysctl_vnode, "S,vnode", ""); 2410 #endif 2411 2412 /* 2413 * Check to see if a filesystem is mounted on a block device. 2414 */ 2415 int 2416 vfs_mountedon(vp) 2417 struct vnode *vp; 2418 { 2419 2420 if (vp->v_specmountpoint != NULL) 2421 return (EBUSY); 2422 return (0); 2423 } 2424 2425 /* 2426 * Unmount all filesystems. The list is traversed in reverse order 2427 * of mounting to avoid dependencies. 2428 */ 2429 void 2430 vfs_unmountall() 2431 { 2432 struct mount *mp; 2433 struct thread *td = curthread; 2434 int error; 2435 2436 if (td->td_proc == NULL) 2437 td = initproc->p_thread; /* XXX XXX use proc0 instead? */ 2438 2439 /* 2440 * Since this only runs when rebooting, it is not interlocked. 2441 */ 2442 while(!TAILQ_EMPTY(&mountlist)) { 2443 mp = TAILQ_LAST(&mountlist, mntlist); 2444 error = dounmount(mp, MNT_FORCE, td); 2445 if (error) { 2446 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2447 printf("unmount of %s failed (", 2448 mp->mnt_stat.f_mntonname); 2449 if (error == EBUSY) 2450 printf("BUSY)\n"); 2451 else 2452 printf("%d)\n", error); 2453 } else { 2454 /* The unmount has removed mp from the mountlist */ 2455 } 2456 } 2457 } 2458 2459 /* 2460 * Build hash lists of net addresses and hang them off the mount point. 2461 * Called by ufs_mount() to set up the lists of export addresses. 
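 * An export entry with a zero-length address becomes the default
 * export for the mount; every other entry is keyed by address (and
 * optional mask) in a per-address-family radix tree that is attached
 * on demand.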
2462 */ 2463 static int 2464 vfs_hang_addrlist(mp, nep, argp) 2465 struct mount *mp; 2466 struct netexport *nep; 2467 struct export_args *argp; 2468 { 2469 struct netcred *np; 2470 struct radix_node_head *rnh; 2471 int i; 2472 struct radix_node *rn; 2473 struct sockaddr *saddr, *smask = 0; 2474 struct domain *dom; 2475 int error; 2476 2477 if (argp->ex_addrlen == 0) { 2478 if (mp->mnt_flag & MNT_DEFEXPORTED) 2479 return (EPERM); 2480 np = &nep->ne_defexported; 2481 np->netc_exflags = argp->ex_flags; 2482 np->netc_anon = argp->ex_anon; 2483 np->netc_anon.cr_ref = 1; 2484 mp->mnt_flag |= MNT_DEFEXPORTED; 2485 return (0); 2486 } 2487 2488 if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN) 2489 return (EINVAL); 2490 if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN) 2491 return (EINVAL); 2492 2493 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2494 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); 2495 bzero((caddr_t) np, i); 2496 saddr = (struct sockaddr *) (np + 1); 2497 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2498 goto out; 2499 if (saddr->sa_len > argp->ex_addrlen) 2500 saddr->sa_len = argp->ex_addrlen; 2501 if (argp->ex_masklen) { 2502 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2503 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2504 if (error) 2505 goto out; 2506 if (smask->sa_len > argp->ex_masklen) 2507 smask->sa_len = argp->ex_masklen; 2508 } 2509 i = saddr->sa_family; 2510 if ((rnh = nep->ne_rtable[i]) == 0) { 2511 /* 2512 * Seems silly to initialize every AF when most are not used, 2513 * do so on demand here 2514 */ 2515 for (dom = domains; dom; dom = dom->dom_next) 2516 if (dom->dom_family == i && dom->dom_rtattach) { 2517 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2518 dom->dom_rtoffset); 2519 break; 2520 } 2521 if ((rnh = nep->ne_rtable[i]) == 0) { 2522 error = ENOBUFS; 2523 goto out; 2524 } 2525 } 2526 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2527 np->netc_rnodes); 2528 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2529 error = EPERM; 2530 goto out; 2531 } 2532 np->netc_exflags = argp->ex_flags; 2533 np->netc_anon = argp->ex_anon; 2534 np->netc_anon.cr_ref = 1; 2535 return (0); 2536 out: 2537 free(np, M_NETADDR); 2538 return (error); 2539 } 2540 2541 /* ARGSUSED */ 2542 static int 2543 vfs_free_netcred(rn, w) 2544 struct radix_node *rn; 2545 void *w; 2546 { 2547 struct radix_node_head *rnh = (struct radix_node_head *) w; 2548 2549 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2550 free((caddr_t) rn, M_NETADDR); 2551 return (0); 2552 } 2553 2554 /* 2555 * Free the net address hash lists that are hanging off the mount points. 
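 * Each per-address-family radix tree is walked with vfs_free_netcred()
 * to release the individual netcred entries before the tree head
 * itself is freed.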
2556 */ 2557 static void 2558 vfs_free_addrlist(nep) 2559 struct netexport *nep; 2560 { 2561 int i; 2562 struct radix_node_head *rnh; 2563 2564 for (i = 0; i <= AF_MAX; i++) 2565 if ((rnh = nep->ne_rtable[i])) { 2566 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2567 (caddr_t) rnh); 2568 free((caddr_t) rnh, M_RTABLE); 2569 nep->ne_rtable[i] = 0; 2570 } 2571 } 2572 2573 int 2574 vfs_export(mp, nep, argp) 2575 struct mount *mp; 2576 struct netexport *nep; 2577 struct export_args *argp; 2578 { 2579 int error; 2580 2581 if (argp->ex_flags & MNT_DELEXPORT) { 2582 if (mp->mnt_flag & MNT_EXPUBLIC) { 2583 vfs_setpublicfs(NULL, NULL, NULL); 2584 mp->mnt_flag &= ~MNT_EXPUBLIC; 2585 } 2586 vfs_free_addrlist(nep); 2587 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2588 } 2589 if (argp->ex_flags & MNT_EXPORTED) { 2590 if (argp->ex_flags & MNT_EXPUBLIC) { 2591 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2592 return (error); 2593 mp->mnt_flag |= MNT_EXPUBLIC; 2594 } 2595 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2596 return (error); 2597 mp->mnt_flag |= MNT_EXPORTED; 2598 } 2599 return (0); 2600 } 2601 2602 2603 /* 2604 * Set the publicly exported filesystem (WebNFS). Currently, only 2605 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2606 */ 2607 int 2608 vfs_setpublicfs(mp, nep, argp) 2609 struct mount *mp; 2610 struct netexport *nep; 2611 struct export_args *argp; 2612 { 2613 int error; 2614 struct vnode *rvp; 2615 char *cp; 2616 2617 /* 2618 * mp == NULL -> invalidate the current info, the FS is 2619 * no longer exported. May be called from either vfs_export 2620 * or unmount, so check if it hasn't already been done. 2621 */ 2622 if (mp == NULL) { 2623 if (nfs_pub.np_valid) { 2624 nfs_pub.np_valid = 0; 2625 if (nfs_pub.np_index != NULL) { 2626 FREE(nfs_pub.np_index, M_TEMP); 2627 nfs_pub.np_index = NULL; 2628 } 2629 } 2630 return (0); 2631 } 2632 2633 /* 2634 * Only one allowed at a time. 2635 */ 2636 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2637 return (EBUSY); 2638 2639 /* 2640 * Get real filehandle for root of exported FS. 2641 */ 2642 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2643 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2644 2645 if ((error = VFS_ROOT(mp, &rvp))) 2646 return (error); 2647 2648 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2649 return (error); 2650 2651 vput(rvp); 2652 2653 /* 2654 * If an indexfile was specified, pull it in. 2655 */ 2656 if (argp->ex_indexfile != NULL) { 2657 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2658 M_WAITOK); 2659 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2660 MAXNAMLEN, (size_t *)0); 2661 if (!error) { 2662 /* 2663 * Check for illegal filenames. 2664 */ 2665 for (cp = nfs_pub.np_index; *cp; cp++) { 2666 if (*cp == '/') { 2667 error = EINVAL; 2668 break; 2669 } 2670 } 2671 } 2672 if (error) { 2673 FREE(nfs_pub.np_index, M_TEMP); 2674 return (error); 2675 } 2676 } 2677 2678 nfs_pub.np_mount = mp; 2679 nfs_pub.np_valid = 1; 2680 return (0); 2681 } 2682 2683 struct netcred * 2684 vfs_export_lookup(mp, nep, nam) 2685 struct mount *mp; 2686 struct netexport *nep; 2687 struct sockaddr *nam; 2688 { 2689 struct netcred *np; 2690 struct radix_node_head *rnh; 2691 struct sockaddr *saddr; 2692 2693 np = NULL; 2694 if (mp->mnt_flag & MNT_EXPORTED) { 2695 /* 2696 * Lookup in the export list first. 
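 * i.e. consult the per-address-family radix tree built by
 * vfs_hang_addrlist(); if no specific address matches, the default
 * export (if any) is used below.  A caller such as the NFS server
 * typically treats a NULL return as an access failure, e.g. (sketch):
 *
 *	np = vfs_export_lookup(mp, nep, nam);
 *	if (np == NULL)
 *		error = EACCES;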
2697 */ 2698 if (nam != NULL) { 2699 saddr = nam; 2700 rnh = nep->ne_rtable[saddr->sa_family]; 2701 if (rnh != NULL) { 2702 np = (struct netcred *) 2703 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2704 rnh); 2705 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2706 np = NULL; 2707 } 2708 } 2709 /* 2710 * If no address match, use the default if it exists. 2711 */ 2712 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2713 np = &nep->ne_defexported; 2714 } 2715 return (np); 2716 } 2717 2718 /* 2719 * perform msync on all vnodes under a mount point 2720 * the mount point must be locked. 2721 */ 2722 void 2723 vfs_msync(struct mount *mp, int flags) 2724 { 2725 struct thread *td = curthread; /* XXX */ 2726 struct vnode *vp, *nvp; 2727 struct vm_object *obj; 2728 int tries; 2729 2730 tries = 5; 2731 lwkt_gettoken(&mntvnode_token); 2732 loop: 2733 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { 2734 if (vp->v_mount != mp) { 2735 if (--tries > 0) 2736 goto loop; 2737 break; 2738 } 2739 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2740 2741 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2742 continue; 2743 2744 /* 2745 * There could be hundreds of thousands of vnodes, we cannot 2746 * afford to do anything heavy-weight until we have a fairly 2747 * good indication that there is something to do. 2748 */ 2749 if ((vp->v_flag & VOBJDIRTY) && 2750 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { 2751 lwkt_reltoken(&mntvnode_token); 2752 if (!vget(vp, 2753 LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, td)) { 2754 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2755 vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); 2756 } 2757 vput(vp); 2758 } 2759 lwkt_gettoken(&mntvnode_token); 2760 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { 2761 if (--tries > 0) 2762 goto loop; 2763 break; 2764 } 2765 } 2766 } 2767 lwkt_reltoken(&mntvnode_token); 2768 } 2769 2770 /* 2771 * Create the VM object needed for VMIO and mmap support. This 2772 * is done for all VREG files in the system. Some filesystems might 2773 * afford the additional metadata buffering capability of the 2774 * VMIO code by making the device node be VMIO mode also. 2775 * 2776 * vp must be locked when vfs_object_create is called. 2777 */ 2778 int 2779 vfs_object_create(struct vnode *vp, struct thread *td) 2780 { 2781 return (VOP_CREATEVOBJECT(vp, td)); 2782 } 2783 2784 void 2785 vfree(vp) 2786 struct vnode *vp; 2787 { 2788 int s; 2789 2790 s = splbio(); 2791 lwkt_gettoken(&vnode_free_list_token); 2792 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2793 if (vp->v_flag & VAGE) { 2794 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2795 } else { 2796 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2797 } 2798 freevnodes++; 2799 lwkt_reltoken(&vnode_free_list_token); 2800 vp->v_flag &= ~VAGE; 2801 vp->v_flag |= VFREE; 2802 splx(s); 2803 } 2804 2805 void 2806 vbusy(vp) 2807 struct vnode *vp; 2808 { 2809 int s; 2810 2811 s = splbio(); 2812 lwkt_gettoken(&vnode_free_list_token); 2813 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2814 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2815 freevnodes--; 2816 lwkt_reltoken(&vnode_free_list_token); 2817 vp->v_flag &= ~(VFREE|VAGE); 2818 splx(s); 2819 } 2820 2821 /* 2822 * Record a process's interest in events which might happen to 2823 * a vnode. Because poll uses the historic select-style interface 2824 * internally, this routine serves as both the ``check for any 2825 * pending events'' and the ``record my interest in future events'' 2826 * functions. 
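 * The ``events'' argument is a poll(2)-style event mask; the return
 * value is the subset of those events that have already been posted
 * on the vnode.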
(These are done together, while the lock is held, 2827 * to avoid race conditions.) 2828 */ 2829 int 2830 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 2831 { 2832 lwkt_gettoken(&vp->v_pollinfo.vpi_token); 2833 if (vp->v_pollinfo.vpi_revents & events) { 2834 /* 2835 * This leaves events we are not interested 2836 * in available for the other process 2837 * which presumably had requested them 2838 * (otherwise they would never have been 2839 * recorded). 2840 */ 2841 events &= vp->v_pollinfo.vpi_revents; 2842 vp->v_pollinfo.vpi_revents &= ~events; 2843 2844 lwkt_reltoken(&vp->v_pollinfo.vpi_token); 2845 return events; 2846 } 2847 vp->v_pollinfo.vpi_events |= events; 2848 selrecord(td, &vp->v_pollinfo.vpi_selinfo); 2849 lwkt_reltoken(&vp->v_pollinfo.vpi_token); 2850 return 0; 2851 } 2852 2853 /* 2854 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2855 * it is possible for us to miss an event due to race conditions, but 2856 * that condition is expected to be rare, so for the moment it is the 2857 * preferred interface. 2858 */ 2859 void 2860 vn_pollevent(vp, events) 2861 struct vnode *vp; 2862 short events; 2863 { 2864 lwkt_gettoken(&vp->v_pollinfo.vpi_token); 2865 if (vp->v_pollinfo.vpi_events & events) { 2866 /* 2867 * We clear vpi_events so that we don't 2868 * call selwakeup() twice if two events are 2869 * posted before the polling process(es) is 2870 * awakened. This also ensures that we take at 2871 * most one selwakeup() if the polling process 2872 * is no longer interested. However, it does 2873 * mean that only one event can be noticed at 2874 * a time. (Perhaps we should only clear those 2875 * event bits which we note?) XXX 2876 */ 2877 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ 2878 vp->v_pollinfo.vpi_revents |= events; 2879 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2880 } 2881 lwkt_reltoken(&vp->v_pollinfo.vpi_token); 2882 } 2883 2884 /* 2885 * Wake up anyone polling on vp because it is being revoked. 2886 * This depends on dead_poll() returning POLLHUP for correct 2887 * behavior. 2888 */ 2889 void 2890 vn_pollgone(vp) 2891 struct vnode *vp; 2892 { 2893 lwkt_gettoken(&vp->v_pollinfo.vpi_token); 2894 if (vp->v_pollinfo.vpi_events) { 2895 vp->v_pollinfo.vpi_events = 0; 2896 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2897 } 2898 lwkt_reltoken(&vp->v_pollinfo.vpi_token); 2899 } 2900 2901 2902 2903 /* 2904 * Routine to create and manage a filesystem syncer vnode.
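 * The syncer vnode is a per-mount pseudo-vnode: its fsync method
 * (sync_fsync below) performs a lazy sync of the whole filesystem,
 * while most other operations are stubbed out or refused via the
 * operations vector that follows.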
2905 */ 2906 #define sync_close ((int (*) (struct vop_close_args *))nullop) 2907 static int sync_fsync (struct vop_fsync_args *); 2908 static int sync_inactive (struct vop_inactive_args *); 2909 static int sync_reclaim (struct vop_reclaim_args *); 2910 #define sync_lock ((int (*) (struct vop_lock_args *))vop_nolock) 2911 #define sync_unlock ((int (*) (struct vop_unlock_args *))vop_nounlock) 2912 static int sync_print (struct vop_print_args *); 2913 #define sync_islocked ((int(*) (struct vop_islocked_args *))vop_noislocked) 2914 2915 static vop_t **sync_vnodeop_p; 2916 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2917 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2918 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2919 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2920 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2921 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2922 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2923 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2924 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2925 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2926 { NULL, NULL } 2927 }; 2928 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2929 { &sync_vnodeop_p, sync_vnodeop_entries }; 2930 2931 VNODEOP_SET(sync_vnodeop_opv_desc); 2932 2933 /* 2934 * Create a new filesystem syncer vnode for the specified mount point. 2935 */ 2936 int 2937 vfs_allocate_syncvnode(mp) 2938 struct mount *mp; 2939 { 2940 struct vnode *vp; 2941 static long start, incr, next; 2942 int error; 2943 2944 /* Allocate a new vnode */ 2945 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2946 mp->mnt_syncer = NULL; 2947 return (error); 2948 } 2949 vp->v_type = VNON; 2950 /* 2951 * Place the vnode onto the syncer worklist. We attempt to 2952 * scatter them about on the list so that they will go off 2953 * at evenly distributed times even if all the filesystems 2954 * are mounted at once. 2955 */ 2956 next += incr; 2957 if (next == 0 || next > syncer_maxdelay) { 2958 start /= 2; 2959 incr /= 2; 2960 if (start == 0) { 2961 start = syncer_maxdelay / 2; 2962 incr = syncer_maxdelay; 2963 } 2964 next = start; 2965 } 2966 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2967 mp->mnt_syncer = vp; 2968 return (0); 2969 } 2970 2971 /* 2972 * Do a lazy sync of the filesystem. 2973 */ 2974 static int 2975 sync_fsync(ap) 2976 struct vop_fsync_args /* { 2977 struct vnode *a_vp; 2978 struct ucred *a_cred; 2979 int a_waitfor; 2980 struct thread *a_td; 2981 } */ *ap; 2982 { 2983 struct vnode *syncvp = ap->a_vp; 2984 struct mount *mp = syncvp->v_mount; 2985 struct thread *td = ap->a_td; 2986 int asyncflag; 2987 2988 /* 2989 * We only need to do something if this is a lazy evaluation. 2990 */ 2991 if (ap->a_waitfor != MNT_LAZY) 2992 return (0); 2993 2994 /* 2995 * Move ourselves to the back of the sync list. 2996 */ 2997 vn_syncer_add_to_worklist(syncvp, syncdelay); 2998 2999 /* 3000 * Walk the list of vnodes pushing all that are dirty and 3001 * not already on the sync list. 
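 * This is done by vfs_msync() followed by VFS_SYNC(..., MNT_LAZY, ...),
 * with MNT_ASYNC temporarily cleared on the mount for the duration of
 * the sync.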
3002 */ 3003 lwkt_gettoken(&mountlist_token); 3004 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_token, td) != 0) { 3005 lwkt_reltoken(&mountlist_token); 3006 return (0); 3007 } 3008 asyncflag = mp->mnt_flag & MNT_ASYNC; 3009 mp->mnt_flag &= ~MNT_ASYNC; 3010 vfs_msync(mp, MNT_NOWAIT); 3011 VFS_SYNC(mp, MNT_LAZY, td); 3012 if (asyncflag) 3013 mp->mnt_flag |= MNT_ASYNC; 3014 vfs_unbusy(mp, td); 3015 return (0); 3016 } 3017 3018 /* 3019 * The syncer vnode is no longer referenced. 3020 */ 3021 static int 3022 sync_inactive(ap) 3023 struct vop_inactive_args /* { 3024 struct vnode *a_vp; 3025 struct proc *a_p; 3026 } */ *ap; 3027 { 3028 3029 vgone(ap->a_vp); 3030 return (0); 3031 } 3032 3033 /* 3034 * The syncer vnode is no longer needed and is being decommissioned. 3035 * 3036 * Modifications to the worklist must be protected at splbio(). 3037 */ 3038 static int 3039 sync_reclaim(ap) 3040 struct vop_reclaim_args /* { 3041 struct vnode *a_vp; 3042 } */ *ap; 3043 { 3044 struct vnode *vp = ap->a_vp; 3045 int s; 3046 3047 s = splbio(); 3048 vp->v_mount->mnt_syncer = NULL; 3049 if (vp->v_flag & VONWORKLST) { 3050 LIST_REMOVE(vp, v_synclist); 3051 vp->v_flag &= ~VONWORKLST; 3052 } 3053 splx(s); 3054 3055 return (0); 3056 } 3057 3058 /* 3059 * Print out a syncer vnode. 3060 */ 3061 static int 3062 sync_print(ap) 3063 struct vop_print_args /* { 3064 struct vnode *a_vp; 3065 } */ *ap; 3066 { 3067 struct vnode *vp = ap->a_vp; 3068 3069 printf("syncer vnode"); 3070 if (vp->v_vnlock != NULL) 3071 lockmgr_printinfo(vp->v_vnlock); 3072 printf("\n"); 3073 return (0); 3074 } 3075 3076 /* 3077 * Extract the dev_t from a VBLK or VCHR. 3078 */ 3079 dev_t 3080 vn_todev(vp) 3081 struct vnode *vp; 3082 { 3083 if (vp->v_type != VBLK && vp->v_type != VCHR) 3084 return (NODEV); 3085 return (vp->v_rdev); 3086 } 3087 3088 /* 3089 * Check if the vnode represents a disk device. 3090 */ 3091 int 3092 vn_isdisk(vp, errp) 3093 struct vnode *vp; 3094 int *errp; 3095 { 3096 if (vp->v_type != VBLK && vp->v_type != VCHR) { 3097 if (errp != NULL) 3098 *errp = ENOTBLK; 3099 return (0); 3100 } 3101 if (vp->v_rdev == NULL) { 3102 if (errp != NULL) 3103 *errp = ENXIO; 3104 return (0); 3105 } 3106 if (!dev_dport(vp->v_rdev)) { 3107 if (errp != NULL) 3108 *errp = ENXIO; 3109 return (0); 3110 } 3111 if (!(dev_dflags(vp->v_rdev) & D_DISK)) { 3112 if (errp != NULL) 3113 *errp = ENOTBLK; 3114 return (0); 3115 } 3116 if (errp != NULL) 3117 *errp = 0; 3118 return (1); 3119 } 3120 3121 void 3122 NDFREE(ndp, flags) 3123 struct nameidata *ndp; 3124 const uint flags; 3125 { 3126 if (!(flags & NDF_NO_FREE_PNBUF) && 3127 (ndp->ni_cnd.cn_flags & HASBUF)) { 3128 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3129 ndp->ni_cnd.cn_flags &= ~HASBUF; 3130 } 3131 if (!(flags & NDF_NO_DVP_UNLOCK) && 3132 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3133 ndp->ni_dvp != ndp->ni_vp) 3134 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_td); 3135 if (!(flags & NDF_NO_DVP_RELE) && 3136 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3137 vrele(ndp->ni_dvp); 3138 ndp->ni_dvp = NULL; 3139 } 3140 if (!(flags & NDF_NO_VP_UNLOCK) && 3141 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3142 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_td); 3143 if (!(flags & NDF_NO_VP_RELE) && 3144 ndp->ni_vp) { 3145 vrele(ndp->ni_vp); 3146 ndp->ni_vp = NULL; 3147 } 3148 if (!(flags & NDF_NO_STARTDIR_RELE) && 3149 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3150 vrele(ndp->ni_startdir); 3151 ndp->ni_startdir = NULL; 3152 } 3153 } 3154
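/*
 * Illustrative sketch (not compiled in): how a caller wanting exclusive
 * access to a disk device might combine vn_isdisk() and vcount() above.
 * The function name and the single-open policy here are hypothetical
 * and are not part of this file's interface.
 */
#if 0
static int
example_exclusive_disk_check(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);		/* ENOTBLK or ENXIO from vn_isdisk() */
	if (vcount(vp) > 1)
		return (EBUSY);		/* another alias already references it */
	return (0);
}
#endif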