/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
34 */ 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/nlookup.h> 39 #include <sys/vnode.h> 40 #include <sys/mount.h> 41 #include <sys/fcntl.h> 42 #include <sys/buf.h> 43 #include <sys/uuid.h> 44 #include <sys/vfsops.h> 45 #include <sys/sysctl.h> 46 #include <sys/socket.h> 47 #include <sys/objcache.h> 48 49 #include <sys/proc.h> 50 #include <sys/namei.h> 51 #include <sys/mountctl.h> 52 #include <sys/dirent.h> 53 #include <sys/uio.h> 54 55 #include "hammer2.h" 56 #include "hammer2_disk.h" 57 #include "hammer2_mount.h" 58 #include "hammer2_lz4.h" 59 60 #include "zlib/hammer2_zlib.h" 61 62 #define REPORT_REFS_ERRORS 1 /* XXX remove me */ 63 64 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache"); 65 66 struct hammer2_sync_info { 67 int error; 68 int waitfor; 69 int pass; 70 }; 71 72 TAILQ_HEAD(hammer2_mntlist, hammer2_dev); 73 static struct hammer2_mntlist hammer2_mntlist; 74 75 struct hammer2_pfslist hammer2_pfslist; 76 struct hammer2_pfslist hammer2_spmplist; 77 struct lock hammer2_mntlk; 78 79 int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT; 80 int hammer2_debug; 81 int hammer2_xopgroups; 82 long hammer2_debug_inode; 83 int hammer2_cluster_meta_read = 1; /* physical read-ahead */ 84 int hammer2_cluster_data_read = 4; /* physical read-ahead */ 85 int hammer2_cluster_write = 0; /* physical write clustering */ 86 int hammer2_dedup_enable = 1; 87 int hammer2_always_compress = 0; /* always try to compress */ 88 int hammer2_inval_enable = 0; 89 int hammer2_flush_pipe = 100; 90 int hammer2_dio_count; 91 int hammer2_dio_limit = 256; 92 int hammer2_bulkfree_tps = 5000; 93 int hammer2_worker_rmask = 3; 94 long hammer2_chain_allocs; 95 long hammer2_chain_frees; 96 long hammer2_limit_dirty_chains; 97 long hammer2_limit_dirty_inodes; 98 long hammer2_count_modified_chains; 99 long hammer2_iod_invals; 100 long hammer2_iod_file_read; 101 long hammer2_iod_meta_read; 102 long hammer2_iod_indr_read; 103 long hammer2_iod_fmap_read; 104 long hammer2_iod_volu_read; 105 long hammer2_iod_file_write; 106 long hammer2_iod_file_wembed; 107 long hammer2_iod_file_wzero; 108 long hammer2_iod_file_wdedup; 109 long hammer2_iod_meta_write; 110 long hammer2_iod_indr_write; 111 long hammer2_iod_fmap_write; 112 long hammer2_iod_volu_write; 113 long hammer2_iod_inode_creates; 114 long hammer2_iod_inode_deletes; 115 116 MALLOC_DECLARE(M_HAMMER2_CBUFFER); 117 MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer", 118 "Buffer used for compression."); 119 120 MALLOC_DECLARE(M_HAMMER2_DEBUFFER); 121 MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer", 122 "Buffer used for decompression."); 123 124 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem"); 125 126 SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD, 127 &hammer2_supported_version, 0, ""); 128 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW, 129 &hammer2_debug, 0, ""); 130 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, debug_inode, CTLFLAG_RW, 131 &hammer2_debug_inode, 0, ""); 132 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW, 133 &hammer2_cluster_meta_read, 0, ""); 134 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW, 135 &hammer2_cluster_data_read, 0, ""); 136 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW, 137 &hammer2_cluster_write, 0, ""); 138 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW, 139 &hammer2_dedup_enable, 0, ""); 140 SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW, 141 
&hammer2_always_compress, 0, ""); 142 SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW, 143 &hammer2_inval_enable, 0, ""); 144 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, 145 &hammer2_flush_pipe, 0, ""); 146 SYSCTL_INT(_vfs_hammer2, OID_AUTO, worker_rmask, CTLFLAG_RW, 147 &hammer2_worker_rmask, 0, ""); 148 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW, 149 &hammer2_bulkfree_tps, 0, ""); 150 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW, 151 &hammer2_chain_allocs, 0, ""); 152 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW, 153 &hammer2_chain_frees, 0, ""); 154 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW, 155 &hammer2_limit_dirty_chains, 0, ""); 156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW, 157 &hammer2_limit_dirty_inodes, 0, ""); 158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW, 159 &hammer2_count_modified_chains, 0, ""); 160 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, 161 &hammer2_dio_count, 0, ""); 162 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW, 163 &hammer2_dio_limit, 0, ""); 164 165 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW, 166 &hammer2_iod_invals, 0, ""); 167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW, 168 &hammer2_iod_file_read, 0, ""); 169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW, 170 &hammer2_iod_meta_read, 0, ""); 171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW, 172 &hammer2_iod_indr_read, 0, ""); 173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW, 174 &hammer2_iod_fmap_read, 0, ""); 175 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW, 176 &hammer2_iod_volu_read, 0, ""); 177 178 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW, 179 &hammer2_iod_file_write, 0, ""); 180 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW, 181 &hammer2_iod_file_wembed, 0, ""); 182 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW, 183 &hammer2_iod_file_wzero, 0, ""); 184 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW, 185 &hammer2_iod_file_wdedup, 0, ""); 186 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW, 187 &hammer2_iod_meta_write, 0, ""); 188 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW, 189 &hammer2_iod_indr_write, 0, ""); 190 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, 191 &hammer2_iod_fmap_write, 0, ""); 192 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, 193 &hammer2_iod_volu_write, 0, ""); 194 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW, 195 &hammer2_iod_inode_creates, 0, ""); 196 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW, 197 &hammer2_iod_inode_deletes, 0, ""); 198 199 long hammer2_process_icrc32; 200 long hammer2_process_xxhash64; 201 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW, 202 &hammer2_process_icrc32, 0, ""); 203 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW, 204 &hammer2_process_xxhash64, 0, ""); 205 206 static int hammer2_vfs_init(struct vfsconf *conf); 207 static int hammer2_vfs_uninit(struct vfsconf *vfsp); 208 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 209 struct ucred *cred); 210 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *, 211 struct vnode *, struct ucred *); 212 static int hammer2_recovery(hammer2_dev_t *hmp); 213 static int 
hammer2_vfs_unmount(struct mount *mp, int mntflags); 214 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp); 215 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, 216 struct ucred *cred); 217 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, 218 struct ucred *cred); 219 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 220 struct fid *fhp, struct vnode **vpp); 221 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp); 222 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 223 int *exflagsp, struct ucred **credanonp); 224 static int hammer2_vfs_modifying(struct mount *mp); 225 226 static int hammer2_install_volume_header(hammer2_dev_t *hmp); 227 #if 0 228 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data); 229 #endif 230 231 static void hammer2_update_pmps(hammer2_dev_t *hmp); 232 233 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp); 234 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, 235 hammer2_dev_t *hmp); 236 static int hammer2_fixup_pfses(hammer2_dev_t *hmp); 237 238 /* 239 * HAMMER2 vfs operations. 240 */ 241 static struct vfsops hammer2_vfsops = { 242 .vfs_flags = 0, 243 .vfs_init = hammer2_vfs_init, 244 .vfs_uninit = hammer2_vfs_uninit, 245 .vfs_sync = hammer2_vfs_sync, 246 .vfs_mount = hammer2_vfs_mount, 247 .vfs_unmount = hammer2_vfs_unmount, 248 .vfs_root = hammer2_vfs_root, 249 .vfs_statfs = hammer2_vfs_statfs, 250 .vfs_statvfs = hammer2_vfs_statvfs, 251 .vfs_vget = hammer2_vfs_vget, 252 .vfs_vptofh = hammer2_vfs_vptofh, 253 .vfs_fhtovp = hammer2_vfs_fhtovp, 254 .vfs_checkexp = hammer2_vfs_checkexp, 255 .vfs_modifying = hammer2_vfs_modifying 256 }; 257 258 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", ""); 259 260 VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE); 261 MODULE_VERSION(hammer2, 1); 262 263 static 264 int 265 hammer2_vfs_init(struct vfsconf *conf) 266 { 267 static struct objcache_malloc_args margs_read; 268 static struct objcache_malloc_args margs_write; 269 static struct objcache_malloc_args margs_vop; 270 271 int error; 272 273 error = 0; 274 kmalloc_raise_limit(M_HAMMER2, 0); /* unlimited */ 275 276 /* 277 * hammer2_xopgroups must be even and is most optimal if 278 * 2 x ncpus so strategy functions can be queued to the same 279 * cpu. 280 */ 281 hammer2_xopgroups = HAMMER2_XOPGROUPS_MIN; 282 if (hammer2_xopgroups < ncpus * 2) 283 hammer2_xopgroups = ncpus * 2; 284 285 /* 286 * A large DIO cache is needed to retain dedup enablement masks. 287 * The bulkfree code clears related masks as part of the disk block 288 * recycling algorithm, preventing it from being used for a later 289 * dedup. 290 * 291 * NOTE: A large buffer cache can actually interfere with dedup 292 * operation because we dedup based on media physical buffers 293 * and not logical buffers. Try to make the DIO case large 294 * enough to avoid this problem, but also cap it. 
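         *
         * The clamp that follows derives the default from nbuf and caps it;
         * the resulting value is exported read/write through the
         * vfs.hammer2.dio_limit sysctl declared earlier in this file.  As a
         * purely illustrative (editorial) example of adjusting it at runtime
         * from userland:
         *
         *      sysctl vfs.hammer2.dio_limit            (inspect)
         *      sysctl vfs.hammer2.dio_limit=200000     (override the default)
         *
         * The appropriate value is workload dependent; the code below only
         * establishes the default.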
         */
        hammer2_dio_limit = nbuf * 2;
        if (hammer2_dio_limit > 100000)
                hammer2_dio_limit = 100000;

        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

        margs_read.objsize = 65536;
        margs_read.mtype = M_HAMMER2_DEBUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = M_HAMMER2_CBUFFER;

        margs_vop.objsize = sizeof(hammer2_xop_t);
        margs_vop.mtype = M_HAMMER2;

        /*
         * Note that for the XOPS cache we want backing store allocations
         * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
         * confusion), so use the backing store function that does it.  This
         * means that initial XOPS objects are zeroed but REUSED objects are
         * not.  So we are responsible for cleaning the object up sufficiently
         * for our needs before objcache_put()ing it back (typically just the
         * FIFO indices).
         */
        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_write);
        cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc_zero,
                                objcache_malloc_free,
                                &margs_vop);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);
        TAILQ_INIT(&hammer2_spmplist);

        hammer2_limit_dirty_chains = maxvnodes / 10;
        if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
                hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
        if (hammer2_limit_dirty_chains < 1000)
                hammer2_limit_dirty_chains = 1000;

        hammer2_limit_dirty_inodes = maxvnodes / 25;
        if (hammer2_limit_dirty_inodes < 100)
                hammer2_limit_dirty_inodes = 100;
        if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
                hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;

        return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        objcache_destroy(cache_xops);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate or reference the pmp structure
 * for PFS cluster mounts and the spmp structure for media (hmp) structures.
 * The pmp can be passed in or loaded by this function using the chain and
 * inode data.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field.
 * (typically frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_chain_t *chain,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        int count;
        int i;
        int j;

        pmp = NULL;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         *
         * If the device is mounted in local mode all PFSs are considered
         * independent and not part of any cluster (for debugging only).
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (force_local != pmp->force_local)
                                continue;
                        if (force_local == NULL &&
                            bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                break;
                        } else if (force_local && pmp->pfs_names[0] &&
                            strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
                                break;
                        }
                }
        }

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                pmp->force_local = force_local;
                hammer2_trans_manage_init(pmp);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                lockinit(&pmp->lock_nlink, "h2nlink", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                spin_init(&pmp->xop_spin, "h2xop");
                spin_init(&pmp->lru_spin, "h2lru");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->syncq);
                TAILQ_INIT(&pmp->depq);
                TAILQ_INIT(&pmp->lru_list);
                spin_init(&pmp->list_spin, "h2pfsalloc_list");

                /*
                 * Save the last media transaction id for the flusher.  Set
                 * initial
                 */
                if (ripdata) {
                        pmp->pfs_clid = ripdata->meta.pfs_clid;
                        TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
                } else {
                        pmp->flags |= HAMMER2_PMPF_SPMP;
                        TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
                }

                /*
                 * The synchronization thread may start too early, make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
                                         HAMMER2_THREAD_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode and any missing XOP helper threads.
         */
        if ((iroot = pmp->iroot) == NULL) {
                iroot = hammer2_inode_get(pmp, NULL, 1, -1);
                if (ripdata)
                        iroot->meta = ripdata->meta;
                pmp->iroot = iroot;
                hammer2_inode_ref(iroot);
                hammer2_inode_unlock(iroot);
        }

        /*
         * Stop here if no chain is passed in.
         */
        if (chain == NULL)
                goto done;

        /*
         * When a chain is passed in we must add it to the PFS's root
         * inode, update pmp->pfs_types[], and update the synchronization
         * threads.
         *
         * When forcing local mode, mark the PFS as a MASTER regardless.
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots but doing so might
         * confuse running code.  XXX
         */
        hammer2_inode_ref(iroot);
        hammer2_mtx_ex(&iroot->lock);
        j = iroot->cluster.nchains;

        if (j == HAMMER2_MAXCLUSTER) {
                kprintf("hammer2_mount: cluster full!\n");
                /* XXX fatal error? */
        } else {
                KKASSERT(chain->pmp == NULL);
                chain->pmp = pmp;
                hammer2_chain_ref(chain);
                iroot->cluster.array[j].chain = chain;
                if (force_local)
                        pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
                else
                        pmp->pfs_types[j] = ripdata->meta.pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
                pmp->pfs_hmps[j] = chain->hmp;
                hammer2_spin_ex(&pmp->inum_spin);
                pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset;
                hammer2_spin_unex(&pmp->inum_spin);

                /*
                 * If the PFS is already mounted we must account
                 * for the mount_count here.
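                 *
                 * (Editorial note: mount_count on an hmp counts the cluster
                 * elements of mounted PFSs that are backed by that device.
                 * hammer2_mount_helper() and hammer2_unmount_helper() later
                 * in this file apply the same per-element adjustment when
                 * pmp->mp is set or cleared, e.g.
                 *
                 *      ++rchain->hmp->mount_count;     on mount
                 *      --rchain->hmp->mount_count;     on unmount
                 *
                 * so a cluster element attached after the fact has to catch
                 * up here.)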
517 */ 518 if (pmp->mp) 519 ++chain->hmp->mount_count; 520 521 /* 522 * May have to fixup dirty chain tracking. Previous 523 * pmp was NULL so nothing to undo. 524 */ 525 if (chain->flags & HAMMER2_CHAIN_MODIFIED) 526 hammer2_pfs_memory_inc(pmp); 527 ++j; 528 } 529 iroot->cluster.nchains = j; 530 531 /* 532 * Update nmasters from any PFS inode which is part of the cluster. 533 * It is possible that this will result in a value which is too 534 * high. MASTER PFSs are authoritative for pfs_nmasters and will 535 * override this value later on. 536 * 537 * (This informs us of masters that might not currently be 538 * discoverable by this mount). 539 */ 540 if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) { 541 pmp->pfs_nmasters = ripdata->meta.pfs_nmasters; 542 } 543 544 /* 545 * Count visible masters. Masters are usually added with 546 * ripdata->meta.pfs_nmasters set to 1. This detects when there 547 * are more (XXX and must update the master inodes). 548 */ 549 count = 0; 550 for (i = 0; i < iroot->cluster.nchains; ++i) { 551 if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) 552 ++count; 553 } 554 if (pmp->pfs_nmasters < count) 555 pmp->pfs_nmasters = count; 556 557 /* 558 * Create missing synchronization and support threads. 559 * 560 * Single-node masters (including snapshots) have nothing to 561 * synchronize and do not require this thread. 562 * 563 * Multi-node masters or any number of soft masters, slaves, copy, 564 * or other PFS types need the thread. 565 * 566 * Each thread is responsible for its particular cluster index. 567 * We use independent threads so stalls or mismatches related to 568 * any given target do not affect other targets. 569 */ 570 for (i = 0; i < iroot->cluster.nchains; ++i) { 571 /* 572 * Single-node masters (including snapshots) have nothing 573 * to synchronize and will make direct xops support calls, 574 * thus they do not require this thread. 575 * 576 * Note that there can be thousands of snapshots. We do not 577 * want to create thousands of threads. 578 */ 579 if (pmp->pfs_nmasters <= 1 && 580 pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) { 581 continue; 582 } 583 584 /* 585 * Sync support thread 586 */ 587 if (pmp->sync_thrs[i].td == NULL) { 588 hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL, 589 "h2nod", i, -1, 590 hammer2_primary_sync_thread); 591 } 592 } 593 594 /* 595 * Create missing Xop threads 596 * 597 * NOTE: We create helper threads for all mounted PFSs or any 598 * PFSs with 2+ nodes (so the sync thread can update them, 599 * even if not mounted). 600 */ 601 if (pmp->mp || iroot->cluster.nchains >= 2) 602 hammer2_xop_helper_create(pmp); 603 604 hammer2_mtx_unlock(&iroot->lock); 605 hammer2_inode_drop(iroot); 606 done: 607 return pmp; 608 } 609 610 /* 611 * Deallocate an element of a probed PFS. If destroying and this is a 612 * MASTER, adjust nmasters. 613 * 614 * This function does not physically destroy the PFS element in its device 615 * under the super-root (see hammer2_ioctl_pfs_delete()). 616 */ 617 void 618 hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying) 619 { 620 hammer2_inode_t *iroot; 621 hammer2_chain_t *chain; 622 int j; 623 624 /* 625 * Cleanup our reference on iroot. iroot is (should) not be needed 626 * by the flush code. 627 */ 628 iroot = pmp->iroot; 629 if (iroot) { 630 /* 631 * Stop synchronizing 632 * 633 * XXX flush after acquiring the iroot lock. 634 * XXX clean out the cluster index from all inode structures. 
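                 *
                 * Editorial sketch of the teardown order implemented below
                 * (condensed; the real code also clears pfs_types[] and sets
                 * HAMMER2_CHAIN_RELEASE before dropping the chain):
                 *
                 *      hammer2_thr_delete(&pmp->sync_thrs[clindex]);
                 *      hammer2_mtx_ex(&iroot->lock);
                 *      chain = iroot->cluster.array[clindex].chain;
                 *      iroot->cluster.array[clindex].chain = NULL;
                 *      hammer2_mtx_unlock(&iroot->lock);
                 *      if (chain)
                 *              hammer2_chain_drop(chain);
                 *      for (j = 0; j < hammer2_xopgroups; ++j)
                 *              hammer2_thr_delete(&pmp->xop_groups[j].thrs[clindex]);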
635 */ 636 hammer2_thr_delete(&pmp->sync_thrs[clindex]); 637 638 /* 639 * Remove the cluster index from the group. If destroying 640 * the PFS and this is a master, adjust pfs_nmasters. 641 */ 642 hammer2_mtx_ex(&iroot->lock); 643 chain = iroot->cluster.array[clindex].chain; 644 iroot->cluster.array[clindex].chain = NULL; 645 646 switch(pmp->pfs_types[clindex]) { 647 case HAMMER2_PFSTYPE_MASTER: 648 if (destroying && pmp->pfs_nmasters > 0) 649 --pmp->pfs_nmasters; 650 /* XXX adjust ripdata->meta.pfs_nmasters */ 651 break; 652 default: 653 break; 654 } 655 pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE; 656 657 hammer2_mtx_unlock(&iroot->lock); 658 659 /* 660 * Release the chain. 661 */ 662 if (chain) { 663 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 664 hammer2_chain_drop(chain); 665 } 666 667 /* 668 * Terminate all XOP threads for the cluster index. 669 */ 670 if (pmp->xop_groups) { 671 for (j = 0; j < hammer2_xopgroups; ++j) { 672 hammer2_thr_delete( 673 &pmp->xop_groups[j].thrs[clindex]); 674 } 675 } 676 } 677 } 678 679 /* 680 * Destroy a PFS, typically only occurs after the last mount on a device 681 * has gone away. 682 */ 683 static void 684 hammer2_pfsfree(hammer2_pfs_t *pmp) 685 { 686 hammer2_inode_t *iroot; 687 hammer2_chain_t *chain; 688 int chains_still_present = 0; 689 int i; 690 int j; 691 692 /* 693 * Cleanup our reference on iroot. iroot is (should) not be needed 694 * by the flush code. 695 */ 696 if (pmp->flags & HAMMER2_PMPF_SPMP) 697 TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry); 698 else 699 TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry); 700 701 /* 702 * Cleanup chains remaining on LRU list. 703 */ 704 hammer2_spin_ex(&pmp->lru_spin); 705 while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) { 706 KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU); 707 atomic_add_int(&pmp->lru_count, -1); 708 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 709 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 710 hammer2_chain_ref(chain); 711 hammer2_spin_unex(&pmp->lru_spin); 712 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 713 hammer2_chain_drop(chain); 714 hammer2_spin_ex(&pmp->lru_spin); 715 } 716 hammer2_spin_unex(&pmp->lru_spin); 717 718 /* 719 * Clean up iroot 720 */ 721 iroot = pmp->iroot; 722 if (iroot) { 723 for (i = 0; i < iroot->cluster.nchains; ++i) { 724 hammer2_thr_delete(&pmp->sync_thrs[i]); 725 if (pmp->xop_groups) { 726 for (j = 0; j < hammer2_xopgroups; ++j) 727 hammer2_thr_delete( 728 &pmp->xop_groups[j].thrs[i]); 729 } 730 chain = iroot->cluster.array[i].chain; 731 if (chain && !RB_EMPTY(&chain->core.rbtree)) { 732 kprintf("hammer2: Warning pmp %p still " 733 "has active chains\n", pmp); 734 chains_still_present = 1; 735 } 736 } 737 #if REPORT_REFS_ERRORS 738 if (iroot->refs != 1) 739 kprintf("PMP->IROOT %p REFS WRONG %d\n", 740 iroot, iroot->refs); 741 #else 742 KKASSERT(iroot->refs == 1); 743 #endif 744 /* ref for iroot */ 745 hammer2_inode_drop(iroot); 746 pmp->iroot = NULL; 747 } 748 749 /* 750 * Free remaining pmp resources 751 */ 752 if (chains_still_present) { 753 kprintf("hammer2: cannot free pmp %p, still in use\n", pmp); 754 } else { 755 kmalloc_destroy(&pmp->mmsg); 756 kmalloc_destroy(&pmp->minode); 757 kfree(pmp, M_HAMMER2); 758 } 759 } 760 761 /* 762 * Remove all references to hmp from the pfs list. Any PFS which becomes 763 * empty is terminated and freed. 764 * 765 * XXX inefficient. 
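 *
 * As an editorial outline (condensed from the code below), each pass over
 * the selected list restarts whenever a pmp is freed:
 *
 *      TAILQ_FOREACH(pmp, wlist, mntentry) {
 *              if (no cluster slot of pmp references hmp)
 *                      continue;
 *              hammer2_vfs_sync_pmp(pmp, MNT_WAIT);
 *              freeze the sync and XOP threads;
 *              hammer2_mtx_ex(&iroot->lock);
 *              strip every slot whose pfs_hmps[] == hmp;
 *              hammer2_mtx_unlock(&iroot->lock);
 *              if (iroot->cluster.nchains == 0) {
 *                      hammer2_pfsfree(pmp);
 *                      goto again;             (list was modified)
 *              }
 *              remaster and unfreeze the surviving threads;
 *      }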
766 */ 767 static void 768 hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which) 769 { 770 hammer2_pfs_t *pmp; 771 hammer2_inode_t *iroot; 772 hammer2_chain_t *rchain; 773 int i; 774 int j; 775 struct hammer2_pfslist *wlist; 776 777 if (which == 0) 778 wlist = &hammer2_pfslist; 779 else 780 wlist = &hammer2_spmplist; 781 again: 782 TAILQ_FOREACH(pmp, wlist, mntentry) { 783 if ((iroot = pmp->iroot) == NULL) 784 continue; 785 786 /* 787 * Determine if this PFS is affected. If it is we must 788 * freeze all management threads and lock its iroot. 789 * 790 * Freezing a management thread forces it idle, operations 791 * in-progress will be aborted and it will have to start 792 * over again when unfrozen, or exit if told to exit. 793 */ 794 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 795 if (pmp->pfs_hmps[i] == hmp) 796 break; 797 } 798 if (i == HAMMER2_MAXCLUSTER) 799 continue; 800 801 hammer2_vfs_sync_pmp(pmp, MNT_WAIT); 802 803 /* 804 * Make sure all synchronization threads are locked 805 * down. 806 */ 807 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 808 if (pmp->pfs_hmps[i] == NULL) 809 continue; 810 hammer2_thr_freeze_async(&pmp->sync_thrs[i]); 811 if (pmp->xop_groups) { 812 for (j = 0; j < hammer2_xopgroups; ++j) { 813 hammer2_thr_freeze_async( 814 &pmp->xop_groups[j].thrs[i]); 815 } 816 } 817 } 818 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 819 if (pmp->pfs_hmps[i] == NULL) 820 continue; 821 hammer2_thr_freeze(&pmp->sync_thrs[i]); 822 if (pmp->xop_groups) { 823 for (j = 0; j < hammer2_xopgroups; ++j) { 824 hammer2_thr_freeze( 825 &pmp->xop_groups[j].thrs[i]); 826 } 827 } 828 } 829 830 /* 831 * Lock the inode and clean out matching chains. 832 * Note that we cannot use hammer2_inode_lock_*() 833 * here because that would attempt to validate the 834 * cluster that we are in the middle of ripping 835 * apart. 836 * 837 * WARNING! We are working directly on the inodes 838 * embedded cluster. 839 */ 840 hammer2_mtx_ex(&iroot->lock); 841 842 /* 843 * Remove the chain from matching elements of the PFS. 844 */ 845 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 846 if (pmp->pfs_hmps[i] != hmp) 847 continue; 848 hammer2_thr_delete(&pmp->sync_thrs[i]); 849 if (pmp->xop_groups) { 850 for (j = 0; j < hammer2_xopgroups; ++j) { 851 hammer2_thr_delete( 852 &pmp->xop_groups[j].thrs[i]); 853 } 854 } 855 rchain = iroot->cluster.array[i].chain; 856 iroot->cluster.array[i].chain = NULL; 857 pmp->pfs_types[i] = 0; 858 if (pmp->pfs_names[i]) { 859 kfree(pmp->pfs_names[i], M_HAMMER2); 860 pmp->pfs_names[i] = NULL; 861 } 862 if (rchain) { 863 hammer2_chain_drop(rchain); 864 /* focus hint */ 865 if (iroot->cluster.focus == rchain) 866 iroot->cluster.focus = NULL; 867 } 868 pmp->pfs_hmps[i] = NULL; 869 } 870 hammer2_mtx_unlock(&iroot->lock); 871 872 /* 873 * Cleanup trailing chains. Gaps may remain. 874 */ 875 for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) { 876 if (pmp->pfs_hmps[i]) 877 break; 878 } 879 iroot->cluster.nchains = i + 1; 880 881 /* 882 * If the PMP has no elements remaining we can destroy it. 883 * (this will transition management threads from frozen->exit). 884 */ 885 if (iroot->cluster.nchains == 0) { 886 /* 887 * If this was the hmp's spmp, we need to clean 888 * a little more stuff out. 
                         */
                        if (hmp->spmp == pmp) {
                                hmp->spmp = NULL;
                                hmp->vchain.pmp = NULL;
                                hmp->fchain.pmp = NULL;
                        }

                        /*
                         * Free the pmp and restart the loop
                         */
                        KKASSERT(TAILQ_EMPTY(&pmp->syncq));
                        KKASSERT(TAILQ_EMPTY(&pmp->depq));
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_remaster(&pmp->sync_thrs[i]);
                        hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j) {
                                        hammer2_thr_remaster(
                                                &pmp->xop_groups[j].thrs[i]);
                                        hammer2_thr_unfreeze(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media.
 *
 * mountroot
 *      mp              mount point structure
 *      path            NULL
 *      data            <unused>
 *      cred            <unused>
 *
 * mount
 *      mp              mount point structure
 *      path            path to mount point
 *      data            pointer to argument structure in user space
 *          volume      volume path (device@LABEL form)
 *          hflags      user mount flags
 *      cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_dev_t *force_local;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                ksnprintf(devstr, sizeof(devstr), "%s",
                          mp->mnt_stat.f_mntfromname);
                kprintf("hammer2_mount: root '%s'\n", devstr);
                done = strlen(devstr) + 1;
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);
                kprintf("hammer2_mount: '%s'\n", devstr);
        }

        /*
         * Extract device and label, automatically mount @BOOT, @ROOT, or
         * @DATA if no label is specified, based on the partition id.  Error
         * out if no label or device (with partition id) is specified.  This
         * is strictly a convenience to match the default label created by
         * newfs_hammer2; our preference is that a label always be specified.
         *
         * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command
         *       that does not specify a device, as long as some H2 label
         *       has already been mounted from that device.  This makes
         *       mounting snapshots a lot easier.
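         *
         * Illustrative invocations (editorial; the device and label names
         * are made up):
         *
         *      mount_hammer2 /dev/da0s1d@DATA /mnt     explicit label
         *      mount_hammer2 /dev/da0s1a /boot         no label, partition
         *                                              'a' defaults to BOOT
         *      mount_hammer2 @SNAP-001 /mnt/snap       no device; resolved
         *                                              against an already
         *                                              mounted H2 device
         *
         * The partition-id defaulting ('a' -> BOOT, 'd' -> ROOT, otherwise
         * DATA) is implemented just below.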
1016 */ 1017 dev = devstr; 1018 label = strchr(devstr, '@'); 1019 if (label && ((label + 1) - dev) > done) { 1020 kprintf("hammer2: mount: bad label %s/%zd\n", 1021 devstr, done); 1022 return (EINVAL); 1023 } 1024 if (label == NULL || label[1] == 0) { 1025 char slice; 1026 1027 if (label == NULL) 1028 label = devstr + strlen(devstr); 1029 else 1030 *label = '\0'; /* clean up trailing @ */ 1031 1032 slice = label[-1]; 1033 switch(slice) { 1034 case 'a': 1035 label = "BOOT"; 1036 break; 1037 case 'd': 1038 label = "ROOT"; 1039 break; 1040 default: 1041 label = "DATA"; 1042 break; 1043 } 1044 } else { 1045 *label = '\0'; 1046 label++; 1047 } 1048 1049 kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n", 1050 dev, label, (mp->mnt_flag & MNT_RDONLY)); 1051 1052 if (mp->mnt_flag & MNT_UPDATE) { 1053 /* 1054 * Update mount. Note that pmp->iroot->cluster is 1055 * an inode-embedded cluster and thus cannot be 1056 * directly locked. 1057 * 1058 * XXX HAMMER2 needs to implement NFS export via 1059 * mountctl. 1060 */ 1061 hammer2_cluster_t *cluster; 1062 1063 pmp = MPTOPMP(mp); 1064 pmp->hflags = info.hflags; 1065 cluster = &pmp->iroot->cluster; 1066 for (i = 0; i < cluster->nchains; ++i) { 1067 if (cluster->array[i].chain == NULL) 1068 continue; 1069 hmp = cluster->array[i].chain->hmp; 1070 devvp = hmp->devvp; 1071 error = hammer2_remount(hmp, mp, path, 1072 devvp, cred); 1073 if (error) 1074 break; 1075 } 1076 1077 return error; 1078 } 1079 1080 /* 1081 * HMP device mount 1082 * 1083 * If a path is specified and dev is not an empty string, lookup the 1084 * name and verify that it referes to a block device. 1085 * 1086 * If a path is specified and dev is an empty string we fall through 1087 * and locate the label in the hmp search. 1088 */ 1089 if (path && *dev != 0) { 1090 error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW); 1091 if (error == 0) 1092 error = nlookup(&nd); 1093 if (error == 0) 1094 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp); 1095 nlookup_done(&nd); 1096 } else if (path == NULL) { 1097 /* root mount */ 1098 cdev_t cdev = kgetdiskbyname(dev); 1099 error = bdevvp(cdev, &devvp); 1100 if (error) 1101 kprintf("hammer2: cannot find '%s'\n", dev); 1102 } else { 1103 /* 1104 * We will locate the hmp using the label in the hmp loop. 1105 */ 1106 error = 0; 1107 } 1108 1109 /* 1110 * Make sure its a block device. Do not check to see if it is 1111 * already mounted until we determine that its a fresh H2 device. 1112 */ 1113 if (error == 0 && devvp) { 1114 vn_isdisk(devvp, &error); 1115 } 1116 1117 /* 1118 * Determine if the device has already been mounted. After this 1119 * check hmp will be non-NULL if we are doing the second or more 1120 * hammer2 mounts from the same device. 1121 */ 1122 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1123 if (devvp) { 1124 /* 1125 * Match the device. Due to the way devfs works, 1126 * we may not be able to directly match the vnode pointer, 1127 * so also check to see if the underlying device matches. 1128 */ 1129 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 1130 if (hmp->devvp == devvp) 1131 break; 1132 if (devvp->v_rdev && 1133 hmp->devvp->v_rdev == devvp->v_rdev) { 1134 break; 1135 } 1136 } 1137 1138 /* 1139 * If no match this may be a fresh H2 mount, make sure 1140 * the device is not mounted on anything else. 1141 */ 1142 if (hmp == NULL) 1143 error = vfs_mountedon(devvp); 1144 } else if (error == 0) { 1145 /* 1146 * Match the label to a pmp already probed. 
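                 *
                 * This is the branch taken by the device-less form noted
                 * above (e.g. 'mount @SNAP-001 /mnt/snap'): the label is
                 * matched against the pfs_names[] of every probed pmp and
                 * the backing device is recovered from the matching
                 * pfs_hmps[] slot, roughly:
                 *
                 *      if (strcmp(pmp->pfs_names[i], label) == 0)
                 *              hmp = pmp->pfs_hmps[i];
                 *
                 * (Editorial condensation of the loop below.)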
1147 */ 1148 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 1149 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 1150 if (pmp->pfs_names[i] && 1151 strcmp(pmp->pfs_names[i], label) == 0) { 1152 hmp = pmp->pfs_hmps[i]; 1153 break; 1154 } 1155 } 1156 if (hmp) 1157 break; 1158 } 1159 if (hmp == NULL) 1160 error = ENOENT; 1161 } 1162 1163 /* 1164 * Open the device if this isn't a secondary mount and construct 1165 * the H2 device mount (hmp). 1166 */ 1167 if (hmp == NULL) { 1168 hammer2_chain_t *schain; 1169 hammer2_xid_t xid; 1170 hammer2_xop_head_t xop; 1171 1172 if (error == 0 && vcount(devvp) > 0) { 1173 kprintf("Primary device already has references\n"); 1174 error = EBUSY; 1175 } 1176 1177 /* 1178 * Now open the device 1179 */ 1180 if (error == 0) { 1181 ronly = ((mp->mnt_flag & MNT_RDONLY) != 0); 1182 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1183 error = vinvalbuf(devvp, V_SAVE, 0, 0); 1184 if (error == 0) { 1185 error = VOP_OPEN(devvp, 1186 (ronly ? FREAD : FREAD | FWRITE), 1187 FSCRED, NULL); 1188 } 1189 vn_unlock(devvp); 1190 } 1191 if (error && devvp) { 1192 vrele(devvp); 1193 devvp = NULL; 1194 } 1195 if (error) { 1196 lockmgr(&hammer2_mntlk, LK_RELEASE); 1197 return error; 1198 } 1199 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO); 1200 ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev); 1201 hmp->ronly = ronly; 1202 hmp->devvp = devvp; 1203 hmp->hflags = info.hflags & HMNT2_DEVFLAGS; 1204 kmalloc_create(&hmp->mchain, "HAMMER2-chains"); 1205 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry); 1206 RB_INIT(&hmp->iotree); 1207 spin_init(&hmp->io_spin, "h2mount_io"); 1208 spin_init(&hmp->list_spin, "h2mount_list"); 1209 1210 lockinit(&hmp->vollk, "h2vol", 0, 0); 1211 lockinit(&hmp->bulklk, "h2bulk", 0, 0); 1212 lockinit(&hmp->bflock, "h2bflk", 0, 0); 1213 1214 /* 1215 * vchain setup. vchain.data is embedded. 1216 * vchain.refs is initialized and will never drop to 0. 1217 * 1218 * NOTE! voldata is not yet loaded. 1219 */ 1220 hmp->vchain.hmp = hmp; 1221 hmp->vchain.refs = 1; 1222 hmp->vchain.data = (void *)&hmp->voldata; 1223 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME; 1224 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1225 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1226 1227 hammer2_chain_core_init(&hmp->vchain); 1228 /* hmp->vchain.u.xxx is left NULL */ 1229 1230 /* 1231 * fchain setup. fchain.data is embedded. 1232 * fchain.refs is initialized and will never drop to 0. 1233 * 1234 * The data is not used but needs to be initialized to 1235 * pass assertion muster. We use this chain primarily 1236 * as a placeholder for the freemap's top-level RBTREE 1237 * so it does not interfere with the volume's topology 1238 * RBTREE. 1239 */ 1240 hmp->fchain.hmp = hmp; 1241 hmp->fchain.refs = 1; 1242 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset; 1243 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP; 1244 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1245 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1246 hmp->fchain.bref.methods = 1247 HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) | 1248 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 1249 1250 hammer2_chain_core_init(&hmp->fchain); 1251 /* hmp->fchain.u.xxx is left NULL */ 1252 1253 /* 1254 * Install the volume header and initialize fields from 1255 * voldata. 
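                 *
                 * A sketch of what the helper below is expected to do,
                 * assuming the usual HAMMER2 on-media layout of
                 * HAMMER2_NUM_VOLHDRS redundant volume-header copies (this
                 * is an editorial summary, not the helper's actual body):
                 *
                 *      for (i = 0; i < HAMMER2_NUM_VOLHDRS; ++i) {
                 *              read copy i from the device;
                 *              if (CRCs check out and its mirror_tid is the
                 *                  highest seen so far)
                 *                      remember it as the best copy;
                 *      }
                 *      copy the best header into hmp->voldata;
                 *
                 * Until this succeeds voldata is not usable (see the NOTE
                 * in the vchain setup above).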
1256 */ 1257 error = hammer2_install_volume_header(hmp); 1258 if (error) { 1259 hammer2_unmount_helper(mp, NULL, hmp); 1260 lockmgr(&hammer2_mntlk, LK_RELEASE); 1261 hammer2_vfs_unmount(mp, MNT_FORCE); 1262 return error; 1263 } 1264 1265 /* 1266 * Really important to get these right or the flush and 1267 * teardown code will get confused. 1268 */ 1269 hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL); 1270 spmp = hmp->spmp; 1271 spmp->pfs_hmps[0] = hmp; 1272 1273 /* 1274 * Dummy-up vchain and fchain's modify_tid. mirror_tid 1275 * is inherited from the volume header. 1276 */ 1277 xid = 0; 1278 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1279 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid; 1280 hmp->vchain.pmp = spmp; 1281 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1282 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid; 1283 hmp->fchain.pmp = spmp; 1284 1285 /* 1286 * First locate the super-root inode, which is key 0 1287 * relative to the volume header's blockset. 1288 * 1289 * Then locate the root inode by scanning the directory keyspace 1290 * represented by the label. 1291 */ 1292 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 1293 schain = hammer2_chain_lookup(&parent, &key_dummy, 1294 HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 1295 &error, 0); 1296 hammer2_chain_lookup_done(parent); 1297 if (schain == NULL) { 1298 kprintf("hammer2_mount: invalid super-root\n"); 1299 hammer2_unmount_helper(mp, NULL, hmp); 1300 lockmgr(&hammer2_mntlk, LK_RELEASE); 1301 hammer2_vfs_unmount(mp, MNT_FORCE); 1302 return EINVAL; 1303 } 1304 if (schain->error) { 1305 kprintf("hammer2_mount: error %s reading super-root\n", 1306 hammer2_error_str(schain->error)); 1307 hammer2_chain_unlock(schain); 1308 hammer2_chain_drop(schain); 1309 schain = NULL; 1310 hammer2_unmount_helper(mp, NULL, hmp); 1311 lockmgr(&hammer2_mntlk, LK_RELEASE); 1312 hammer2_vfs_unmount(mp, MNT_FORCE); 1313 return EINVAL; 1314 } 1315 1316 /* 1317 * The super-root always uses an inode_tid of 1 when 1318 * creating PFSs. 1319 */ 1320 spmp->inode_tid = 1; 1321 spmp->modify_tid = schain->bref.modify_tid + 1; 1322 1323 /* 1324 * Sanity-check schain's pmp and finish initialization. 1325 * Any chain belonging to the super-root topology should 1326 * have a NULL pmp (not even set to spmp). 1327 */ 1328 ripdata = &hammer2_chain_rdata(schain)->ipdata; 1329 KKASSERT(schain->pmp == NULL); 1330 spmp->pfs_clid = ripdata->meta.pfs_clid; 1331 1332 /* 1333 * Replace the dummy spmp->iroot with a real one. It's 1334 * easier to just do a wholesale replacement than to try 1335 * to update the chain and fixup the iroot fields. 1336 * 1337 * The returned inode is locked with the supplied cluster. 
1338 */ 1339 hammer2_dummy_xop_from_chain(&xop, schain); 1340 hammer2_inode_drop(spmp->iroot); 1341 spmp->iroot = NULL; 1342 spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1); 1343 spmp->spmp_hmp = hmp; 1344 spmp->pfs_types[0] = ripdata->meta.pfs_type; 1345 spmp->pfs_hmps[0] = hmp; 1346 hammer2_inode_ref(spmp->iroot); 1347 hammer2_inode_unlock(spmp->iroot); 1348 hammer2_cluster_unlock(&xop.cluster); 1349 hammer2_chain_drop(schain); 1350 /* do not call hammer2_cluster_drop() on an embedded cluster */ 1351 schain = NULL; /* now invalid */ 1352 /* leave spmp->iroot with one ref */ 1353 1354 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 1355 error = hammer2_recovery(hmp); 1356 if (error == 0) 1357 error |= hammer2_fixup_pfses(hmp); 1358 /* XXX do something with error */ 1359 } 1360 hammer2_update_pmps(hmp); 1361 hammer2_iocom_init(hmp); 1362 hammer2_bulkfree_init(hmp); 1363 1364 /* 1365 * Ref the cluster management messaging descriptor. The mount 1366 * program deals with the other end of the communications pipe. 1367 * 1368 * Root mounts typically do not supply one. 1369 */ 1370 if (info.cluster_fd >= 0) { 1371 fp = holdfp(curthread, info.cluster_fd, -1); 1372 if (fp) { 1373 hammer2_cluster_reconnect(hmp, fp); 1374 } else { 1375 kprintf("hammer2_mount: bad cluster_fd!\n"); 1376 } 1377 } 1378 } else { 1379 spmp = hmp->spmp; 1380 if (info.hflags & HMNT2_DEVFLAGS) { 1381 kprintf("hammer2: Warning: mount flags pertaining " 1382 "to the whole device may only be specified " 1383 "on the first mount of the device: %08x\n", 1384 info.hflags & HMNT2_DEVFLAGS); 1385 } 1386 } 1387 1388 /* 1389 * Force local mount (disassociate all PFSs from their clusters). 1390 * Used primarily for debugging. 1391 */ 1392 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1393 1394 /* 1395 * Lookup the mount point under the media-localized super-root. 1396 * Scanning hammer2_pfslist doesn't help us because it represents 1397 * PFS cluster ids which can aggregate several named PFSs together. 1398 * 1399 * cluster->pmp will incorrectly point to spmp and must be fixed 1400 * up later on. 1401 */ 1402 hammer2_inode_lock(spmp->iroot, 0); 1403 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1404 lhc = hammer2_dirhash(label, strlen(label)); 1405 chain = hammer2_chain_lookup(&parent, &key_next, 1406 lhc, lhc + HAMMER2_DIRHASH_LOMASK, 1407 &error, 0); 1408 while (chain) { 1409 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 1410 strcmp(label, chain->data->ipdata.filename) == 0) { 1411 break; 1412 } 1413 chain = hammer2_chain_next(&parent, chain, &key_next, 1414 key_next, 1415 lhc + HAMMER2_DIRHASH_LOMASK, 1416 &error, 0); 1417 } 1418 if (parent) { 1419 hammer2_chain_unlock(parent); 1420 hammer2_chain_drop(parent); 1421 } 1422 hammer2_inode_unlock(spmp->iroot); 1423 1424 /* 1425 * PFS could not be found? 1426 */ 1427 if (chain == NULL) { 1428 if (error) 1429 kprintf("hammer2_mount: PFS label I/O error\n"); 1430 else 1431 kprintf("hammer2_mount: PFS label not found\n"); 1432 hammer2_unmount_helper(mp, NULL, hmp); 1433 lockmgr(&hammer2_mntlk, LK_RELEASE); 1434 hammer2_vfs_unmount(mp, MNT_FORCE); 1435 1436 return EINVAL; 1437 } 1438 1439 /* 1440 * Acquire the pmp structure (it should have already been allocated 1441 * via hammer2_update_pmps() so do not pass cluster in to add to 1442 * available chains). 1443 * 1444 * Check if the cluster has already been mounted. A cluster can 1445 * only be mounted once, use null mounts to mount additional copies. 
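         *
         * (Illustrative example of the null-mount approach mentioned above,
         * with made-up paths: once a PFS is mounted at /mnt/pfs, additional
         * views of it are created with a null mount rather than a second
         * HAMMER2 mount:
         *
         *      mount_null /mnt/pfs /usr/obj/pfs
         * )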
1446 */ 1447 if (chain->error) { 1448 kprintf("hammer2_mount: PFS label I/O error\n"); 1449 } else { 1450 ripdata = &chain->data->ipdata; 1451 bref = chain->bref; 1452 pmp = hammer2_pfsalloc(NULL, ripdata, 1453 bref.modify_tid, force_local); 1454 } 1455 hammer2_chain_unlock(chain); 1456 hammer2_chain_drop(chain); 1457 1458 /* 1459 * Finish the mount 1460 */ 1461 kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp); 1462 1463 if (pmp->mp) { 1464 kprintf("hammer2_mount: PFS already mounted!\n"); 1465 hammer2_unmount_helper(mp, NULL, hmp); 1466 lockmgr(&hammer2_mntlk, LK_RELEASE); 1467 hammer2_vfs_unmount(mp, MNT_FORCE); 1468 1469 return EBUSY; 1470 } 1471 1472 pmp->hflags = info.hflags; 1473 mp->mnt_flag |= MNT_LOCAL; 1474 mp->mnt_kern_flag |= MNTK_ALL_MPSAFE; /* all entry pts are SMP */ 1475 mp->mnt_kern_flag |= MNTK_THR_SYNC; /* new vsyncscan semantics */ 1476 1477 /* 1478 * required mount structure initializations 1479 */ 1480 mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE; 1481 mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE; 1482 1483 mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE; 1484 mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE; 1485 1486 /* 1487 * Optional fields 1488 */ 1489 mp->mnt_iosize_max = MAXPHYS; 1490 1491 /* 1492 * Connect up mount pointers. 1493 */ 1494 hammer2_mount_helper(mp, pmp); 1495 1496 lockmgr(&hammer2_mntlk, LK_RELEASE); 1497 1498 /* 1499 * Finish setup 1500 */ 1501 vfs_getnewfsid(mp); 1502 vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops); 1503 vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops); 1504 vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops); 1505 1506 if (path) { 1507 copyinstr(info.volume, mp->mnt_stat.f_mntfromname, 1508 MNAMELEN - 1, &size); 1509 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); 1510 } /* else root mount, already in there */ 1511 1512 bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname)); 1513 if (path) { 1514 copyinstr(path, mp->mnt_stat.f_mntonname, 1515 sizeof(mp->mnt_stat.f_mntonname) - 1, 1516 &size); 1517 } else { 1518 /* root mount */ 1519 mp->mnt_stat.f_mntonname[0] = '/'; 1520 } 1521 1522 /* 1523 * Initial statfs to prime mnt_stat. 1524 */ 1525 hammer2_vfs_statfs(mp, &mp->mnt_stat, cred); 1526 1527 return 0; 1528 } 1529 1530 /* 1531 * Scan PFSs under the super-root and create hammer2_pfs structures. 1532 */ 1533 static 1534 void 1535 hammer2_update_pmps(hammer2_dev_t *hmp) 1536 { 1537 const hammer2_inode_data_t *ripdata; 1538 hammer2_chain_t *parent; 1539 hammer2_chain_t *chain; 1540 hammer2_blockref_t bref; 1541 hammer2_dev_t *force_local; 1542 hammer2_pfs_t *spmp; 1543 hammer2_pfs_t *pmp; 1544 hammer2_key_t key_next; 1545 int error; 1546 1547 /* 1548 * Force local mount (disassociate all PFSs from their clusters). 1549 * Used primarily for debugging. 1550 */ 1551 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1552 1553 /* 1554 * Lookup mount point under the media-localized super-root. 1555 * 1556 * cluster->pmp will incorrectly point to spmp and must be fixed 1557 * up later on. 
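         *
         * The scan below uses the same lookup/iterate idiom used by
         * hammer2_vfs_mount() above, but over the whole key range so every
         * PFS inode under the super-root is visited:
         *
         *      chain = hammer2_chain_lookup(&parent, &key_next,
         *                                   HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
         *                                   &error, 0);
         *      while (chain) {
         *              ... hand INODE chains to hammer2_pfsalloc() ...
         *              chain = hammer2_chain_next(&parent, chain, &key_next,
         *                                         key_next, HAMMER2_KEY_MAX,
         *                                         &error, 0);
         *      }
         *
         * (Editorial condensation of the code that follows.)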
1558 */ 1559 spmp = hmp->spmp; 1560 hammer2_inode_lock(spmp->iroot, 0); 1561 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1562 chain = hammer2_chain_lookup(&parent, &key_next, 1563 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 1564 &error, 0); 1565 while (chain) { 1566 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 1567 continue; 1568 if (chain->error) { 1569 kprintf("I/O error scanning PFS labels\n"); 1570 } else { 1571 ripdata = &chain->data->ipdata; 1572 bref = chain->bref; 1573 1574 pmp = hammer2_pfsalloc(chain, ripdata, 1575 bref.modify_tid, force_local); 1576 } 1577 chain = hammer2_chain_next(&parent, chain, &key_next, 1578 key_next, HAMMER2_KEY_MAX, 1579 &error, 0); 1580 } 1581 if (parent) { 1582 hammer2_chain_unlock(parent); 1583 hammer2_chain_drop(parent); 1584 } 1585 hammer2_inode_unlock(spmp->iroot); 1586 } 1587 1588 static 1589 int 1590 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused, 1591 struct vnode *devvp, struct ucred *cred) 1592 { 1593 int error; 1594 1595 if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { 1596 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1597 VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL); 1598 vn_unlock(devvp); 1599 error = hammer2_recovery(hmp); 1600 if (error == 0) 1601 error |= hammer2_fixup_pfses(hmp); 1602 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1603 if (error == 0) { 1604 VOP_CLOSE(devvp, FREAD, NULL); 1605 hmp->ronly = 0; 1606 } else { 1607 VOP_CLOSE(devvp, FREAD | FWRITE, NULL); 1608 } 1609 vn_unlock(devvp); 1610 } else { 1611 error = 0; 1612 } 1613 return error; 1614 } 1615 1616 static 1617 int 1618 hammer2_vfs_unmount(struct mount *mp, int mntflags) 1619 { 1620 hammer2_pfs_t *pmp; 1621 int flags; 1622 int error = 0; 1623 1624 pmp = MPTOPMP(mp); 1625 1626 if (pmp == NULL) 1627 return(0); 1628 1629 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1630 1631 /* 1632 * If mount initialization proceeded far enough we must flush 1633 * its vnodes and sync the underlying mount points. Three syncs 1634 * are required to fully flush the filesystem (freemap updates lag 1635 * by one flush, and one extra for safety). 1636 */ 1637 if (mntflags & MNT_FORCE) 1638 flags = FORCECLOSE; 1639 else 1640 flags = 0; 1641 if (pmp->iroot) { 1642 error = vflush(mp, 0, flags); 1643 if (error) 1644 goto failed; 1645 hammer2_vfs_sync(mp, MNT_WAIT); 1646 hammer2_vfs_sync(mp, MNT_WAIT); 1647 hammer2_vfs_sync(mp, MNT_WAIT); 1648 } 1649 1650 /* 1651 * Cleanup the frontend support XOPS threads 1652 */ 1653 hammer2_xop_helper_cleanup(pmp); 1654 1655 if (pmp->mp) 1656 hammer2_unmount_helper(mp, pmp, NULL); 1657 1658 error = 0; 1659 failed: 1660 lockmgr(&hammer2_mntlk, LK_RELEASE); 1661 1662 return (error); 1663 } 1664 1665 /* 1666 * Mount helper, hook the system mount into our PFS. 1667 * The mount lock is held. 1668 * 1669 * We must bump the mount_count on related devices for any 1670 * mounted PFSs. 1671 */ 1672 static 1673 void 1674 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp) 1675 { 1676 hammer2_cluster_t *cluster; 1677 hammer2_chain_t *rchain; 1678 int i; 1679 1680 mp->mnt_data = (qaddr_t)pmp; 1681 pmp->mp = mp; 1682 1683 /* 1684 * After pmp->mp is set we have to adjust hmp->mount_count. 
         */
        cluster = &pmp->iroot->cluster;
        for (i = 0; i < cluster->nchains; ++i) {
                rchain = cluster->array[i].chain;
                if (rchain == NULL)
                        continue;
                ++rchain->hmp->mount_count;
        }

        /*
         * Create missing Xop threads
         */
        hammer2_xop_helper_create(pmp);
}

/*
 * Mount helper, unhook the system mount from our PFS.
 * The mount lock is held.
 *
 * If hmp is supplied, a mount responsible for being the first to open
 * the block device failed and the block device and all PFSs using the
 * block device must be cleaned up.
 *
 * If pmp is supplied, multiple devices might be backing the PFS and each
 * must be disconnected.  This might not be the last PFS using some of the
 * underlying devices.  Also, we have to adjust our hmp->mount_count
 * accounting for the devices backing the pmp which is now undergoing an
 * unmount.
 */
static
void
hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
{
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        struct vnode *devvp;
        int dumpcnt;
        int ronly;
        int i;

        /*
         * If no device is supplied this is a high-level unmount and we have
         * to disconnect the mount, adjust mount_count, and locate devices
         * that might now have no mounts.
         */
        if (pmp) {
                KKASSERT(hmp == NULL);
                KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
                pmp->mp = NULL;
                mp->mnt_data = NULL;

                /*
                 * After pmp->mp is cleared we have to account for
                 * mount_count.
                 */
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL)
                                continue;
                        --rchain->hmp->mount_count;
                        /* scrapping hmp now may invalidate the pmp */
                }
again:
                TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                        if (hmp->mount_count == 0) {
                                hammer2_unmount_helper(NULL, NULL, hmp);
                                goto again;
                        }
                }
                return;
        }

        /*
         * Try to terminate the block device.  We can't terminate it if
         * there are still PFSs referencing it.
         */
        if (hmp->mount_count)
                return;

        /*
         * Decommission the network before we start messing with the
         * device and PFS.
         */
        hammer2_iocom_uninit(hmp);

        hammer2_bulkfree_uninit(hmp);
        hammer2_pfsfree_scan(hmp, 0);
#if 0
        hammer2_dev_exlock(hmp);        /* XXX order */
#endif

        /*
         * Cycle the volume data lock as a safety (probably not needed any
         * more).  To ensure everything is out we need to flush at least
         * three times.  (1) The running of the sideq can dirty the
         * filesystem, (2) A normal flush can dirty the freemap, and
         * (3) ensure that the freemap is fully synchronized.
         *
         * The next mount's recovery scan can clean everything up but we want
         * to leave the filesystem in a 100% clean state on a normal unmount.
         */
#if 0
        hammer2_voldata_lock(hmp);
        hammer2_voldata_unlock(hmp);
#endif

        /*
         * Flush whatever is left.  Unmounted but modified PFSs might still
         * have some dirty chains on them.
1795 */ 1796 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 1797 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 1798 1799 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1800 hammer2_voldata_modify(hmp); 1801 hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP | 1802 HAMMER2_FLUSH_ALL); 1803 } 1804 hammer2_chain_unlock(&hmp->fchain); 1805 1806 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1807 hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP | 1808 HAMMER2_FLUSH_ALL); 1809 } 1810 hammer2_chain_unlock(&hmp->vchain); 1811 1812 if ((hmp->vchain.flags | hmp->fchain.flags) & 1813 HAMMER2_CHAIN_FLUSH_MASK) { 1814 kprintf("hammer2_unmount: chains left over " 1815 "after final sync\n"); 1816 kprintf(" vchain %08x\n", hmp->vchain.flags); 1817 kprintf(" fchain %08x\n", hmp->fchain.flags); 1818 1819 if (hammer2_debug & 0x0010) 1820 Debugger("entered debugger"); 1821 } 1822 1823 hammer2_pfsfree_scan(hmp, 1); 1824 1825 KKASSERT(hmp->spmp == NULL); 1826 1827 /* 1828 * Finish up with the device vnode 1829 */ 1830 if ((devvp = hmp->devvp) != NULL) { 1831 ronly = hmp->ronly; 1832 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1833 kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n", 1834 hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree), 1835 ronly); 1836 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0); 1837 kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n", 1838 hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree)); 1839 hmp->devvp = NULL; 1840 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL); 1841 vn_unlock(devvp); 1842 vrele(devvp); 1843 devvp = NULL; 1844 } 1845 1846 /* 1847 * Clear vchain/fchain flags that might prevent final cleanup 1848 * of these chains. 1849 */ 1850 if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) { 1851 atomic_add_long(&hammer2_count_modified_chains, -1); 1852 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED); 1853 hammer2_pfs_memory_wakeup(hmp->vchain.pmp, -1); 1854 } 1855 if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) { 1856 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE); 1857 } 1858 1859 if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) { 1860 atomic_add_long(&hammer2_count_modified_chains, -1); 1861 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED); 1862 hammer2_pfs_memory_wakeup(hmp->fchain.pmp, -1); 1863 } 1864 if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) { 1865 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE); 1866 } 1867 1868 /* 1869 * Final drop of embedded freemap root chain to 1870 * clean up fchain.core (fchain structure is not 1871 * flagged ALLOCATED so it is cleaned out and then 1872 * left to rot). 1873 */ 1874 hammer2_chain_drop(&hmp->fchain); 1875 1876 /* 1877 * Final drop of embedded volume root chain to clean 1878 * up vchain.core (vchain structure is not flagged 1879 * ALLOCATED so it is cleaned out and then left to 1880 * rot). 
1881 */ 1882 dumpcnt = 50; 1883 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1); 1884 dumpcnt = 50; 1885 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1); 1886 #if 0 1887 hammer2_dev_unlock(hmp); 1888 #endif 1889 hammer2_chain_drop(&hmp->vchain); 1890 1891 hammer2_io_cleanup(hmp, &hmp->iotree); 1892 if (hmp->iofree_count) { 1893 kprintf("io_cleanup: %d I/O's left hanging\n", 1894 hmp->iofree_count); 1895 } 1896 1897 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry); 1898 kmalloc_destroy(&hmp->mchain); 1899 kfree(hmp, M_HAMMER2); 1900 } 1901 1902 int 1903 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 1904 ino_t ino, struct vnode **vpp) 1905 { 1906 hammer2_xop_lookup_t *xop; 1907 hammer2_pfs_t *pmp; 1908 hammer2_inode_t *ip; 1909 hammer2_tid_t inum; 1910 int error; 1911 1912 inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK; 1913 1914 error = 0; 1915 pmp = MPTOPMP(mp); 1916 1917 /* 1918 * Easy if we already have it cached 1919 */ 1920 ip = hammer2_inode_lookup(pmp, inum); 1921 if (ip) { 1922 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 1923 *vpp = hammer2_igetv(ip, &error); 1924 hammer2_inode_unlock(ip); 1925 hammer2_inode_drop(ip); /* from lookup */ 1926 1927 return error; 1928 } 1929 1930 /* 1931 * Otherwise we have to find the inode 1932 */ 1933 xop = hammer2_xop_alloc(pmp->iroot, 0); 1934 xop->lhc = inum; 1935 hammer2_xop_start(&xop->head, &hammer2_lookup_desc); 1936 error = hammer2_xop_collect(&xop->head, 0); 1937 1938 if (error == 0) 1939 ip = hammer2_inode_get(pmp, &xop->head, -1, -1); 1940 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1941 1942 if (ip) { 1943 *vpp = hammer2_igetv(ip, &error); 1944 hammer2_inode_unlock(ip); 1945 } else { 1946 *vpp = NULL; 1947 error = ENOENT; 1948 } 1949 return (error); 1950 } 1951 1952 static 1953 int 1954 hammer2_vfs_root(struct mount *mp, struct vnode **vpp) 1955 { 1956 hammer2_pfs_t *pmp; 1957 struct vnode *vp; 1958 int error; 1959 1960 pmp = MPTOPMP(mp); 1961 if (pmp->iroot == NULL) { 1962 kprintf("hammer2 (%s): no root inode\n", 1963 mp->mnt_stat.f_mntfromname); 1964 *vpp = NULL; 1965 return EINVAL; 1966 } 1967 1968 error = 0; 1969 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1970 1971 while (pmp->inode_tid == 0) { 1972 hammer2_xop_ipcluster_t *xop; 1973 const hammer2_inode_meta_t *meta; 1974 1975 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING); 1976 hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc); 1977 error = hammer2_xop_collect(&xop->head, 0); 1978 1979 if (error == 0) { 1980 meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta; 1981 pmp->iroot->meta = *meta; 1982 pmp->inode_tid = meta->pfs_inum + 1; 1983 hammer2_xop_pdata(&xop->head); 1984 /* meta invalid */ 1985 1986 if (pmp->inode_tid < HAMMER2_INODE_START) 1987 pmp->inode_tid = HAMMER2_INODE_START; 1988 pmp->modify_tid = 1989 xop->head.cluster.focus->bref.modify_tid + 1; 1990 #if 0 1991 kprintf("PFS: Starting inode %jd\n", 1992 (intmax_t)pmp->inode_tid); 1993 kprintf("PMP focus good set nextino=%ld mod=%016jx\n", 1994 pmp->inode_tid, pmp->modify_tid); 1995 #endif 1996 wakeup(&pmp->iroot); 1997 1998 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1999 2000 /* 2001 * Prime the mount info. 
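 *
 * (statfs is called with a NULL cred here, so the root-reserve
 * adjustment in hammer2_vfs_statfs() is skipped; this simply fills in
 * the initial block and inode counts for the mount).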
2002 */ 2003 hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL); 2004 break; 2005 } 2006 2007 /* 2008 * Loop, try again 2009 */ 2010 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2011 hammer2_inode_unlock(pmp->iroot); 2012 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz); 2013 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 2014 if (error == EINTR) 2015 break; 2016 } 2017 2018 if (error) { 2019 hammer2_inode_unlock(pmp->iroot); 2020 *vpp = NULL; 2021 } else { 2022 vp = hammer2_igetv(pmp->iroot, &error); 2023 hammer2_inode_unlock(pmp->iroot); 2024 *vpp = vp; 2025 } 2026 2027 return (error); 2028 } 2029 2030 /* 2031 * Filesystem status 2032 * 2033 * XXX incorporate ipdata->meta.inode_quota and data_quota 2034 */ 2035 static 2036 int 2037 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 2038 { 2039 hammer2_pfs_t *pmp; 2040 hammer2_dev_t *hmp; 2041 hammer2_blockref_t bref; 2042 struct statfs tmp; 2043 int i; 2044 2045 /* 2046 * NOTE: iroot might not have validated the cluster yet. 2047 */ 2048 pmp = MPTOPMP(mp); 2049 2050 bzero(&tmp, sizeof(tmp)); 2051 2052 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2053 hmp = pmp->pfs_hmps[i]; 2054 if (hmp == NULL) 2055 continue; 2056 if (pmp->iroot->cluster.array[i].chain) 2057 bref = pmp->iroot->cluster.array[i].chain->bref; 2058 else 2059 bzero(&bref, sizeof(bref)); 2060 2061 tmp.f_files = bref.embed.stats.inode_count; 2062 tmp.f_ffree = 0; 2063 tmp.f_blocks = hmp->voldata.allocator_size / 2064 mp->mnt_vstat.f_bsize; 2065 tmp.f_bfree = hmp->voldata.allocator_free / 2066 mp->mnt_vstat.f_bsize; 2067 tmp.f_bavail = tmp.f_bfree; 2068 2069 if (cred && cred->cr_uid != 0) { 2070 uint64_t adj; 2071 2072 /* 5% */ 2073 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2074 tmp.f_blocks -= adj; 2075 tmp.f_bfree -= adj; 2076 tmp.f_bavail -= adj; 2077 } 2078 2079 mp->mnt_stat.f_blocks = tmp.f_blocks; 2080 mp->mnt_stat.f_bfree = tmp.f_bfree; 2081 mp->mnt_stat.f_bavail = tmp.f_bavail; 2082 mp->mnt_stat.f_files = tmp.f_files; 2083 mp->mnt_stat.f_ffree = tmp.f_ffree; 2084 2085 *sbp = mp->mnt_stat; 2086 } 2087 return (0); 2088 } 2089 2090 static 2091 int 2092 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2093 { 2094 hammer2_pfs_t *pmp; 2095 hammer2_dev_t *hmp; 2096 hammer2_blockref_t bref; 2097 struct statvfs tmp; 2098 int i; 2099 2100 /* 2101 * NOTE: iroot might not have validated the cluster yet. 
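 *
 * As in hammer2_vfs_statfs(), tmp is recomputed for each backing
 * device and copied into mnt_vstat on every pass, so the values
 * reported effectively come from the last valid chain in the cluster.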
2102 */ 2103 pmp = MPTOPMP(mp); 2104 bzero(&tmp, sizeof(tmp)); 2105 2106 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2107 hmp = pmp->pfs_hmps[i]; 2108 if (hmp == NULL) 2109 continue; 2110 if (pmp->iroot->cluster.array[i].chain) 2111 bref = pmp->iroot->cluster.array[i].chain->bref; 2112 else 2113 bzero(&bref, sizeof(bref)); 2114 2115 tmp.f_files = bref.embed.stats.inode_count; 2116 tmp.f_ffree = 0; 2117 tmp.f_blocks = hmp->voldata.allocator_size / 2118 mp->mnt_vstat.f_bsize; 2119 tmp.f_bfree = hmp->voldata.allocator_free / 2120 mp->mnt_vstat.f_bsize; 2121 tmp.f_bavail = tmp.f_bfree; 2122 2123 if (cred && cred->cr_uid != 0) { 2124 uint64_t adj; 2125 2126 /* 5% */ 2127 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2128 tmp.f_blocks -= adj; 2129 tmp.f_bfree -= adj; 2130 tmp.f_bavail -= adj; 2131 } 2132 2133 mp->mnt_vstat.f_blocks = tmp.f_blocks; 2134 mp->mnt_vstat.f_bfree = tmp.f_bfree; 2135 mp->mnt_vstat.f_bavail = tmp.f_bavail; 2136 mp->mnt_vstat.f_files = tmp.f_files; 2137 mp->mnt_vstat.f_ffree = tmp.f_ffree; 2138 2139 *sbp = mp->mnt_vstat; 2140 } 2141 return (0); 2142 } 2143 2144 /* 2145 * Mount-time recovery (RW mounts) 2146 * 2147 * Updates to the free block table are allowed to lag flushes by one 2148 * transaction. In case of a crash, then on a fresh mount we must do an 2149 * incremental scan of the last committed transaction id and make sure that 2150 * all related blocks have been marked allocated. 2151 * 2152 * The super-root topology and each PFS has its own transaction id domain, 2153 * so we must track PFS boundary transitions. 2154 */ 2155 struct hammer2_recovery_elm { 2156 TAILQ_ENTRY(hammer2_recovery_elm) entry; 2157 hammer2_chain_t *chain; 2158 hammer2_tid_t sync_tid; 2159 }; 2160 2161 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); 2162 2163 struct hammer2_recovery_info { 2164 struct hammer2_recovery_list list; 2165 hammer2_tid_t mtid; 2166 int depth; 2167 }; 2168 2169 static int hammer2_recovery_scan(hammer2_dev_t *hmp, 2170 hammer2_chain_t *parent, 2171 struct hammer2_recovery_info *info, 2172 hammer2_tid_t sync_tid); 2173 2174 #define HAMMER2_RECOVERY_MAXDEPTH 10 2175 2176 static 2177 int 2178 hammer2_recovery(hammer2_dev_t *hmp) 2179 { 2180 struct hammer2_recovery_info info; 2181 struct hammer2_recovery_elm *elm; 2182 hammer2_chain_t *parent; 2183 hammer2_tid_t sync_tid; 2184 hammer2_tid_t mirror_tid; 2185 int error; 2186 2187 hammer2_trans_init(hmp->spmp, 0); 2188 2189 sync_tid = hmp->voldata.freemap_tid; 2190 mirror_tid = hmp->voldata.mirror_tid; 2191 2192 kprintf("hammer2 mount \"%s\": ", hmp->devrepname); 2193 if (sync_tid >= mirror_tid) { 2194 kprintf(" no recovery needed\n"); 2195 } else { 2196 kprintf(" freemap recovery %016jx-%016jx\n", 2197 sync_tid + 1, mirror_tid); 2198 } 2199 2200 TAILQ_INIT(&info.list); 2201 info.depth = 0; 2202 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 2203 error = hammer2_recovery_scan(hmp, parent, &info, sync_tid); 2204 hammer2_chain_lookup_done(parent); 2205 2206 while ((elm = TAILQ_FIRST(&info.list)) != NULL) { 2207 TAILQ_REMOVE(&info.list, elm, entry); 2208 parent = elm->chain; 2209 sync_tid = elm->sync_tid; 2210 kfree(elm, M_HAMMER2); 2211 2212 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2213 error |= hammer2_recovery_scan(hmp, parent, &info, 2214 hmp->voldata.freemap_tid); 2215 hammer2_chain_unlock(parent); 2216 hammer2_chain_drop(parent); /* drop elm->chain ref */ 2217 } 2218 2219 hammer2_trans_done(hmp->spmp, 0); 2220 2221 return error; 2222 } 2223 2224 static 2225 int 2226 
hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, 2227 struct hammer2_recovery_info *info, 2228 hammer2_tid_t sync_tid) 2229 { 2230 const hammer2_inode_data_t *ripdata; 2231 hammer2_chain_t *chain; 2232 hammer2_blockref_t bref; 2233 int tmp_error; 2234 int rup_error; 2235 int error; 2236 int first; 2237 2238 /* 2239 * Adjust freemap to ensure that the block(s) are marked allocated. 2240 */ 2241 if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { 2242 hammer2_freemap_adjust(hmp, &parent->bref, 2243 HAMMER2_FREEMAP_DORECOVER); 2244 } 2245 2246 /* 2247 * Check type for recursive scan 2248 */ 2249 switch(parent->bref.type) { 2250 case HAMMER2_BREF_TYPE_VOLUME: 2251 /* data already instantiated */ 2252 break; 2253 case HAMMER2_BREF_TYPE_INODE: 2254 /* 2255 * Must instantiate data for DIRECTDATA test and also 2256 * for recursion. 2257 */ 2258 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2259 ripdata = &hammer2_chain_rdata(parent)->ipdata; 2260 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2261 /* not applicable to recovery scan */ 2262 hammer2_chain_unlock(parent); 2263 return 0; 2264 } 2265 hammer2_chain_unlock(parent); 2266 break; 2267 case HAMMER2_BREF_TYPE_INDIRECT: 2268 /* 2269 * Must instantiate data for recursion 2270 */ 2271 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2272 hammer2_chain_unlock(parent); 2273 break; 2274 case HAMMER2_BREF_TYPE_DIRENT: 2275 case HAMMER2_BREF_TYPE_DATA: 2276 case HAMMER2_BREF_TYPE_FREEMAP: 2277 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2278 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2279 /* not applicable to recovery scan */ 2280 return 0; 2281 break; 2282 default: 2283 return HAMMER2_ERROR_BADBREF; 2284 } 2285 2286 /* 2287 * Defer operation if depth limit reached or if we are crossing a 2288 * PFS boundary. 2289 */ 2290 if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) { 2291 struct hammer2_recovery_elm *elm; 2292 2293 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); 2294 elm->chain = parent; 2295 elm->sync_tid = sync_tid; 2296 hammer2_chain_ref(parent); 2297 TAILQ_INSERT_TAIL(&info->list, elm, entry); 2298 /* unlocked by caller */ 2299 2300 return(0); 2301 } 2302 2303 2304 /* 2305 * Recursive scan of the last flushed transaction only. We are 2306 * doing this without pmp assignments so don't leave the chains 2307 * hanging around after we are done with them. 2308 * 2309 * error Cumulative error this level only 2310 * rup_error Cumulative error for recursion 2311 * tmp_error Specific non-cumulative recursion error 2312 */ 2313 chain = NULL; 2314 first = 1; 2315 rup_error = 0; 2316 error = 0; 2317 2318 for (;;) { 2319 error |= hammer2_chain_scan(parent, &chain, &bref, 2320 &first, 2321 HAMMER2_LOOKUP_NODATA); 2322 2323 /* 2324 * Problem during scan or EOF 2325 */ 2326 if (error) 2327 break; 2328 2329 /* 2330 * If this is a leaf 2331 */ 2332 if (chain == NULL) { 2333 if (bref.mirror_tid > sync_tid) { 2334 hammer2_freemap_adjust(hmp, &bref, 2335 HAMMER2_FREEMAP_DORECOVER); 2336 } 2337 continue; 2338 } 2339 2340 /* 2341 * This may or may not be a recursive node. 2342 */ 2343 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 2344 if (bref.mirror_tid > sync_tid) { 2345 ++info->depth; 2346 tmp_error = hammer2_recovery_scan(hmp, chain, 2347 info, sync_tid); 2348 --info->depth; 2349 } else { 2350 tmp_error = 0; 2351 } 2352 2353 /* 2354 * Flush the recovery at the PFS boundary to stage it for 2355 * the final flush of the super-root topology. 
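 *
 * (Each PFS has its own transaction-id domain, per the recovery notes
 * above, so recovered chains are flushed within their PFS before the
 * super-root topology itself is finally flushed).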
2356 */ 2357 if (tmp_error == 0 && 2358 (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) && 2359 (chain->flags & HAMMER2_CHAIN_ONFLUSH)) { 2360 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2361 HAMMER2_FLUSH_ALL); 2362 } 2363 rup_error |= tmp_error; 2364 } 2365 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 2366 } 2367 2368 /* 2369 * This fixes up an error introduced in earlier H2 implementations where 2370 * moving a PFS inode into an indirect block wound up causing the 2371 * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared. 2372 */ 2373 static 2374 int 2375 hammer2_fixup_pfses(hammer2_dev_t *hmp) 2376 { 2377 const hammer2_inode_data_t *ripdata; 2378 hammer2_chain_t *parent; 2379 hammer2_chain_t *chain; 2380 hammer2_key_t key_next; 2381 hammer2_pfs_t *spmp; 2382 int error; 2383 2384 error = 0; 2385 2386 /* 2387 * Lookup mount point under the media-localized super-root. 2388 * 2389 * cluster->pmp will incorrectly point to spmp and must be fixed 2390 * up later on. 2391 */ 2392 spmp = hmp->spmp; 2393 hammer2_inode_lock(spmp->iroot, 0); 2394 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 2395 chain = hammer2_chain_lookup(&parent, &key_next, 2396 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 2397 &error, 0); 2398 while (chain) { 2399 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 2400 continue; 2401 if (chain->error) { 2402 kprintf("I/O error scanning PFS labels\n"); 2403 error |= chain->error; 2404 } else if ((chain->bref.flags & 2405 HAMMER2_BREF_FLAG_PFSROOT) == 0) { 2406 int error2; 2407 2408 ripdata = &chain->data->ipdata; 2409 hammer2_trans_init(hmp->spmp, 0); 2410 error2 = hammer2_chain_modify(chain, 2411 chain->bref.modify_tid, 2412 0, 0); 2413 if (error2 == 0) { 2414 kprintf("hammer2: Correct mis-flagged PFS %s\n", 2415 ripdata->filename); 2416 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 2417 } else { 2418 error |= error2; 2419 } 2420 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2421 HAMMER2_FLUSH_ALL); 2422 hammer2_trans_done(hmp->spmp, 0); 2423 } 2424 chain = hammer2_chain_next(&parent, chain, &key_next, 2425 key_next, HAMMER2_KEY_MAX, 2426 &error, 0); 2427 } 2428 if (parent) { 2429 hammer2_chain_unlock(parent); 2430 hammer2_chain_drop(parent); 2431 } 2432 hammer2_inode_unlock(spmp->iroot); 2433 2434 return error; 2435 } 2436 2437 /* 2438 * Sync a mount point; this is called periodically on a per-mount basis from 2439 * the filesystem syncer, and whenever a user issues a sync. 2440 */ 2441 int 2442 hammer2_vfs_sync(struct mount *mp, int waitfor) 2443 { 2444 int error; 2445 2446 error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor); 2447 2448 return error; 2449 } 2450 2451 /* 2452 * Because frontend operations lock vnodes before we get a chance to 2453 * lock the related inode, we can't just acquire a vnode lock without 2454 * risking a deadlock. The frontend may be holding a vnode lock while 2455 * also blocked on our SYNCQ flag while trying to get the inode lock. 2456 * 2457 * To deal with this situation we can check the vnode lock situation 2458 * after locking the inode and perform a work-around. 2459 */ 2460 int 2461 hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor) 2462 { 2463 struct mount *mp; 2464 /*hammer2_xop_flush_t *xop;*/ 2465 /*struct hammer2_sync_info info;*/ 2466 hammer2_inode_t *ip; 2467 hammer2_depend_t *depend; 2468 hammer2_depend_t *depend_next; 2469 struct vnode *vp; 2470 uint32_t pass2; 2471 int error; 2472 int wakecount; 2473 int dorestart; 2474 2475 mp = pmp->mp; 2476 2477 /* 2478 * Move all inodes on sideq to syncq. This will clear sideq. 
2479 * This should represent all flushable inodes. These inodes 2480 * will already have refs due to being on syncq or sideq. We 2481 * must do this all at once with the spinlock held to ensure that 2482 * all inode dependencies are part of the same flush. 2483 * 2484 * We should be able to do this asynchronously from frontend 2485 * operations because we will be locking the inodes later on 2486 * to actually flush them, and that will partition any frontend 2487 * op using the same inode. Either it has already locked the 2488 * inode and we will block, or it has not yet locked the inode 2489 * and it will block until we are finished flushing that inode. 2490 * 2491 * When restarting, only move the inodes flagged as PASS2 from 2492 * SIDEQ to SYNCQ. PASS2 propagation by inode_lock4() and 2493 * inode_depend() is atomic with the spin-lock. 2494 */ 2495 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 2496 #ifdef HAMMER2_DEBUG_SYNC 2497 kprintf("FILESYSTEM SYNC BOUNDARY\n"); 2498 #endif 2499 dorestart = 0; 2500 2501 /* 2502 * Move inodes from depq to syncq, releasing the related 2503 * depend structures. 2504 */ 2505 restart: 2506 #ifdef HAMMER2_DEBUG_SYNC 2507 kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart); 2508 #endif 2509 hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/); 2510 hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN); 2511 2512 /* 2513 * Move inodes from depq to syncq. When restarting, only depq's 2514 * marked pass2 are moved. 2515 */ 2516 hammer2_spin_ex(&pmp->list_spin); 2517 depend_next = TAILQ_FIRST(&pmp->depq); 2518 wakecount = 0; 2519 2520 while ((depend = depend_next) != NULL) { 2521 depend_next = TAILQ_NEXT(depend, entry); 2522 if (dorestart && depend->pass2 == 0) 2523 continue; 2524 TAILQ_FOREACH(ip, &depend->sideq, entry) { 2525 KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ); 2526 atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ); 2527 atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ); 2528 ip->depend = NULL; 2529 } 2530 2531 /* 2532 * NOTE: pmp->sideq_count includes both sideq and syncq 2533 */ 2534 TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry); 2535 2536 depend->count = 0; 2537 depend->pass2 = 0; 2538 TAILQ_REMOVE(&pmp->depq, depend, entry); 2539 } 2540 2541 hammer2_spin_unex(&pmp->list_spin); 2542 hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/ 2543 HAMMER2_TRANS_WAITING); 2544 dorestart = 0; 2545 2546 /* 2547 * sideq_count may have dropped enough to allow us to unstall 2548 * the frontend. 2549 */ 2550 hammer2_pfs_memory_wakeup(pmp, 0); 2551 2552 /* 2553 * Now run through all inodes on syncq. 2554 * 2555 * Flush transactions only interlock with other flush transactions. 2556 * Any conflicting frontend operations will block on the inode, but 2557 * may hold a vnode lock while doing so. 2558 */ 2559 hammer2_spin_ex(&pmp->list_spin); 2560 while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) { 2561 /* 2562 * Remove the inode from the SYNCQ, transfer the syncq ref 2563 * to us. We must clear SYNCQ to allow any potential 2564 * front-end deadlock to proceed. We must set PASS2 so 2565 * the dependency code knows what to do.
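 *
 * The flag update below uses atomic_cmpset_int() as a retry loop; the
 * flags are sampled into pass2 and, if another thread changes
 * ip->flags before the swap, the cmpset fails and the loop simply
 * retries.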
2566 */ 2567 pass2 = ip->flags; 2568 cpu_ccfence(); 2569 if (atomic_cmpset_int(&ip->flags, 2570 pass2, 2571 (pass2 & ~(HAMMER2_INODE_SYNCQ | 2572 HAMMER2_INODE_SYNCQ_WAKEUP)) | 2573 HAMMER2_INODE_SYNCQ_PASS2) == 0) { 2574 continue; 2575 } 2576 TAILQ_REMOVE(&pmp->syncq, ip, entry); 2577 --pmp->sideq_count; 2578 hammer2_spin_unex(&pmp->list_spin); 2579 2580 /* 2581 * Tickle anyone waiting on ip->flags or the hysteresis 2582 * on the dirty inode count. 2583 */ 2584 if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP) 2585 wakeup(&ip->flags); 2586 if (++wakecount >= hammer2_limit_dirty_inodes / 20 + 1) { 2587 wakecount = 0; 2588 hammer2_pfs_memory_wakeup(pmp, 0); 2589 } 2590 2591 /* 2592 * Relock the inode, and we inherit a ref from the above. 2593 * We will check for a race after we acquire the vnode. 2594 */ 2595 hammer2_mtx_ex(&ip->lock); 2596 2597 /* 2598 * We need the vp in order to vfsync() dirty buffers, so if 2599 * one isn't attached we can skip it. 2600 * 2601 * Ordering the inode lock and then the vnode lock has the 2602 * potential to deadlock. If we had left SYNCQ set that could 2603 * also deadlock us against the frontend even if we don't hold 2604 * any locks, but the latter is not a problem now since we 2605 * cleared it. igetv will temporarily release the inode lock 2606 * in a safe manner to work-around the deadlock. 2607 * 2608 * Unfortunately it is still possible to deadlock when the 2609 * frontend obtains multiple inode locks, because all the 2610 * related vnodes are already locked (nor can the vnode locks 2611 * be released and reacquired without messing up RECLAIM and 2612 * INACTIVE sequencing). 2613 * 2614 * The solution for now is to move the vp back onto SIDEQ 2615 * and set dorestart, which will restart the flush after we 2616 * exhaust the current SYNCQ. Note that additional 2617 * dependencies may build up, so we definitely need to move 2618 * the whole SIDEQ back to SYNCQ when we restart. 2619 */ 2620 vp = ip->vp; 2621 if (vp) { 2622 if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) { 2623 /* 2624 * Failed to get the vnode, requeue the inode 2625 * (PASS2 is already set so it will be found 2626 * again on the restart). 2627 * 2628 * Then unlock, possibly sleep, and retry 2629 * later. We sleep if PASS2 was *previously* 2630 * set, before we set it again above. 2631 */ 2632 vp = NULL; 2633 dorestart = 1; 2634 #ifdef HAMMER2_DEBUG_SYNC 2635 kprintf("inum %ld (sync delayed by vnode)\n", 2636 (long)ip->meta.inum); 2637 #endif 2638 hammer2_inode_delayed_sideq(ip); 2639 2640 hammer2_mtx_unlock(&ip->lock); 2641 hammer2_inode_drop(ip); 2642 2643 if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) { 2644 tsleep(&dorestart, 0, "h2syndel", 2); 2645 } 2646 hammer2_spin_ex(&pmp->list_spin); 2647 continue; 2648 } 2649 } else { 2650 vp = NULL; 2651 } 2652 2653 /* 2654 * If the inode wound up on a SIDEQ again it will already be 2655 * prepped for another PASS2. In this situation if we flush 2656 * it now we will just wind up flushing it again in the same 2657 * syncer run, so we might as well not flush it now. 2658 */ 2659 if (ip->flags & HAMMER2_INODE_SIDEQ) { 2660 hammer2_mtx_unlock(&ip->lock); 2661 hammer2_inode_drop(ip); 2662 if (vp) 2663 vput(vp); 2664 dorestart = 1; 2665 hammer2_spin_ex(&pmp->list_spin); 2666 continue; 2667 } 2668 2669 /* 2670 * Ok we have the inode exclusively locked and if vp is 2671 * not NULL that will also be exclusively locked. Do the 2672 * meat of the flush. 2673 * 2674 * vp token needed for v_rbdirty_tree check / vclrisdirty 2675 * sequencing. 
We hold the vnode exclusively, though, so 2676 * we shouldn't need to hold the token in this case. 2677 */ 2678 if (vp) { 2679 vfsync(vp, MNT_WAIT, 1, NULL, NULL); 2680 bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */ 2681 } 2682 2683 /* 2684 * If the inode has not yet been inserted into the tree 2685 * we must do so. Then sync and flush it. The flush should 2686 * update the parent. 2687 */ 2688 if (ip->flags & HAMMER2_INODE_DELETING) { 2689 #ifdef HAMMER2_DEBUG_SYNC 2690 kprintf("inum %ld destroy\n", (long)ip->meta.inum); 2691 #endif 2692 hammer2_inode_chain_des(ip); 2693 atomic_add_long(&hammer2_iod_inode_deletes, 1); 2694 } else if (ip->flags & HAMMER2_INODE_CREATING) { 2695 #ifdef HAMMER2_DEBUG_SYNC 2696 kprintf("inum %ld insert\n", (long)ip->meta.inum); 2697 #endif 2698 hammer2_inode_chain_ins(ip); 2699 atomic_add_long(&hammer2_iod_inode_creates, 1); 2700 } 2701 #ifdef HAMMER2_DEBUG_SYNC 2702 kprintf("inum %ld chain-sync\n", (long)ip->meta.inum); 2703 #endif 2704 2705 /* 2706 * Because I kinda messed up the design and index the inodes 2707 * under the root inode, alongside the directory entries, 2708 * we can't flush the inode index under the iroot until the 2709 * end. If we do it now we might miss effects created by 2710 * other inodes on the SYNCQ. 2711 * 2712 * Do a normal (non-FSSYNC) flush instead, which allows the 2713 * vnode code to work the same. We don't want to force iroot 2714 * back onto the SIDEQ, and we also don't want the flush code 2715 * to update pfs_iroot_blocksets until the final flush later. 2716 * 2717 * XXX at the moment this will likely result in a double-flush 2718 * of the iroot chain. 2719 */ 2720 hammer2_inode_chain_sync(ip); 2721 if (ip == pmp->iroot) { 2722 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP); 2723 } else { 2724 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP | 2725 HAMMER2_XOP_FSSYNC); 2726 } 2727 if (vp) { 2728 lwkt_gettoken(&vp->v_token); 2729 if ((ip->flags & (HAMMER2_INODE_MODIFIED | 2730 HAMMER2_INODE_RESIZED | 2731 HAMMER2_INODE_DIRTYDATA)) == 0 && 2732 RB_EMPTY(&vp->v_rbdirty_tree) && 2733 !bio_track_active(&vp->v_track_write)) { 2734 vclrisdirty(vp); 2735 } else { 2736 hammer2_inode_delayed_sideq(ip); 2737 } 2738 lwkt_reltoken(&vp->v_token); 2739 vput(vp); 2740 vp = NULL; /* safety */ 2741 } 2742 atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2); 2743 hammer2_inode_unlock(ip); /* unlock+drop */ 2744 /* ip pointer invalid */ 2745 2746 /* 2747 * If the inode got dirtied after we dropped our locks, 2748 * it will have already been moved back to the SIDEQ. 2749 */ 2750 hammer2_spin_ex(&pmp->list_spin); 2751 } 2752 hammer2_spin_unex(&pmp->list_spin); 2753 hammer2_pfs_memory_wakeup(pmp, 0); 2754 2755 if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) { 2756 #ifdef HAMMER2_DEBUG_SYNC 2757 kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n"); 2758 /*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/ 2759 #endif 2760 dorestart = 1; 2761 goto restart; 2762 } 2763 #ifdef HAMMER2_DEBUG_SYNC 2764 kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n"); 2765 /*tsleep(&dorestart, 0, "h2STG2", hz*20);*/ 2766 #endif 2767 2768 /* 2769 * We have to flush the PFS root last, even if it does not appear to 2770 * be dirty, because all the inodes in the PFS are indexed under it. 2771 * The normal flushing of iroot above would only occur if directory 2772 * entries under the root were changed. 2773 * 2774 * Specifying VOLHDR will cause an additional flush of hmp->spmp 2775 * for the media making up the cluster.
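 *
 * (Presumably VOLHDR is also what causes the volume header itself to
 * be updated, making this final flush the point at which the whole
 * sync becomes visible to crash recovery).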
2776 */ 2777 if ((ip = pmp->iroot) != NULL) { 2778 hammer2_inode_ref(ip); 2779 hammer2_mtx_ex(&ip->lock); 2780 hammer2_inode_chain_sync(ip); 2781 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP | 2782 HAMMER2_XOP_FSSYNC | 2783 HAMMER2_XOP_VOLHDR); 2784 hammer2_inode_unlock(ip); /* unlock+drop */ 2785 } 2786 #ifdef HAMMER2_DEBUG_SYNC 2787 kprintf("FILESYSTEM SYNC STAGE 2 DONE\n"); 2788 #endif 2789 2790 /* 2791 * device bioq sync 2792 */ 2793 hammer2_bioq_sync(pmp); 2794 2795 #if 0 2796 info.pass = 1; 2797 info.waitfor = MNT_WAIT; 2798 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2799 2800 info.pass = 2; 2801 info.waitfor = MNT_WAIT; 2802 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2803 #endif 2804 #if 0 2805 /* 2806 * Generally speaking we now want to flush the media topology from 2807 * the iroot through to the inodes. The flush stops at any inode 2808 * boundary, which allows the frontend to continue running concurrent 2809 * modifying operations on inodes (including kernel flushes of 2810 * buffers) without interfering with the main sync. 2811 * 2812 * Use the XOP interface to concurrently flush all nodes to 2813 * synchronize the PFSROOT subtopology to the media. A standard 2814 * end-of-scan ENOENT error indicates cluster sufficiency. 2815 * 2816 * Note that this flush will not be visible on crash recovery until 2817 * we flush the super-root topology in the next loop. 2818 * 2819 * XXX For now wait for all flushes to complete. 2820 */ 2821 if (mp && (ip = pmp->iroot) != NULL) { 2822 /* 2823 * If unmounting try to flush everything including any 2824 * sub-trees under inodes, just in case there is dangling 2825 * modified data, as a safety. Otherwise just flush up to 2826 * the inodes in this stage. 2827 */ 2828 kprintf("MP & IROOT\n"); 2829 #ifdef HAMMER2_DEBUG_SYNC 2830 kprintf("FILESYSTEM SYNC STAGE 3 IROOT BEGIN\n"); 2831 #endif 2832 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 2833 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2834 HAMMER2_XOP_VOLHDR | 2835 HAMMER2_XOP_FSSYNC | 2836 HAMMER2_XOP_INODE_STOP); 2837 } else { 2838 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2839 HAMMER2_XOP_INODE_STOP | 2840 HAMMER2_XOP_VOLHDR | 2841 HAMMER2_XOP_FSSYNC | 2842 HAMMER2_XOP_INODE_STOP); 2843 } 2844 hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc); 2845 error = hammer2_xop_collect(&xop->head, 2846 HAMMER2_XOP_COLLECT_WAITALL); 2847 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2848 #ifdef HAMMER2_DEBUG_SYNC 2849 kprintf("FILESYSTEM SYNC STAGE 3 IROOT END\n"); 2850 #endif 2851 if (error == HAMMER2_ERROR_ENOENT) 2852 error = 0; 2853 else 2854 error = hammer2_error_to_errno(error); 2855 } else { 2856 error = 0; 2857 } 2858 #endif 2859 error = 0; /* XXX */ 2860 hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH); 2861 2862 return (error); 2863 } 2864 2865 static 2866 int 2867 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2868 { 2869 hammer2_inode_t *ip; 2870 2871 KKASSERT(MAXFIDSZ >= 16); 2872 ip = VTOI(vp); 2873 fhp->fid_len = offsetof(struct fid, fid_data[16]); 2874 fhp->fid_ext = 0; 2875 ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum; 2876 ((hammer2_tid_t *)fhp->fid_data)[1] = 0; 2877 2878 return 0; 2879 } 2880 2881 static 2882 int 2883 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2884 struct fid *fhp, struct vnode **vpp) 2885 { 2886 hammer2_pfs_t *pmp; 2887 hammer2_tid_t inum; 2888 int error; 2889 2890 pmp = MPTOPMP(mp); 2891 inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK; 2892 if (vpp) { 2893 if (inum == 1) 
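/* inum 1 is handled via the root vnode path; presumably the PFS root inode */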
2894 error = hammer2_vfs_root(mp, vpp); 2895 else 2896 error = hammer2_vfs_vget(mp, NULL, inum, vpp); 2897 } else { 2898 error = 0; 2899 } 2900 if (error) 2901 kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error); 2902 return error; 2903 } 2904 2905 static 2906 int 2907 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 2908 int *exflagsp, struct ucred **credanonp) 2909 { 2910 hammer2_pfs_t *pmp; 2911 struct netcred *np; 2912 int error; 2913 2914 pmp = MPTOPMP(mp); 2915 np = vfs_export_lookup(mp, &pmp->export, nam); 2916 if (np) { 2917 *exflagsp = np->netc_exflags; 2918 *credanonp = &np->netc_anon; 2919 error = 0; 2920 } else { 2921 error = EACCES; 2922 } 2923 return error; 2924 } 2925 2926 /* 2927 * Support code for hammer2_vfs_mount(). Read, verify, and install the volume 2928 * header into the HMP. 2929 * 2930 * XXX read four volhdrs and use the one with the highest TID whose CRC 2931 * matches. 2932 * 2933 * XXX check iCRCs. 2934 * 2935 * XXX For filesystems w/ fewer than 4 volhdrs, make sure to not write to 2936 * nonexistent locations. 2937 * 2938 * XXX Record selected volhdr and ring updates to each of 4 volhdrs 2939 */ 2940 static 2941 int 2942 hammer2_install_volume_header(hammer2_dev_t *hmp) 2943 { 2944 hammer2_volume_data_t *vd; 2945 struct buf *bp; 2946 hammer2_crc32_t crc0, crc, bcrc0, bcrc; 2947 int error_reported; 2948 int error; 2949 int valid; 2950 int i; 2951 2952 error_reported = 0; 2953 error = 0; 2954 valid = 0; 2955 bp = NULL; 2956 2957 /* 2958 * There are up to 4 copies of the volume header (syncs iterate 2959 * between them so there is no single master). We don't trust the 2960 * volu_size field so we don't know precisely how large the filesystem 2961 * is, so depend on the OS to return an error if we go beyond the 2962 * block device's EOF.
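 *
 * Copy #i lives at byte offset i * HAMMER2_ZONE_BYTES64.  Each
 * candidate must pass both the SECT0 and SECT1 icrc checks; among the
 * copies that do, the one with the highest mirror_tid is installed
 * into hmp->voldata.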
2963 */ 2964 for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { 2965 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2966 HAMMER2_VOLUME_BYTES, &bp); 2967 if (error) { 2968 brelse(bp); 2969 bp = NULL; 2970 continue; 2971 } 2972 2973 vd = (struct hammer2_volume_data *) bp->b_data; 2974 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) && 2975 (vd->magic != HAMMER2_VOLUME_ID_ABO)) { 2976 brelse(bp); 2977 bp = NULL; 2978 continue; 2979 } 2980 2981 if (vd->magic == HAMMER2_VOLUME_ID_ABO) { 2982 /* XXX: Reversed-endianness filesystem */ 2983 kprintf("hammer2: reverse-endian filesystem detected"); 2984 brelse(bp); 2985 bp = NULL; 2986 continue; 2987 } 2988 2989 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0]; 2990 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF, 2991 HAMMER2_VOLUME_ICRC0_SIZE); 2992 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1]; 2993 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF, 2994 HAMMER2_VOLUME_ICRC1_SIZE); 2995 if ((crc0 != crc) || (bcrc0 != bcrc)) { 2996 kprintf("hammer2 volume header crc " 2997 "mismatch copy #%d %08x/%08x\n", 2998 i, crc0, crc); 2999 error_reported = 1; 3000 brelse(bp); 3001 bp = NULL; 3002 continue; 3003 } 3004 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) { 3005 valid = 1; 3006 hmp->voldata = *vd; 3007 hmp->volhdrno = i; 3008 } 3009 brelse(bp); 3010 bp = NULL; 3011 } 3012 if (valid) { 3013 hmp->volsync = hmp->voldata; 3014 hmp->free_reserved = hmp->voldata.allocator_size / 20; 3015 error = 0; 3016 if (error_reported || bootverbose || 1) { /* 1/DEBUG */ 3017 kprintf("hammer2: using volume header #%d\n", 3018 hmp->volhdrno); 3019 } 3020 } else { 3021 error = EINVAL; 3022 kprintf("hammer2: no valid volume headers found!\n"); 3023 } 3024 return (error); 3025 } 3026 3027 /* 3028 * This handles hysteresis on regular file flushes. Because the BIOs are 3029 * routed to a thread it is possible for an excessive number to build up 3030 * and cause long front-end stalls long before the runningbuffspace limit 3031 * is hit, so we implement hammer2_flush_pipe to control the 3032 * hysteresis. 3033 * 3034 * This is a particular problem when compression is used. 3035 */ 3036 void 3037 hammer2_lwinprog_ref(hammer2_pfs_t *pmp) 3038 { 3039 atomic_add_int(&pmp->count_lwinprog, 1); 3040 } 3041 3042 void 3043 hammer2_lwinprog_drop(hammer2_pfs_t *pmp) 3044 { 3045 int lwinprog; 3046 3047 lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1); 3048 if ((lwinprog & HAMMER2_LWINPROG_WAITING) && 3049 (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) { 3050 atomic_clear_int(&pmp->count_lwinprog, 3051 HAMMER2_LWINPROG_WAITING); 3052 wakeup(&pmp->count_lwinprog); 3053 } 3054 if ((lwinprog & HAMMER2_LWINPROG_WAITING0) && 3055 (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) { 3056 atomic_clear_int(&pmp->count_lwinprog, 3057 HAMMER2_LWINPROG_WAITING0); 3058 wakeup(&pmp->count_lwinprog); 3059 } 3060 } 3061 3062 void 3063 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe) 3064 { 3065 int lwinprog; 3066 int lwflag = (flush_pipe) ? 
HAMMER2_LWINPROG_WAITING : 3067 HAMMER2_LWINPROG_WAITING0; 3068 3069 for (;;) { 3070 lwinprog = pmp->count_lwinprog; 3071 cpu_ccfence(); 3072 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3073 break; 3074 tsleep_interlock(&pmp->count_lwinprog, 0); 3075 atomic_set_int(&pmp->count_lwinprog, lwflag); 3076 lwinprog = pmp->count_lwinprog; 3077 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3078 break; 3079 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 3080 } 3081 } 3082 3083 /* 3084 * It is possible for an excessive number of dirty chains or dirty inodes 3085 * to build up. When this occurs we start an asynchronous filesystem sync. 3086 * If the level continues to build up, we stall, waiting for it to drop, 3087 * with some hysteresis. 3088 * 3089 * This relies on the kernel calling hammer2_vfs_modifying() prior to 3090 * obtaining any vnode locks before making a modifying VOP call. 3091 */ 3092 static int 3093 hammer2_vfs_modifying(struct mount *mp) 3094 { 3095 if (mp->mnt_flag & MNT_RDONLY) 3096 return EROFS; 3097 hammer2_pfs_memory_wait(MPTOPMP(mp)); 3098 3099 return 0; 3100 } 3101 3102 /* 3103 * Initiate an asynchronous filesystem sync and, with hysteresis, 3104 * stall if the internal data structure count becomes too bloated. 3105 */ 3106 void 3107 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp) 3108 { 3109 uint32_t waiting; 3110 int pcatch; 3111 int error; 3112 3113 if (pmp == NULL || pmp->mp == NULL) 3114 return; 3115 3116 for (;;) { 3117 waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK; 3118 cpu_ccfence(); 3119 3120 /* 3121 * Start the syncer running at 1/2 the limit 3122 */ 3123 if (waiting > hammer2_limit_dirty_chains / 2 || 3124 pmp->sideq_count > hammer2_limit_dirty_inodes / 2) { 3125 trigger_syncer(pmp->mp); 3126 } 3127 3128 /* 3129 * Stall at the limit waiting for the counts to drop. 3130 * This code will typically be woken up once the count 3131 * drops below 3/4 the limit, or in one second. 3132 */ 3133 if (waiting < hammer2_limit_dirty_chains && 3134 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3135 break; 3136 } 3137 3138 pcatch = curthread->td_proc ? PCATCH : 0; 3139 3140 tsleep_interlock(&pmp->inmem_dirty_chains, pcatch); 3141 atomic_set_int(&pmp->inmem_dirty_chains, 3142 HAMMER2_DIRTYCHAIN_WAITING); 3143 if (waiting < hammer2_limit_dirty_chains && 3144 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3145 break; 3146 } 3147 trigger_syncer(pmp->mp); 3148 error = tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED | pcatch, 3149 "h2memw", hz); 3150 if (error == ERESTART) 3151 break; 3152 } 3153 } 3154 3155 /* 3156 * Wake up any stalled frontend ops waiting, with hysteresis, using 3157 * 2/3 of the limit. 
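 *
 * The wakeup only fires once both the dirty chain count and the dirty
 * inode (sideq) count have dropped to 2/3 of their respective limits.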
3158 */ 3159 void 3160 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp, int count) 3161 { 3162 uint32_t waiting; 3163 3164 if (pmp) { 3165 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, count); 3166 /* don't need --waiting to test flag */ 3167 3168 if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) && 3169 (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <= 3170 hammer2_limit_dirty_chains * 2 / 3 && 3171 pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) { 3172 atomic_clear_int(&pmp->inmem_dirty_chains, 3173 HAMMER2_DIRTYCHAIN_WAITING); 3174 wakeup(&pmp->inmem_dirty_chains); 3175 } 3176 } 3177 } 3178 3179 void 3180 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp) 3181 { 3182 if (pmp) { 3183 atomic_add_int(&pmp->inmem_dirty_chains, 1); 3184 } 3185 } 3186 3187 /* 3188 * Returns 0 if the filesystem has tons of free space 3189 * Returns 1 if the filesystem has less than 10% remaining 3190 * Returns 2 if the filesystem has less than 2%/5% (user/root) remaining. 3191 */ 3192 int 3193 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred) 3194 { 3195 hammer2_pfs_t *pmp; 3196 hammer2_dev_t *hmp; 3197 hammer2_off_t free_reserved; 3198 hammer2_off_t free_nominal; 3199 int i; 3200 3201 pmp = ip->pmp; 3202 3203 if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) { 3204 free_reserved = HAMMER2_SEGSIZE; 3205 free_nominal = 0x7FFFFFFFFFFFFFFFLLU; 3206 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 3207 hmp = pmp->pfs_hmps[i]; 3208 if (hmp == NULL) 3209 continue; 3210 if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER && 3211 pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER) 3212 continue; 3213 3214 if (free_nominal > hmp->voldata.allocator_free) 3215 free_nominal = hmp->voldata.allocator_free; 3216 if (free_reserved < hmp->free_reserved) 3217 free_reserved = hmp->free_reserved; 3218 } 3219 3220 /* 3221 * SMP races ok 3222 */ 3223 pmp->free_reserved = free_reserved; 3224 pmp->free_nominal = free_nominal; 3225 pmp->free_ticks = ticks; 3226 } else { 3227 free_reserved = pmp->free_reserved; 3228 free_nominal = pmp->free_nominal; 3229 } 3230 if (cred && cred->cr_uid != 0) { 3231 if ((int64_t)(free_nominal - bytes) < 3232 (int64_t)free_reserved) { 3233 return 2; 3234 } 3235 } else { 3236 if ((int64_t)(free_nominal - bytes) < 3237 (int64_t)free_reserved / 2) { 3238 return 2; 3239 } 3240 } 3241 if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2) 3242 return 1; 3243 return 0; 3244 } 3245 3246 /* 3247 * Debugging 3248 */ 3249 void 3250 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, 3251 u_int flags) 3252 { 3253 hammer2_chain_t *scan; 3254 hammer2_chain_t *parent; 3255 3256 --*countp; 3257 if (*countp == 0) { 3258 kprintf("%*.*s...\n", tab, tab, ""); 3259 return; 3260 } 3261 if (*countp < 0) 3262 return; 3263 kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n", 3264 tab, tab, "", pfx, 3265 chain, chain->bref.type, 3266 chain->bref.key, chain->bref.keybits, 3267 chain->bref.mirror_tid); 3268 3269 kprintf("%*.*s [%08x] (%s) refs=%d", 3270 tab, tab, "", 3271 chain->flags, 3272 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 3273 chain->data) ? 
(char *)chain->data->ipdata.filename : "?"), 3274 chain->refs); 3275 3276 parent = chain->parent; 3277 if (parent) 3278 kprintf("\n%*.*s p=%p [pflags %08x prefs %d", 3279 tab, tab, "", 3280 parent, parent->flags, parent->refs); 3281 if (RB_EMPTY(&chain->core.rbtree)) { 3282 kprintf("\n"); 3283 } else { 3284 kprintf(" {\n"); 3285 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 3286 if ((scan->flags & flags) || flags == (u_int)-1) { 3287 hammer2_dump_chain(scan, tab + 4, countp, 'a', 3288 flags); 3289 } 3290 } 3291 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 3292 kprintf("%*.*s}(%s)\n", tab, tab, "", 3293 chain->data->ipdata.filename); 3294 else 3295 kprintf("%*.*s}\n", tab, tab, ""); 3296 } 3297 } 3298