xref: /dflybsd-src/sys/vfs/hammer2/hammer2_vfsops.c (revision f354e0e64689159f00d07d7caa59dab0cea92fcb)
1 /*
2  * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48 
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54 
55 #include "hammer2.h"
56 #include "hammer2_disk.h"
57 #include "hammer2_mount.h"
58 #include "hammer2_lz4.h"
59 
60 #include "zlib/hammer2_zlib.h"
61 
62 #define REPORT_REFS_ERRORS 1	/* XXX remove me */
63 
64 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
65 
66 struct hammer2_sync_info {
67 	int error;
68 	int waitfor;
69 	int pass;
70 };
71 
72 TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
73 static struct hammer2_mntlist hammer2_mntlist;
74 
75 struct hammer2_pfslist hammer2_pfslist;
76 struct hammer2_pfslist hammer2_spmplist;
77 struct lock hammer2_mntlk;
78 
79 int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT;
80 int hammer2_debug;
81 int hammer2_xopgroups;
82 long hammer2_debug_inode;
83 int hammer2_cluster_meta_read = 1;	/* physical read-ahead */
84 int hammer2_cluster_data_read = 4;	/* physical read-ahead */
85 int hammer2_cluster_write = 0;		/* physical write clustering */
86 int hammer2_dedup_enable = 1;
87 int hammer2_always_compress = 0;	/* always try to compress */
88 int hammer2_inval_enable = 0;
89 int hammer2_flush_pipe = 100;
90 int hammer2_dio_count;
91 int hammer2_dio_limit = 256;
92 int hammer2_bulkfree_tps = 5000;
93 int hammer2_worker_rmask = 3;
94 long hammer2_chain_allocs;
95 long hammer2_chain_frees;
96 long hammer2_limit_dirty_chains;
97 long hammer2_limit_dirty_inodes;
98 long hammer2_count_modified_chains;
99 long hammer2_iod_invals;
100 long hammer2_iod_file_read;
101 long hammer2_iod_meta_read;
102 long hammer2_iod_indr_read;
103 long hammer2_iod_fmap_read;
104 long hammer2_iod_volu_read;
105 long hammer2_iod_file_write;
106 long hammer2_iod_file_wembed;
107 long hammer2_iod_file_wzero;
108 long hammer2_iod_file_wdedup;
109 long hammer2_iod_meta_write;
110 long hammer2_iod_indr_write;
111 long hammer2_iod_fmap_write;
112 long hammer2_iod_volu_write;
113 long hammer2_iod_inode_creates;
114 long hammer2_iod_inode_deletes;
115 
116 MALLOC_DECLARE(M_HAMMER2_CBUFFER);
117 MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
118 		"Buffer used for compression.");
119 
120 MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
121 MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
122 		"Buffer used for decompression.");
123 
124 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
125 
126 SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD,
127 	   &hammer2_supported_version, 0, "");
128 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
129 	   &hammer2_debug, 0, "");
130 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, debug_inode, CTLFLAG_RW,
131 	   &hammer2_debug_inode, 0, "");
132 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW,
133 	   &hammer2_cluster_meta_read, 0, "");
134 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW,
135 	   &hammer2_cluster_data_read, 0, "");
136 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW,
137 	   &hammer2_cluster_write, 0, "");
138 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
139 	   &hammer2_dedup_enable, 0, "");
140 SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
141 	   &hammer2_always_compress, 0, "");
142 SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
143 	   &hammer2_inval_enable, 0, "");
144 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
145 	   &hammer2_flush_pipe, 0, "");
146 SYSCTL_INT(_vfs_hammer2, OID_AUTO, worker_rmask, CTLFLAG_RW,
147 	   &hammer2_worker_rmask, 0, "");
148 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
149 	   &hammer2_bulkfree_tps, 0, "");
150 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
151 	   &hammer2_chain_allocs, 0, "");
152 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW,
153 	   &hammer2_chain_frees, 0, "");
154 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
155 	   &hammer2_limit_dirty_chains, 0, "");
156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
157 	   &hammer2_limit_dirty_inodes, 0, "");
158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
159 	   &hammer2_count_modified_chains, 0, "");
160 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
161 	   &hammer2_dio_count, 0, "");
162 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
163 	   &hammer2_dio_limit, 0, "");
164 
165 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
166 	   &hammer2_iod_invals, 0, "");
167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
168 	   &hammer2_iod_file_read, 0, "");
169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
170 	   &hammer2_iod_meta_read, 0, "");
171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
172 	   &hammer2_iod_indr_read, 0, "");
173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
174 	   &hammer2_iod_fmap_read, 0, "");
175 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
176 	   &hammer2_iod_volu_read, 0, "");
177 
178 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
179 	   &hammer2_iod_file_write, 0, "");
180 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW,
181 	   &hammer2_iod_file_wembed, 0, "");
182 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW,
183 	   &hammer2_iod_file_wzero, 0, "");
184 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW,
185 	   &hammer2_iod_file_wdedup, 0, "");
186 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
187 	   &hammer2_iod_meta_write, 0, "");
188 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
189 	   &hammer2_iod_indr_write, 0, "");
190 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
191 	   &hammer2_iod_fmap_write, 0, "");
192 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
193 	   &hammer2_iod_volu_write, 0, "");
194 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW,
195 	   &hammer2_iod_inode_creates, 0, "");
196 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW,
197 	   &hammer2_iod_inode_deletes, 0, "");
198 
199 long hammer2_process_icrc32;
200 long hammer2_process_xxhash64;
201 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
202 	   &hammer2_process_icrc32, 0, "");
203 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
204 	   &hammer2_process_xxhash64, 0, "");
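
/*
 * Usage note (illustrative only, not part of this file's logic): the
 * variables above are exported under the vfs.hammer2 sysctl tree and can
 * be read or tuned from userland, e.g.
 *
 *	sysctl vfs.hammer2.dedup_enable
 *	sysctl vfs.hammer2.flush_pipe=200
 *
 * Only the CTLFLAG_RW entries are writable; CTLFLAG_RD entries such as
 * supported_version and dio_count are read-only.
 */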
205 
206 static int hammer2_vfs_init(struct vfsconf *conf);
207 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
208 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
209 				struct ucred *cred);
210 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
211 				struct vnode *, struct ucred *);
212 static int hammer2_recovery(hammer2_dev_t *hmp);
213 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
214 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
215 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
216 				struct ucred *cred);
217 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
218 				struct ucred *cred);
219 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
220 				struct fid *fhp, struct vnode **vpp);
221 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
222 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
223 				int *exflagsp, struct ucred **credanonp);
224 static int hammer2_vfs_modifying(struct mount *mp);
225 
226 static int hammer2_install_volume_header(hammer2_dev_t *hmp);
227 #if 0
228 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
229 #endif
230 
231 static void hammer2_update_pmps(hammer2_dev_t *hmp);
232 
233 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
234 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
235 				hammer2_dev_t *hmp);
236 static int hammer2_fixup_pfses(hammer2_dev_t *hmp);
237 
238 /*
239  * HAMMER2 vfs operations.
240  */
241 static struct vfsops hammer2_vfsops = {
242 	.vfs_flags	= 0,
243 	.vfs_init	= hammer2_vfs_init,
244 	.vfs_uninit	= hammer2_vfs_uninit,
245 	.vfs_sync	= hammer2_vfs_sync,
246 	.vfs_mount	= hammer2_vfs_mount,
247 	.vfs_unmount	= hammer2_vfs_unmount,
248 	.vfs_root 	= hammer2_vfs_root,
249 	.vfs_statfs	= hammer2_vfs_statfs,
250 	.vfs_statvfs	= hammer2_vfs_statvfs,
251 	.vfs_vget	= hammer2_vfs_vget,
252 	.vfs_vptofh	= hammer2_vfs_vptofh,
253 	.vfs_fhtovp	= hammer2_vfs_fhtovp,
254 	.vfs_checkexp	= hammer2_vfs_checkexp,
255 	.vfs_modifying	= hammer2_vfs_modifying
256 };
257 
258 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
259 
260 VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE);
261 MODULE_VERSION(hammer2, 1);
262 
263 static
264 int
265 hammer2_vfs_init(struct vfsconf *conf)
266 {
267 	static struct objcache_malloc_args margs_read;
268 	static struct objcache_malloc_args margs_write;
269 	static struct objcache_malloc_args margs_vop;
270 
271 	int error;
272 
273 	error = 0;
274 	kmalloc_raise_limit(M_HAMMER2, 0);	/* unlimited */
275 
276 	/*
277 	 * hammer2_xopgroups must be even and is most optimal when set to
278 	 * 2 x ncpus so that strategy functions can be queued to the same
279 	 * cpu.
280 	 */
281 	hammer2_xopgroups = HAMMER2_XOPGROUPS_MIN;
282 	if (hammer2_xopgroups < ncpus * 2)
283 		hammer2_xopgroups = ncpus * 2;
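
	/*
	 * Illustration (assuming HAMMER2_XOPGROUPS_MIN is below 2 x ncpus,
	 * which is the common case): an 8-cpu machine ends up with
	 * hammer2_xopgroups == 16.
	 */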
284 
285 	/*
286 	 * A large DIO cache is needed to retain dedup enablement masks.
287 	 * The bulkfree code clears related masks as part of the disk block
288 	 * recycling algorithm, preventing it from being used for a later
289 	 * dedup.
290 	 *
291 	 * NOTE: A large buffer cache can actually interfere with dedup
292 	 *	 operation because we dedup based on media physical buffers
293 	 *	 and not logical buffers.  Try to make the DIO cache large
294 	 *	 enough to avoid this problem, but also cap it.
295 	 */
296 	hammer2_dio_limit = nbuf * 2;
297 	if (hammer2_dio_limit > 100000)
298 		hammer2_dio_limit = 100000;
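
	/*
	 * Example of the clamp above (numbers are illustrative only): with
	 * nbuf == 30000 the limit becomes 60000, while nbuf == 80000 would
	 * compute 160000 and be capped at 100000.
	 */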
299 
300 	if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
301 		error = EINVAL;
302 	if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
303 		error = EINVAL;
304 	if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
305 		error = EINVAL;
306 
307 	if (error)
308 		kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
309 
310 	margs_read.objsize = 65536;
311 	margs_read.mtype = M_HAMMER2_DEBUFFER;
312 
313 	margs_write.objsize = 32768;
314 	margs_write.mtype = M_HAMMER2_CBUFFER;
315 
316 	margs_vop.objsize = sizeof(hammer2_xop_t);
317 	margs_vop.mtype = M_HAMMER2;
318 
319 	/*
320 	 * Note that for the XOPS cache we want backing store allocations
321 	 * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
322 	 * confusion), so use the backing store function that does it.  This
323 	 * means that initial XOPS objects are zeroed but REUSED objects are
324 	 * not.  So we are responsible for cleaning the object up sufficiently
325 	 * for our needs before objcache_put()ing it back (typically just the
326 	 * FIFO indices).
327 	 */
328 	cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
329 				0, 1, NULL, NULL, NULL,
330 				objcache_malloc_alloc,
331 				objcache_malloc_free,
332 				&margs_read);
333 	cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
334 				0, 1, NULL, NULL, NULL,
335 				objcache_malloc_alloc,
336 				objcache_malloc_free,
337 				&margs_write);
338 	cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
339 				0, 1, NULL, NULL, NULL,
340 				objcache_malloc_alloc_zero,
341 				objcache_malloc_free,
342 				&margs_vop);
343 
344 
345 	lockinit(&hammer2_mntlk, "mntlk", 0, 0);
346 	TAILQ_INIT(&hammer2_mntlist);
347 	TAILQ_INIT(&hammer2_pfslist);
348 	TAILQ_INIT(&hammer2_spmplist);
349 
350 	hammer2_limit_dirty_chains = maxvnodes / 10;
351 	if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
352 		hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
353 	if (hammer2_limit_dirty_chains < 1000)
354 		hammer2_limit_dirty_chains = 1000;
355 
356 	hammer2_limit_dirty_inodes = maxvnodes / 25;
357 	if (hammer2_limit_dirty_inodes < 100)
358 		hammer2_limit_dirty_inodes = 100;
359 	if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
360 		hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;
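
	/*
	 * Worked example (illustrative): with maxvnodes == 250000 the code
	 * above yields 25000 dirty chains and 10000 dirty inodes, further
	 * clamped to [1000, HAMMER2_LIMIT_DIRTY_CHAINS] and
	 * [100, HAMMER2_LIMIT_DIRTY_INODES] respectively.
	 */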
361 
362 	return (error);
363 }
364 
365 static
366 int
367 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
368 {
369 	objcache_destroy(cache_buffer_read);
370 	objcache_destroy(cache_buffer_write);
371 	objcache_destroy(cache_xops);
372 	return 0;
373 }
374 
375 /*
376  * Core PFS allocator.  Used to allocate or reference the pmp structure
377  * for PFS cluster mounts and the spmp structure for media (hmp) structures.
378  * The pmp can be passed in or loaded by this function using the chain and
379  * inode data.
380  *
381  * pmp->modify_tid tracks new modify_tid transaction ids for front-end
382  * transactions.  Note that synchronization does not use this field.
383  * (typically frontend operations and synchronization cannot run on the
384  * same PFS node at the same time).
385  *
386  * XXX check locking
387  */
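/*
 * Illustrative call forms, summarizing how this function is used later in
 * this file (not an exhaustive API contract):
 *
 *	hammer2_pfsalloc(NULL, NULL, 0, NULL)
 *		- allocate the per-device super-root spmp
 *	hammer2_pfsalloc(NULL, ripdata, bref.modify_tid, force_local)
 *		- look up / reference an existing pmp at mount time
 *	hammer2_pfsalloc(chain, ripdata, bref.modify_tid, force_local)
 *		- probe a PFS and attach its chain to the cluster
 */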
388 hammer2_pfs_t *
389 hammer2_pfsalloc(hammer2_chain_t *chain,
390 		 const hammer2_inode_data_t *ripdata,
391 		 hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
392 {
393 	hammer2_pfs_t *pmp;
394 	hammer2_inode_t *iroot;
395 	int count;
396 	int i;
397 	int j;
398 
399 	pmp = NULL;
400 
401 	/*
402 	 * Locate or create the PFS based on the cluster id.  If ripdata
403 	 * is NULL this is a spmp which is unique and is always allocated.
404 	 *
405 	 * If the device is mounted in local mode all PFSs are considered
406 	 * independent and not part of any cluster (for debugging only).
407 	 */
408 	if (ripdata) {
409 		TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
410 			if (force_local != pmp->force_local)
411 				continue;
412 			if (force_local == NULL &&
413 			    bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
414 				 sizeof(pmp->pfs_clid)) == 0) {
415 					break;
416 			} else if (force_local && pmp->pfs_names[0] &&
417 			    strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
418 					break;
419 			}
420 		}
421 	}
422 
423 	if (pmp == NULL) {
424 		pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
425 		pmp->force_local = force_local;
426 		hammer2_trans_manage_init(pmp);
427 		kmalloc_create(&pmp->minode, "HAMMER2-inodes");
428 		kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
429 		lockinit(&pmp->lock, "pfslk", 0, 0);
430 		lockinit(&pmp->lock_nlink, "h2nlink", 0, 0);
431 		spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
432 		spin_init(&pmp->xop_spin, "h2xop");
433 		spin_init(&pmp->lru_spin, "h2lru");
434 		RB_INIT(&pmp->inum_tree);
435 		TAILQ_INIT(&pmp->syncq);
436 		TAILQ_INIT(&pmp->depq);
437 		TAILQ_INIT(&pmp->lru_list);
438 		spin_init(&pmp->list_spin, "h2pfsalloc_list");
439 
440 		/*
441 		 * Save the cluster id and place the new PFS on the
442 		 * appropriate list (PFS list or super-root list).
443 		 */
444 		if (ripdata) {
445 			pmp->pfs_clid = ripdata->meta.pfs_clid;
446 			TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
447 		} else {
448 			pmp->flags |= HAMMER2_PMPF_SPMP;
449 			TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
450 		}
451 
452 		/*
453 		 * The synchronization thread may start too early; make
454 		 * sure it stays frozen until we are ready to let it go.
455 		 * XXX
456 		 */
457 		/*
458 		pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
459 					 HAMMER2_THREAD_REMASTER;
460 		*/
461 	}
462 
463 	/*
464 	 * Create the PFS's root inode and any missing XOP helper threads.
465 	 */
466 	if ((iroot = pmp->iroot) == NULL) {
467 		iroot = hammer2_inode_get(pmp, NULL, 1, -1);
468 		if (ripdata)
469 			iroot->meta = ripdata->meta;
470 		pmp->iroot = iroot;
471 		hammer2_inode_ref(iroot);
472 		hammer2_inode_unlock(iroot);
473 	}
474 
475 	/*
476 	 * Stop here if no chain is passed in.
477 	 */
478 	if (chain == NULL)
479 		goto done;
480 
481 	/*
482 	 * When a chain is passed in we must add it to the PFS's root
483 	 * inode, update pmp->pfs_types[], and update the synchronization
484 	 * threads.
485 	 *
486 	 * When forcing local mode, mark the PFS as a MASTER regardless.
487 	 *
488 	 * At the moment empty spots can develop due to removals or failures.
489 	 * Ultimately we want to re-fill these spots but doing so might
490 	 * confuse running code. XXX
491 	 */
492 	hammer2_inode_ref(iroot);
493 	hammer2_mtx_ex(&iroot->lock);
494 	j = iroot->cluster.nchains;
495 
496 	if (j == HAMMER2_MAXCLUSTER) {
497 		kprintf("hammer2_mount: cluster full!\n");
498 		/* XXX fatal error? */
499 	} else {
500 		KKASSERT(chain->pmp == NULL);
501 		chain->pmp = pmp;
502 		hammer2_chain_ref(chain);
503 		iroot->cluster.array[j].chain = chain;
504 		if (force_local)
505 			pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
506 		else
507 			pmp->pfs_types[j] = ripdata->meta.pfs_type;
508 		pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
509 		pmp->pfs_hmps[j] = chain->hmp;
510 		hammer2_spin_ex(&pmp->inum_spin);
511 		pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset;
512 		hammer2_spin_unex(&pmp->inum_spin);
513 
514 		/*
515 		 * If the PFS is already mounted we must account
516 		 * for the mount_count here.
517 		 */
518 		if (pmp->mp)
519 			++chain->hmp->mount_count;
520 
521 		/*
522 		 * May have to fixup dirty chain tracking.  Previous
523 		 * pmp was NULL so nothing to undo.
524 		 */
525 		if (chain->flags & HAMMER2_CHAIN_MODIFIED)
526 			hammer2_pfs_memory_inc(pmp);
527 		++j;
528 	}
529 	iroot->cluster.nchains = j;
530 
531 	/*
532 	 * Update nmasters from any PFS inode which is part of the cluster.
533 	 * It is possible that this will result in a value which is too
534 	 * high.  MASTER PFSs are authoritative for pfs_nmasters and will
535 	 * override this value later on.
536 	 *
537 	 * (This informs us of masters that might not currently be
538 	 *  discoverable by this mount).
539 	 */
540 	if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
541 		pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
542 	}
543 
544 	/*
545 	 * Count visible masters.  Masters are usually added with
546 	 * ripdata->meta.pfs_nmasters set to 1.  This detects when there
547 	 * are more (XXX and must update the master inodes).
548 	 */
549 	count = 0;
550 	for (i = 0; i < iroot->cluster.nchains; ++i) {
551 		if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
552 			++count;
553 	}
554 	if (pmp->pfs_nmasters < count)
555 		pmp->pfs_nmasters = count;
556 
557 	/*
558 	 * Create missing synchronization and support threads.
559 	 *
560 	 * Single-node masters (including snapshots) have nothing to
561 	 * synchronize and do not require this thread.
562 	 *
563 	 * Multi-node masters or any number of soft masters, slaves, copy,
564 	 * or other PFS types need the thread.
565 	 *
566 	 * Each thread is responsible for its particular cluster index.
567 	 * We use independent threads so stalls or mismatches related to
568 	 * any given target do not affect other targets.
569 	 */
570 	for (i = 0; i < iroot->cluster.nchains; ++i) {
571 		/*
572 		 * Single-node masters (including snapshots) have nothing
573 		 * to synchronize and will make direct xops support calls,
574 		 * thus they do not require this thread.
575 		 *
576 		 * Note that there can be thousands of snapshots.  We do not
577 		 * want to create thousands of threads.
578 		 */
579 		if (pmp->pfs_nmasters <= 1 &&
580 		    pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) {
581 			continue;
582 		}
583 
584 		/*
585 		 * Sync support thread
586 		 */
587 		if (pmp->sync_thrs[i].td == NULL) {
588 			hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL,
589 					   "h2nod", i, -1,
590 					   hammer2_primary_sync_thread);
591 		}
592 	}
593 
594 	/*
595 	 * Create missing Xop threads
596 	 *
597 	 * NOTE: We create helper threads for all mounted PFSs or any
598 	 *	 PFSs with 2+ nodes (so the sync thread can update them,
599 	 *	 even if not mounted).
600 	 */
601 	if (pmp->mp || iroot->cluster.nchains >= 2)
602 		hammer2_xop_helper_create(pmp);
603 
604 	hammer2_mtx_unlock(&iroot->lock);
605 	hammer2_inode_drop(iroot);
606 done:
607 	return pmp;
608 }
609 
610 /*
611  * Deallocate an element of a probed PFS.  If destroying and this is a
612  * MASTER, adjust nmasters.
613  *
614  * This function does not physically destroy the PFS element in its device
615  * under the super-root (see hammer2_ioctl_pfs_delete()).
616  */
617 void
618 hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying)
619 {
620 	hammer2_inode_t *iroot;
621 	hammer2_chain_t *chain;
622 	int j;
623 
624 	/*
625 	 * Cleanup our reference on iroot.  iroot is not (and should not be)
626 	 * needed by the flush code.
627 	 */
628 	iroot = pmp->iroot;
629 	if (iroot) {
630 		/*
631 		 * Stop synchronizing
632 		 *
633 		 * XXX flush after acquiring the iroot lock.
634 		 * XXX clean out the cluster index from all inode structures.
635 		 */
636 		hammer2_thr_delete(&pmp->sync_thrs[clindex]);
637 
638 		/*
639 		 * Remove the cluster index from the group.  If destroying
640 		 * the PFS and this is a master, adjust pfs_nmasters.
641 		 */
642 		hammer2_mtx_ex(&iroot->lock);
643 		chain = iroot->cluster.array[clindex].chain;
644 		iroot->cluster.array[clindex].chain = NULL;
645 
646 		switch(pmp->pfs_types[clindex]) {
647 		case HAMMER2_PFSTYPE_MASTER:
648 			if (destroying && pmp->pfs_nmasters > 0)
649 				--pmp->pfs_nmasters;
650 			/* XXX adjust ripdata->meta.pfs_nmasters */
651 			break;
652 		default:
653 			break;
654 		}
655 		pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE;
656 
657 		hammer2_mtx_unlock(&iroot->lock);
658 
659 		/*
660 		 * Release the chain.
661 		 */
662 		if (chain) {
663 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
664 			hammer2_chain_drop(chain);
665 		}
666 
667 		/*
668 		 * Terminate all XOP threads for the cluster index.
669 		 */
670 		if (pmp->xop_groups) {
671 			for (j = 0; j < hammer2_xopgroups; ++j) {
672 				hammer2_thr_delete(
673 					&pmp->xop_groups[j].thrs[clindex]);
674 			}
675 		}
676 	}
677 }
678 
679 /*
680  * Destroy a PFS, typically only occurs after the last mount on a device
681  * has gone away.
682  */
683 static void
684 hammer2_pfsfree(hammer2_pfs_t *pmp)
685 {
686 	hammer2_inode_t *iroot;
687 	hammer2_chain_t *chain;
688 	int chains_still_present = 0;
689 	int i;
690 	int j;
691 
692 	/*
693 	 * Cleanup our reference on iroot.  iroot is not (and should not be)
694 	 * needed by the flush code.
695 	 */
696 	if (pmp->flags & HAMMER2_PMPF_SPMP)
697 		TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry);
698 	else
699 		TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);
700 
701 	/*
702 	 * Cleanup chains remaining on LRU list.
703 	 */
704 	hammer2_spin_ex(&pmp->lru_spin);
705 	while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
706 		KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU);
707 		atomic_add_int(&pmp->lru_count, -1);
708 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
709 		TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
710 		hammer2_chain_ref(chain);
711 		hammer2_spin_unex(&pmp->lru_spin);
712 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
713 		hammer2_chain_drop(chain);
714 		hammer2_spin_ex(&pmp->lru_spin);
715 	}
716 	hammer2_spin_unex(&pmp->lru_spin);
717 
718 	/*
719 	 * Clean up iroot
720 	 */
721 	iroot = pmp->iroot;
722 	if (iroot) {
723 		for (i = 0; i < iroot->cluster.nchains; ++i) {
724 			hammer2_thr_delete(&pmp->sync_thrs[i]);
725 			if (pmp->xop_groups) {
726 				for (j = 0; j < hammer2_xopgroups; ++j)
727 					hammer2_thr_delete(
728 						&pmp->xop_groups[j].thrs[i]);
729 			}
730 			chain = iroot->cluster.array[i].chain;
731 			if (chain && !RB_EMPTY(&chain->core.rbtree)) {
732 				kprintf("hammer2: Warning pmp %p still "
733 					"has active chains\n", pmp);
734 				chains_still_present = 1;
735 			}
736 		}
737 #if REPORT_REFS_ERRORS
738 		if (iroot->refs != 1)
739 			kprintf("PMP->IROOT %p REFS WRONG %d\n",
740 				iroot, iroot->refs);
741 #else
742 		KKASSERT(iroot->refs == 1);
743 #endif
744 		/* ref for iroot */
745 		hammer2_inode_drop(iroot);
746 		pmp->iroot = NULL;
747 	}
748 
749 	/*
750 	 * Free remaining pmp resources
751 	 */
752 	if (chains_still_present) {
753 		kprintf("hammer2: cannot free pmp %p, still in use\n", pmp);
754 	} else {
755 		kmalloc_destroy(&pmp->mmsg);
756 		kmalloc_destroy(&pmp->minode);
757 		kfree(pmp, M_HAMMER2);
758 	}
759 }
760 
761 /*
762  * Remove all references to hmp from the pfs list.  Any PFS which becomes
763  * empty is terminated and freed.
764  *
765  * XXX inefficient.
766  */
767 static void
768 hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which)
769 {
770 	hammer2_pfs_t *pmp;
771 	hammer2_inode_t *iroot;
772 	hammer2_chain_t *rchain;
773 	int i;
774 	int j;
775 	struct hammer2_pfslist *wlist;
776 
777 	if (which == 0)
778 		wlist = &hammer2_pfslist;
779 	else
780 		wlist = &hammer2_spmplist;
781 again:
782 	TAILQ_FOREACH(pmp, wlist, mntentry) {
783 		if ((iroot = pmp->iroot) == NULL)
784 			continue;
785 
786 		/*
787 		 * Determine if this PFS is affected.  If it is we must
788 		 * freeze all management threads and lock its iroot.
789 		 *
790 		 * Freezing a management thread forces it idle; operations
791 		 * in-progress will be aborted and it will have to start
792 		 * over again when unfrozen, or exit if told to exit.
793 		 */
794 		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
795 			if (pmp->pfs_hmps[i] == hmp)
796 				break;
797 		}
798 		if (i == HAMMER2_MAXCLUSTER)
799 			continue;
800 
801 		hammer2_vfs_sync_pmp(pmp, MNT_WAIT);
802 
803 		/*
804 		 * Make sure all synchronization threads are locked
805 		 * down.
806 		 */
807 		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
808 			if (pmp->pfs_hmps[i] == NULL)
809 				continue;
810 			hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
811 			if (pmp->xop_groups) {
812 				for (j = 0; j < hammer2_xopgroups; ++j) {
813 					hammer2_thr_freeze_async(
814 						&pmp->xop_groups[j].thrs[i]);
815 				}
816 			}
817 		}
818 		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
819 			if (pmp->pfs_hmps[i] == NULL)
820 				continue;
821 			hammer2_thr_freeze(&pmp->sync_thrs[i]);
822 			if (pmp->xop_groups) {
823 				for (j = 0; j < hammer2_xopgroups; ++j) {
824 					hammer2_thr_freeze(
825 						&pmp->xop_groups[j].thrs[i]);
826 				}
827 			}
828 		}
829 
830 		/*
831 		 * Lock the inode and clean out matching chains.
832 		 * Note that we cannot use hammer2_inode_lock_*()
833 		 * here because that would attempt to validate the
834 		 * cluster that we are in the middle of ripping
835 		 * apart.
836 		 *
837 		 * WARNING! We are working directly on the inode's
838 		 *	    embedded cluster.
839 		 */
840 		hammer2_mtx_ex(&iroot->lock);
841 
842 		/*
843 		 * Remove the chain from matching elements of the PFS.
844 		 */
845 		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
846 			if (pmp->pfs_hmps[i] != hmp)
847 				continue;
848 			hammer2_thr_delete(&pmp->sync_thrs[i]);
849 			if (pmp->xop_groups) {
850 				for (j = 0; j < hammer2_xopgroups; ++j) {
851 					hammer2_thr_delete(
852 						&pmp->xop_groups[j].thrs[i]);
853 				}
854 			}
855 			rchain = iroot->cluster.array[i].chain;
856 			iroot->cluster.array[i].chain = NULL;
857 			pmp->pfs_types[i] = 0;
858 			if (pmp->pfs_names[i]) {
859 				kfree(pmp->pfs_names[i], M_HAMMER2);
860 				pmp->pfs_names[i] = NULL;
861 			}
862 			if (rchain) {
863 				hammer2_chain_drop(rchain);
864 				/* focus hint */
865 				if (iroot->cluster.focus == rchain)
866 					iroot->cluster.focus = NULL;
867 			}
868 			pmp->pfs_hmps[i] = NULL;
869 		}
870 		hammer2_mtx_unlock(&iroot->lock);
871 
872 		/*
873 		 * Cleanup trailing chains.  Gaps may remain.
874 		 */
875 		for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
876 			if (pmp->pfs_hmps[i])
877 				break;
878 		}
879 		iroot->cluster.nchains = i + 1;
880 
881 		/*
882 		 * If the PMP has no elements remaining we can destroy it.
883 		 * (this will transition management threads from frozen->exit).
884 		 */
885 		if (iroot->cluster.nchains == 0) {
886 			/*
887 			 * If this was the hmp's spmp, we need to clean
888 			 * a little more stuff out.
889 			 */
890 			if (hmp->spmp == pmp) {
891 				hmp->spmp = NULL;
892 				hmp->vchain.pmp = NULL;
893 				hmp->fchain.pmp = NULL;
894 			}
895 
896 			/*
897 			 * Free the pmp and restart the loop
898 			 */
899 			KKASSERT(TAILQ_EMPTY(&pmp->syncq));
900 			KKASSERT(TAILQ_EMPTY(&pmp->depq));
901 			hammer2_pfsfree(pmp);
902 			goto again;
903 		}
904 
905 		/*
906 		 * If elements still remain we need to set the REMASTER
907 		 * flag on the remaining threads and unfreeze them.
908 		 */
909 		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
910 			if (pmp->pfs_hmps[i] == NULL)
911 				continue;
912 			hammer2_thr_remaster(&pmp->sync_thrs[i]);
913 			hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
914 			if (pmp->xop_groups) {
915 				for (j = 0; j < hammer2_xopgroups; ++j) {
916 					hammer2_thr_remaster(
917 						&pmp->xop_groups[j].thrs[i]);
918 					hammer2_thr_unfreeze(
919 						&pmp->xop_groups[j].thrs[i]);
920 				}
921 			}
922 		}
923 	}
924 }
925 
926 /*
927  * Mount or remount a HAMMER2 filesystem from physical media
928  *
929  *	mountroot
930  *		mp		mount point structure
931  *		path		NULL
932  *		data		<unused>
933  *		cred		<unused>
934  *
935  *	mount
936  *		mp		mount point structure
937  *		path		path to mount point
938  *		data		pointer to argument structure in user space
939  *			volume	volume path (device@LABEL form)
940  *			hflags	user mount flags
941  *		cred		user credentials
942  *
943  * RETURNS:	0	Success
944  *		!0	error number
945  */
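/*
 * Example invocation from userland (illustrative only; the device path
 * shown is hypothetical):
 *
 *	mount_hammer2 /dev/serno/XXX.s1d@DATA /mnt
 *	mount_hammer2 @SNAP1 /mnt/snap1
 *
 * The second form omits the device and relies on a label from that device
 * already being mounted, per the NOTE further below.
 */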
946 static
947 int
948 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
949 		  struct ucred *cred)
950 {
951 	struct hammer2_mount_info info;
952 	hammer2_pfs_t *pmp;
953 	hammer2_pfs_t *spmp;
954 	hammer2_dev_t *hmp;
955 	hammer2_dev_t *force_local;
956 	hammer2_key_t key_next;
957 	hammer2_key_t key_dummy;
958 	hammer2_key_t lhc;
959 	struct vnode *devvp;
960 	struct nlookupdata nd;
961 	hammer2_chain_t *parent;
962 	hammer2_chain_t *chain;
963 	const hammer2_inode_data_t *ripdata;
964 	hammer2_blockref_t bref;
965 	struct file *fp;
966 	char devstr[MNAMELEN];
967 	size_t size;
968 	size_t done;
969 	char *dev;
970 	char *label;
971 	int ronly = 1;
972 	int error;
973 	int i;
974 
975 	hmp = NULL;
976 	pmp = NULL;
977 	dev = NULL;
978 	label = NULL;
979 	devvp = NULL;
980 
981 	if (path == NULL) {
982 		/*
983 		 * Root mount
984 		 */
985 		bzero(&info, sizeof(info));
986 		info.cluster_fd = -1;
987 		ksnprintf(devstr, sizeof(devstr), "%s",
988 			  mp->mnt_stat.f_mntfromname);
989 		kprintf("hammer2_mount: root '%s'\n", devstr);
990 		done = strlen(devstr) + 1;
991 	} else {
992 		/*
993 		 * Non-root mount or updating a mount
994 		 */
995 		error = copyin(data, &info, sizeof(info));
996 		if (error)
997 			return (error);
998 
999 		error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
1000 		if (error)
1001 			return (error);
1002 		kprintf("hammer2_mount: '%s'\n", devstr);
1003 	}
1004 
1005 	/*
1006 	 * Extract device and label, automatically mount @BOOT, @ROOT, or @DATA
1007 	 * if no label is specified, based on the partition id.  Error out if no
1008 	 * label or device (with partition id) is specified.  This is strictly
1009 	 * a convenience to match the default label created by newfs_hammer2;
1010 	 * our preference is that a label always be specified.
1011 	 *
1012 	 * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command
1013 	 *	 that does not specify a device, as long as some H2 label
1014 	 *	 has already been mounted from that device.  This makes
1015 	 *	 mounting snapshots a lot easier.
1016 	 */
1017 	dev = devstr;
1018 	label = strchr(devstr, '@');
1019 	if (label && ((label + 1) - dev) > done) {
1020 		kprintf("hammer2: mount: bad label %s/%zd\n",
1021 			devstr, done);
1022 		return (EINVAL);
1023 	}
1024 	if (label == NULL || label[1] == 0) {
1025 		char slice;
1026 
1027 		if (label == NULL)
1028 			label = devstr + strlen(devstr);
1029 		else
1030 			*label = '\0';		/* clean up trailing @ */
1031 
1032 		slice = label[-1];
1033 		switch(slice) {
1034 		case 'a':
1035 			label = "BOOT";
1036 			break;
1037 		case 'd':
1038 			label = "ROOT";
1039 			break;
1040 		default:
1041 			label = "DATA";
1042 			break;
1043 		}
1044 	} else {
1045 		*label = '\0';
1046 		label++;
1047 	}
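
	/*
	 * Example of the defaulting above (illustrative; the device names
	 * are hypothetical): mounting "/dev/da0s1a" with no @LABEL selects
	 * @BOOT, "...s1d" selects @ROOT, and any other partition id selects
	 * @DATA.  An explicit "device@LABEL" string always wins.
	 */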
1048 
1049 	kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n",
1050 		dev, label, (mp->mnt_flag & MNT_RDONLY));
1051 
1052 	if (mp->mnt_flag & MNT_UPDATE) {
1053 		/*
1054 		 * Update mount.  Note that pmp->iroot->cluster is
1055 		 * an inode-embedded cluster and thus cannot be
1056 		 * directly locked.
1057 		 *
1058 		 * XXX HAMMER2 needs to implement NFS export via
1059 		 *     mountctl.
1060 		 */
1061 		hammer2_cluster_t *cluster;
1062 
1063 		pmp = MPTOPMP(mp);
1064 		pmp->hflags = info.hflags;
1065 		cluster = &pmp->iroot->cluster;
1066 		for (i = 0; i < cluster->nchains; ++i) {
1067 			if (cluster->array[i].chain == NULL)
1068 				continue;
1069 			hmp = cluster->array[i].chain->hmp;
1070 			devvp = hmp->devvp;
1071 			error = hammer2_remount(hmp, mp, path,
1072 						devvp, cred);
1073 			if (error)
1074 				break;
1075 		}
1076 
1077 		return error;
1078 	}
1079 
1080 	/*
1081 	 * HMP device mount
1082 	 *
1083 	 * If a path is specified and dev is not an empty string, look up the
1084 	 * name and verify that it refers to a block device.
1085 	 *
1086 	 * If a path is specified and dev is an empty string we fall through
1087 	 * and locate the label in the hmp search.
1088 	 */
1089 	if (path && *dev != 0) {
1090 		error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
1091 		if (error == 0)
1092 			error = nlookup(&nd);
1093 		if (error == 0)
1094 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
1095 		nlookup_done(&nd);
1096 	} else if (path == NULL) {
1097 		/* root mount */
1098 		cdev_t cdev = kgetdiskbyname(dev);
1099 		error = bdevvp(cdev, &devvp);
1100 		if (error)
1101 			kprintf("hammer2: cannot find '%s'\n", dev);
1102 	} else {
1103 		/*
1104 		 * We will locate the hmp using the label in the hmp loop.
1105 		 */
1106 		error = 0;
1107 	}
1108 
1109 	/*
1110 	 * Make sure it's a block device.  Do not check to see if it is
1111 	 * already mounted until we determine that it's a fresh H2 device.
1112 	 */
1113 	if (error == 0 && devvp) {
1114 		vn_isdisk(devvp, &error);
1115 	}
1116 
1117 	/*
1118 	 * Determine if the device has already been mounted.  After this
1119 	 * check hmp will be non-NULL if we are doing a second or subsequent
1120 	 * hammer2 mount from the same device.
1121 	 */
1122 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1123 	if (devvp) {
1124 		/*
1125 		 * Match the device.  Due to the way devfs works,
1126 		 * we may not be able to directly match the vnode pointer,
1127 		 * so also check to see if the underlying device matches.
1128 		 */
1129 		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
1130 			if (hmp->devvp == devvp)
1131 				break;
1132 			if (devvp->v_rdev &&
1133 			    hmp->devvp->v_rdev == devvp->v_rdev) {
1134 				break;
1135 			}
1136 		}
1137 
1138 		/*
1139 		 * If there is no match this may be a fresh H2 mount; make sure
1140 		 * the device is not mounted on anything else.
1141 		 */
1142 		if (hmp == NULL)
1143 			error = vfs_mountedon(devvp);
1144 	} else if (error == 0) {
1145 		/*
1146 		 * Match the label to a pmp already probed.
1147 		 */
1148 		TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
1149 			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
1150 				if (pmp->pfs_names[i] &&
1151 				    strcmp(pmp->pfs_names[i], label) == 0) {
1152 					hmp = pmp->pfs_hmps[i];
1153 					break;
1154 				}
1155 			}
1156 			if (hmp)
1157 				break;
1158 		}
1159 		if (hmp == NULL)
1160 			error = ENOENT;
1161 	}
1162 
1163 	/*
1164 	 * Open the device if this isn't a secondary mount and construct
1165 	 * the H2 device mount (hmp).
1166 	 */
1167 	if (hmp == NULL) {
1168 		hammer2_chain_t *schain;
1169 		hammer2_xid_t xid;
1170 		hammer2_xop_head_t xop;
1171 
1172 		if (error == 0 && vcount(devvp) > 0) {
1173 			kprintf("Primary device already has references\n");
1174 			error = EBUSY;
1175 		}
1176 
1177 		/*
1178 		 * Now open the device
1179 		 */
1180 		if (error == 0) {
1181 			ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
1182 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1183 			error = vinvalbuf(devvp, V_SAVE, 0, 0);
1184 			if (error == 0) {
1185 				error = VOP_OPEN(devvp,
1186 					     (ronly ? FREAD : FREAD | FWRITE),
1187 					     FSCRED, NULL);
1188 			}
1189 			vn_unlock(devvp);
1190 		}
1191 		if (error && devvp) {
1192 			vrele(devvp);
1193 			devvp = NULL;
1194 		}
1195 		if (error) {
1196 			lockmgr(&hammer2_mntlk, LK_RELEASE);
1197 			return error;
1198 		}
1199 		hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
1200 		ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
1201 		hmp->ronly = ronly;
1202 		hmp->devvp = devvp;
1203 		hmp->hflags = info.hflags & HMNT2_DEVFLAGS;
1204 		kmalloc_create(&hmp->mchain, "HAMMER2-chains");
1205 		TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
1206 		RB_INIT(&hmp->iotree);
1207 		spin_init(&hmp->io_spin, "h2mount_io");
1208 		spin_init(&hmp->list_spin, "h2mount_list");
1209 
1210 		lockinit(&hmp->vollk, "h2vol", 0, 0);
1211 		lockinit(&hmp->bulklk, "h2bulk", 0, 0);
1212 		lockinit(&hmp->bflock, "h2bflk", 0, 0);
1213 
1214 		/*
1215 		 * vchain setup. vchain.data is embedded.
1216 		 * vchain.refs is initialized and will never drop to 0.
1217 		 *
1218 		 * NOTE! voldata is not yet loaded.
1219 		 */
1220 		hmp->vchain.hmp = hmp;
1221 		hmp->vchain.refs = 1;
1222 		hmp->vchain.data = (void *)&hmp->voldata;
1223 		hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
1224 		hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
1225 		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
1226 
1227 		hammer2_chain_core_init(&hmp->vchain);
1228 		/* hmp->vchain.u.xxx is left NULL */
1229 
1230 		/*
1231 		 * fchain setup.  fchain.data is embedded.
1232 		 * fchain.refs is initialized and will never drop to 0.
1233 		 *
1234 		 * The data is not used but needs to be initialized to
1235 		 * pass assertion muster.  We use this chain primarily
1236 		 * as a placeholder for the freemap's top-level RBTREE
1237 		 * so it does not interfere with the volume's topology
1238 		 * RBTREE.
1239 		 */
1240 		hmp->fchain.hmp = hmp;
1241 		hmp->fchain.refs = 1;
1242 		hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
1243 		hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
1244 		hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
1245 		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
1246 		hmp->fchain.bref.methods =
1247 			HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
1248 			HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
1249 
1250 		hammer2_chain_core_init(&hmp->fchain);
1251 		/* hmp->fchain.u.xxx is left NULL */
1252 
1253 		/*
1254 		 * Install the volume header and initialize fields from
1255 		 * voldata.
1256 		 */
1257 		error = hammer2_install_volume_header(hmp);
1258 		if (error) {
1259 			hammer2_unmount_helper(mp, NULL, hmp);
1260 			lockmgr(&hammer2_mntlk, LK_RELEASE);
1261 			hammer2_vfs_unmount(mp, MNT_FORCE);
1262 			return error;
1263 		}
1264 
1265 		/*
1266 		 * Really important to get these right or the flush and
1267 		 * teardown code will get confused.
1268 		 */
1269 		hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
1270 		spmp = hmp->spmp;
1271 		spmp->pfs_hmps[0] = hmp;
1272 
1273 		/*
1274 		 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
1275 		 * is inherited from the volume header.
1276 		 */
1277 		xid = 0;
1278 		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
1279 		hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
1280 		hmp->vchain.pmp = spmp;
1281 		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
1282 		hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
1283 		hmp->fchain.pmp = spmp;
1284 
1285 		/*
1286 		 * First locate the super-root inode, which is key 0
1287 		 * relative to the volume header's blockset.
1288 		 *
1289 		 * Then locate the root inode by scanning the directory keyspace
1290 		 * represented by the label.
1291 		 */
1292 		parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
1293 		schain = hammer2_chain_lookup(&parent, &key_dummy,
1294 				      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
1295 				      &error, 0);
1296 		hammer2_chain_lookup_done(parent);
1297 		if (schain == NULL) {
1298 			kprintf("hammer2_mount: invalid super-root\n");
1299 			hammer2_unmount_helper(mp, NULL, hmp);
1300 			lockmgr(&hammer2_mntlk, LK_RELEASE);
1301 			hammer2_vfs_unmount(mp, MNT_FORCE);
1302 			return EINVAL;
1303 		}
1304 		if (schain->error) {
1305 			kprintf("hammer2_mount: error %s reading super-root\n",
1306 				hammer2_error_str(schain->error));
1307 			hammer2_chain_unlock(schain);
1308 			hammer2_chain_drop(schain);
1309 			schain = NULL;
1310 			hammer2_unmount_helper(mp, NULL, hmp);
1311 			lockmgr(&hammer2_mntlk, LK_RELEASE);
1312 			hammer2_vfs_unmount(mp, MNT_FORCE);
1313 			return EINVAL;
1314 		}
1315 
1316 		/*
1317 		 * The super-root always uses an inode_tid of 1 when
1318 		 * creating PFSs.
1319 		 */
1320 		spmp->inode_tid = 1;
1321 		spmp->modify_tid = schain->bref.modify_tid + 1;
1322 
1323 		/*
1324 		 * Sanity-check schain's pmp and finish initialization.
1325 		 * Any chain belonging to the super-root topology should
1326 		 * have a NULL pmp (not even set to spmp).
1327 		 */
1328 		ripdata = &hammer2_chain_rdata(schain)->ipdata;
1329 		KKASSERT(schain->pmp == NULL);
1330 		spmp->pfs_clid = ripdata->meta.pfs_clid;
1331 
1332 		/*
1333 		 * Replace the dummy spmp->iroot with a real one.  It's
1334 		 * easier to just do a wholesale replacement than to try
1335 		 * to update the chain and fixup the iroot fields.
1336 		 *
1337 		 * The returned inode is locked with the supplied cluster.
1338 		 */
1339 		hammer2_dummy_xop_from_chain(&xop, schain);
1340 		hammer2_inode_drop(spmp->iroot);
1341 		spmp->iroot = NULL;
1342 		spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1);
1343 		spmp->spmp_hmp = hmp;
1344 		spmp->pfs_types[0] = ripdata->meta.pfs_type;
1345 		spmp->pfs_hmps[0] = hmp;
1346 		hammer2_inode_ref(spmp->iroot);
1347 		hammer2_inode_unlock(spmp->iroot);
1348 		hammer2_cluster_unlock(&xop.cluster);
1349 		hammer2_chain_drop(schain);
1350 		/* do not call hammer2_cluster_drop() on an embedded cluster */
1351 		schain = NULL;	/* now invalid */
1352 		/* leave spmp->iroot with one ref */
1353 
1354 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1355 			error = hammer2_recovery(hmp);
1356 			if (error == 0)
1357 				error |= hammer2_fixup_pfses(hmp);
1358 			/* XXX do something with error */
1359 		}
1360 		hammer2_update_pmps(hmp);
1361 		hammer2_iocom_init(hmp);
1362 		hammer2_bulkfree_init(hmp);
1363 
1364 		/*
1365 		 * Ref the cluster management messaging descriptor.  The mount
1366 		 * program deals with the other end of the communications pipe.
1367 		 *
1368 		 * Root mounts typically do not supply one.
1369 		 */
1370 		if (info.cluster_fd >= 0) {
1371 			fp = holdfp(curthread, info.cluster_fd, -1);
1372 			if (fp) {
1373 				hammer2_cluster_reconnect(hmp, fp);
1374 			} else {
1375 				kprintf("hammer2_mount: bad cluster_fd!\n");
1376 			}
1377 		}
1378 	} else {
1379 		spmp = hmp->spmp;
1380 		if (info.hflags & HMNT2_DEVFLAGS) {
1381 			kprintf("hammer2: Warning: mount flags pertaining "
1382 				"to the whole device may only be specified "
1383 				"on the first mount of the device: %08x\n",
1384 				info.hflags & HMNT2_DEVFLAGS);
1385 		}
1386 	}
1387 
1388 	/*
1389 	 * Force local mount (disassociate all PFSs from their clusters).
1390 	 * Used primarily for debugging.
1391 	 */
1392 	force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;
1393 
1394 	/*
1395 	 * Lookup the mount point under the media-localized super-root.
1396 	 * Scanning hammer2_pfslist doesn't help us because it represents
1397 	 * PFS cluster ids which can aggregate several named PFSs together.
1398 	 *
1399 	 * cluster->pmp will incorrectly point to spmp and must be fixed
1400 	 * up later on.
1401 	 */
1402 	hammer2_inode_lock(spmp->iroot, 0);
1403 	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
1404 	lhc = hammer2_dirhash(label, strlen(label));
1405 	chain = hammer2_chain_lookup(&parent, &key_next,
1406 				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1407 				     &error, 0);
1408 	while (chain) {
1409 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1410 		    strcmp(label, chain->data->ipdata.filename) == 0) {
1411 			break;
1412 		}
1413 		chain = hammer2_chain_next(&parent, chain, &key_next,
1414 					    key_next,
1415 					    lhc + HAMMER2_DIRHASH_LOMASK,
1416 					    &error, 0);
1417 	}
1418 	if (parent) {
1419 		hammer2_chain_unlock(parent);
1420 		hammer2_chain_drop(parent);
1421 	}
1422 	hammer2_inode_unlock(spmp->iroot);
1423 
1424 	/*
1425 	 * PFS could not be found?
1426 	 */
1427 	if (chain == NULL) {
1428 		if (error)
1429 			kprintf("hammer2_mount: PFS label I/O error\n");
1430 		else
1431 			kprintf("hammer2_mount: PFS label not found\n");
1432 		hammer2_unmount_helper(mp, NULL, hmp);
1433 		lockmgr(&hammer2_mntlk, LK_RELEASE);
1434 		hammer2_vfs_unmount(mp, MNT_FORCE);
1435 
1436 		return EINVAL;
1437 	}
1438 
1439 	/*
1440 	 * Acquire the pmp structure (it should have already been allocated
1441 	 * via hammer2_update_pmps() so do not pass cluster in to add to
1442 	 * available chains).
1443 	 *
1444 	 * Check if the cluster has already been mounted.  A cluster can
1445 	 * only be mounted once, use null mounts to mount additional copies.
1446 	 */
1447 	if (chain->error) {
1448 		kprintf("hammer2_mount: PFS label I/O error\n");
1449 	} else {
1450 		ripdata = &chain->data->ipdata;
1451 		bref = chain->bref;
1452 		pmp = hammer2_pfsalloc(NULL, ripdata,
1453 				       bref.modify_tid, force_local);
1454 	}
1455 	hammer2_chain_unlock(chain);
1456 	hammer2_chain_drop(chain);
1457 
1458 	/*
1459 	 * Finish the mount
1460 	 */
1461         kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);
1462 
1463 	if (pmp->mp) {
1464 		kprintf("hammer2_mount: PFS already mounted!\n");
1465 		hammer2_unmount_helper(mp, NULL, hmp);
1466 		lockmgr(&hammer2_mntlk, LK_RELEASE);
1467 		hammer2_vfs_unmount(mp, MNT_FORCE);
1468 
1469 		return EBUSY;
1470 	}
1471 
1472 	pmp->hflags = info.hflags;
1473         mp->mnt_flag |= MNT_LOCAL;
1474         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
1475         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
1476 
1477         /*
1478          * required mount structure initializations
1479          */
1480         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
1481         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
1482 
1483         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
1484         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1485 
1486         /*
1487          * Optional fields
1488          */
1489         mp->mnt_iosize_max = MAXPHYS;
1490 
1491 	/*
1492 	 * Connect up mount pointers.
1493 	 */
1494 	hammer2_mount_helper(mp, pmp);
1495 
1496         lockmgr(&hammer2_mntlk, LK_RELEASE);
1497 
1498 	/*
1499 	 * Finish setup
1500 	 */
1501 	vfs_getnewfsid(mp);
1502 	vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
1503 	vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
1504 	vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
1505 
1506 	if (path) {
1507 		copyinstr(info.volume, mp->mnt_stat.f_mntfromname,
1508 			  MNAMELEN - 1, &size);
1509 		bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
1510 	} /* else root mount, already in there */
1511 
1512 	bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
1513 	if (path) {
1514 		copyinstr(path, mp->mnt_stat.f_mntonname,
1515 			  sizeof(mp->mnt_stat.f_mntonname) - 1,
1516 			  &size);
1517 	} else {
1518 		/* root mount */
1519 		mp->mnt_stat.f_mntonname[0] = '/';
1520 	}
1521 
1522 	/*
1523 	 * Initial statfs to prime mnt_stat.
1524 	 */
1525 	hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
1526 
1527 	return 0;
1528 }
1529 
1530 /*
1531  * Scan PFSs under the super-root and create hammer2_pfs structures.
1532  */
1533 static
1534 void
1535 hammer2_update_pmps(hammer2_dev_t *hmp)
1536 {
1537 	const hammer2_inode_data_t *ripdata;
1538 	hammer2_chain_t *parent;
1539 	hammer2_chain_t *chain;
1540 	hammer2_blockref_t bref;
1541 	hammer2_dev_t *force_local;
1542 	hammer2_pfs_t *spmp;
1543 	hammer2_pfs_t *pmp;
1544 	hammer2_key_t key_next;
1545 	int error;
1546 
1547 	/*
1548 	 * Force local mount (disassociate all PFSs from their clusters).
1549 	 * Used primarily for debugging.
1550 	 */
1551 	force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;
1552 
1553 	/*
1554 	 * Lookup mount point under the media-localized super-root.
1555 	 *
1556 	 * cluster->pmp will incorrectly point to spmp and must be fixed
1557 	 * up later on.
1558 	 */
1559 	spmp = hmp->spmp;
1560 	hammer2_inode_lock(spmp->iroot, 0);
1561 	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
1562 	chain = hammer2_chain_lookup(&parent, &key_next,
1563 					 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
1564 					 &error, 0);
1565 	while (chain) {
1566 		if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1567 			/* skip non-inode chains; chain is advanced below */
1568 		} else if (chain->error) {
1569 			kprintf("I/O error scanning PFS labels\n");
1570 		} else {
1571 			ripdata = &chain->data->ipdata;
1572 			bref = chain->bref;
1573 
1574 			pmp = hammer2_pfsalloc(chain, ripdata,
1575 					       bref.modify_tid, force_local);
1576 		}
1577 		chain = hammer2_chain_next(&parent, chain, &key_next,
1578 					   key_next, HAMMER2_KEY_MAX,
1579 					   &error, 0);
1580 	}
1581 	if (parent) {
1582 		hammer2_chain_unlock(parent);
1583 		hammer2_chain_drop(parent);
1584 	}
1585 	hammer2_inode_unlock(spmp->iroot);
1586 }
1587 
1588 static
1589 int
1590 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused,
1591 		struct vnode *devvp, struct ucred *cred)
1592 {
1593 	int error;
1594 
1595 	if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1596 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1597 		VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL);
1598 		vn_unlock(devvp);
1599 		error = hammer2_recovery(hmp);
1600 		if (error == 0)
1601 			error |= hammer2_fixup_pfses(hmp);
1602 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1603 		if (error == 0) {
1604 			VOP_CLOSE(devvp, FREAD, NULL);
1605 			hmp->ronly = 0;
1606 		} else {
1607 			VOP_CLOSE(devvp, FREAD | FWRITE, NULL);
1608 		}
1609 		vn_unlock(devvp);
1610 	} else {
1611 		error = 0;
1612 	}
1613 	return error;
1614 }
1615 
1616 static
1617 int
1618 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1619 {
1620 	hammer2_pfs_t *pmp;
1621 	int flags;
1622 	int error = 0;
1623 
1624 	pmp = MPTOPMP(mp);
1625 
1626 	if (pmp == NULL)
1627 		return(0);
1628 
1629 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1630 
1631 	/*
1632 	 * If mount initialization proceeded far enough we must flush
1633 	 * its vnodes and sync the underlying mount points.  Three syncs
1634 	 * are required to fully flush the filesystem (freemap updates lag
1635 	 * by one flush, and the third sync is an extra for safety).
1636 	 */
1637 	if (mntflags & MNT_FORCE)
1638 		flags = FORCECLOSE;
1639 	else
1640 		flags = 0;
1641 	if (pmp->iroot) {
1642 		error = vflush(mp, 0, flags);
1643 		if (error)
1644 			goto failed;
1645 		hammer2_vfs_sync(mp, MNT_WAIT);
1646 		hammer2_vfs_sync(mp, MNT_WAIT);
1647 		hammer2_vfs_sync(mp, MNT_WAIT);
1648 	}
1649 
1650 	/*
1651 	 * Cleanup the frontend support XOPS threads
1652 	 */
1653 	hammer2_xop_helper_cleanup(pmp);
1654 
1655 	if (pmp->mp)
1656 		hammer2_unmount_helper(mp, pmp, NULL);
1657 
1658 	error = 0;
1659 failed:
1660 	lockmgr(&hammer2_mntlk, LK_RELEASE);
1661 
1662 	return (error);
1663 }
1664 
1665 /*
1666  * Mount helper, hook the system mount into our PFS.
1667  * The mount lock is held.
1668  *
1669  * We must bump the mount_count on related devices for any
1670  * mounted PFSs.
1671  */
1672 static
1673 void
1674 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1675 {
1676 	hammer2_cluster_t *cluster;
1677 	hammer2_chain_t *rchain;
1678 	int i;
1679 
1680         mp->mnt_data = (qaddr_t)pmp;
1681 	pmp->mp = mp;
1682 
1683 	/*
1684 	 * After pmp->mp is set we have to adjust hmp->mount_count.
1685 	 */
1686 	cluster = &pmp->iroot->cluster;
1687 	for (i = 0; i < cluster->nchains; ++i) {
1688 		rchain = cluster->array[i].chain;
1689 		if (rchain == NULL)
1690 			continue;
1691 		++rchain->hmp->mount_count;
1692 	}
1693 
1694 	/*
1695 	 * Create missing Xop threads
1696 	 */
1697 	hammer2_xop_helper_create(pmp);
1698 }
1699 
1700 /*
1701  * Mount helper, unhook the system mount from our PFS.
1702  * The mount lock is held.
1703  *
1704  * If hmp is supplied a mount responsible for being the first to open
1705  * the block device failed and the block device and all PFSs using the
1706  * block device must be cleaned up.
1707  *
1708  * If pmp is supplied multiple devices might be backing the PFS and each
1709  * must be disconnected.  This might not be the last PFS using some of the
1710  * underlying devices.  Also, we have to adjust our hmp->mount_count
1711  * accounting for the devices backing the pmp which is now undergoing an
1712  * unmount.
1713  */
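/*
 * Call forms used in this file (for reference):
 *
 *	hammer2_unmount_helper(mp, pmp, NULL)	- normal high-level unmount
 *	hammer2_unmount_helper(mp, NULL, hmp)	- cleanup after a failed
 *						  device mount
 *	hammer2_unmount_helper(NULL, NULL, hmp)	- final teardown of a device
 *						  with no remaining mounts
 */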
1714 static
1715 void
1716 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1717 {
1718 	hammer2_cluster_t *cluster;
1719 	hammer2_chain_t *rchain;
1720 	struct vnode *devvp;
1721 	int dumpcnt;
1722 	int ronly;
1723 	int i;
1724 
1725 	/*
1726 	 * If no device is supplied this is a high-level unmount and we have
1727 	 * to disconnect the mount, adjust mount_count, and locate devices
1728 	 * that might now have no mounts.
1729 	 */
1730 	if (pmp) {
1731 		KKASSERT(hmp == NULL);
1732 		KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
1733 		pmp->mp = NULL;
1734 		mp->mnt_data = NULL;
1735 
1736 		/*
1737 		 * After pmp->mp is cleared we have to account for
1738 		 * mount_count.
1739 		 */
1740 		cluster = &pmp->iroot->cluster;
1741 		for (i = 0; i < cluster->nchains; ++i) {
1742 			rchain = cluster->array[i].chain;
1743 			if (rchain == NULL)
1744 				continue;
1745 			--rchain->hmp->mount_count;
1746 			/* scrapping hmp now may invalidate the pmp */
1747 		}
1748 again:
1749 		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
1750 			if (hmp->mount_count == 0) {
1751 				hammer2_unmount_helper(NULL, NULL, hmp);
1752 				goto again;
1753 			}
1754 		}
1755 		return;
1756 	}
1757 
1758 	/*
1759 	 * Try to terminate the block device.  We can't terminate it if
1760 	 * there are still PFSs referencing it.
1761 	 */
1762 	if (hmp->mount_count)
1763 		return;
1764 
1765 	/*
1766 	 * Decomission the network before we start messing with the
1767 	 * device and PFS.
1768 	 */
1769 	hammer2_iocom_uninit(hmp);
1770 
1771 	hammer2_bulkfree_uninit(hmp);
1772 	hammer2_pfsfree_scan(hmp, 0);
1773 #if 0
1774 	hammer2_dev_exlock(hmp);	/* XXX order */
1775 #endif
1776 
1777 	/*
1778 	 * Cycle the volume data lock as a safety (probably not needed any
1779 	 * more).  To ensure everything is out we need to flush at least
1780 	 * three times: (1) running the sideq can dirty the filesystem,
1781 	 * (2) a normal flush can dirty the freemap, and (3) a final flush
1782 	 * ensures that the freemap itself is fully synchronized.
1783 	 *
1784 	 * The next mount's recovery scan can clean everything up but we want
1785 	 * to leave the filesystem in a 100% clean state on a normal unmount.
1786 	 */
1787 #if 0
1788 	hammer2_voldata_lock(hmp);
1789 	hammer2_voldata_unlock(hmp);
1790 #endif
1791 
1792 	/*
1793 	 * Flush whatever is left.  Unmounted but modified PFS's might still
1794 	 * have some dirty chains on them.
1795 	 */
1796 	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1797 	hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1798 
1799 	if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1800 		hammer2_voldata_modify(hmp);
1801 		hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
1802 					    HAMMER2_FLUSH_ALL);
1803 	}
1804 	hammer2_chain_unlock(&hmp->fchain);
1805 
1806 	if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1807 		hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
1808 					    HAMMER2_FLUSH_ALL);
1809 	}
1810 	hammer2_chain_unlock(&hmp->vchain);
1811 
1812 	if ((hmp->vchain.flags | hmp->fchain.flags) &
1813 	    HAMMER2_CHAIN_FLUSH_MASK) {
1814 		kprintf("hammer2_unmount: chains left over "
1815 			"after final sync\n");
1816 		kprintf("    vchain %08x\n", hmp->vchain.flags);
1817 		kprintf("    fchain %08x\n", hmp->fchain.flags);
1818 
1819 		if (hammer2_debug & 0x0010)
1820 			Debugger("entered debugger");
1821 	}
1822 
1823 	hammer2_pfsfree_scan(hmp, 1);
1824 
1825 	KKASSERT(hmp->spmp == NULL);
1826 
1827 	/*
1828 	 * Finish up with the device vnode
1829 	 */
1830 	if ((devvp = hmp->devvp) != NULL) {
1831 		ronly = hmp->ronly;
1832 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1833 		kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n",
1834 			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree),
1835 			ronly);
1836 		vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1837 		kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n",
1838 			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree));
1839 		hmp->devvp = NULL;
1840 		VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
1841 		vn_unlock(devvp);
1842 		vrele(devvp);
1843 		devvp = NULL;
1844 	}
1845 
1846 	/*
1847 	 * Clear vchain/fchain flags that might prevent final cleanup
1848 	 * of these chains.
1849 	 */
1850 	if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
1851 		atomic_add_long(&hammer2_count_modified_chains, -1);
1852 		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
1853 		hammer2_pfs_memory_wakeup(hmp->vchain.pmp, -1);
1854 	}
1855 	if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
1856 		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
1857 	}
1858 
1859 	if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
1860 		atomic_add_long(&hammer2_count_modified_chains, -1);
1861 		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
1862 		hammer2_pfs_memory_wakeup(hmp->fchain.pmp, -1);
1863 	}
1864 	if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
1865 		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
1866 	}
1867 
1868 	/*
1869 	 * Final drop of embedded freemap root chain to
1870 	 * clean up fchain.core (fchain structure is not
1871 	 * flagged ALLOCATED so it is cleaned out and then
1872 	 * left to rot).
1873 	 */
1874 	hammer2_chain_drop(&hmp->fchain);
1875 
1876 	/*
1877 	 * Final drop of embedded volume root chain to clean
1878 	 * up vchain.core (vchain structure is not flagged
1879 	 * ALLOCATED so it is cleaned out and then left to
1880 	 * rot).
1881 	 */
1882 	dumpcnt = 50;
1883 	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
1884 	dumpcnt = 50;
1885 	hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
1886 #if 0
1887 	hammer2_dev_unlock(hmp);
1888 #endif
1889 	hammer2_chain_drop(&hmp->vchain);
1890 
1891 	hammer2_io_cleanup(hmp, &hmp->iotree);
1892 	if (hmp->iofree_count) {
1893 		kprintf("io_cleanup: %d I/O's left hanging\n",
1894 			hmp->iofree_count);
1895 	}
1896 
1897 	TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1898 	kmalloc_destroy(&hmp->mchain);
1899 	kfree(hmp, M_HAMMER2);
1900 }
1901 
1902 int
1903 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
1904 		 ino_t ino, struct vnode **vpp)
1905 {
1906 	hammer2_xop_lookup_t *xop;
1907 	hammer2_pfs_t *pmp;
1908 	hammer2_inode_t *ip;
1909 	hammer2_tid_t inum;
1910 	int error;
1911 
1912 	inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK;
1913 
1914 	error = 0;
1915 	pmp = MPTOPMP(mp);
1916 
1917 	/*
1918 	 * Easy if we already have it cached
1919 	 */
1920 	ip = hammer2_inode_lookup(pmp, inum);
1921 	if (ip) {
1922 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
1923 		*vpp = hammer2_igetv(ip, &error);
1924 		hammer2_inode_unlock(ip);
1925 		hammer2_inode_drop(ip);		/* from lookup */
1926 
1927 		return error;
1928 	}
1929 
1930 	/*
1931 	 * Otherwise we have to find the inode
1932 	 */
1933 	xop = hammer2_xop_alloc(pmp->iroot, 0);
1934 	xop->lhc = inum;
1935 	hammer2_xop_start(&xop->head, &hammer2_lookup_desc);
1936 	error = hammer2_xop_collect(&xop->head, 0);
1937 
1938 	if (error == 0)
1939 		ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
1940 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1941 
1942 	if (ip) {
1943 		*vpp = hammer2_igetv(ip, &error);
1944 		hammer2_inode_unlock(ip);
1945 	} else {
1946 		*vpp = NULL;
1947 		error = ENOENT;
1948 	}
1949 	return (error);
1950 }
1951 
1952 static
1953 int
1954 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1955 {
1956 	hammer2_pfs_t *pmp;
1957 	struct vnode *vp;
1958 	int error;
1959 
1960 	pmp = MPTOPMP(mp);
1961 	if (pmp->iroot == NULL) {
1962 		kprintf("hammer2 (%s): no root inode\n",
1963 			mp->mnt_stat.f_mntfromname);
1964 		*vpp = NULL;
1965 		return EINVAL;
1966 	}
1967 
1968 	error = 0;
1969 	hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
1970 
1971 	while (pmp->inode_tid == 0) {
1972 		hammer2_xop_ipcluster_t *xop;
1973 		const hammer2_inode_meta_t *meta;
1974 
1975 		xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING);
1976 		hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc);
1977 		error = hammer2_xop_collect(&xop->head, 0);
1978 
1979 		if (error == 0) {
1980 			meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta;
1981 			pmp->iroot->meta = *meta;
1982 			pmp->inode_tid = meta->pfs_inum + 1;
1983 			hammer2_xop_pdata(&xop->head);
1984 			/* meta invalid */
1985 
1986 			if (pmp->inode_tid < HAMMER2_INODE_START)
1987 				pmp->inode_tid = HAMMER2_INODE_START;
1988 			pmp->modify_tid =
1989 				xop->head.cluster.focus->bref.modify_tid + 1;
1990 #if 0
1991 			kprintf("PFS: Starting inode %jd\n",
1992 				(intmax_t)pmp->inode_tid);
1993 			kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
1994 				pmp->inode_tid, pmp->modify_tid);
1995 #endif
1996 			wakeup(&pmp->iroot);
1997 
1998 			hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1999 
2000 			/*
2001 			 * Prime the mount info.
2002 			 */
2003 			hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL);
2004 			break;
2005 		}
2006 
2007 		/*
2008 		 * Loop, try again
2009 		 */
2010 		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2011 		hammer2_inode_unlock(pmp->iroot);
2012 		error = tsleep(&pmp->iroot, PCATCH, "h2root", hz);
2013 		hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
2014 		if (error == EINTR)
2015 			break;
2016 	}
2017 
2018 	if (error) {
2019 		hammer2_inode_unlock(pmp->iroot);
2020 		*vpp = NULL;
2021 	} else {
2022 		vp = hammer2_igetv(pmp->iroot, &error);
2023 		hammer2_inode_unlock(pmp->iroot);
2024 		*vpp = vp;
2025 	}
2026 
2027 	return (error);
2028 }
2029 
2030 /*
2031  * Filesystem status
2032  *
2033  * XXX incorporate ipdata->meta.inode_quota and data_quota
2034  */
2035 static
2036 int
2037 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2038 {
2039 	hammer2_pfs_t *pmp;
2040 	hammer2_dev_t *hmp;
2041 	hammer2_blockref_t bref;
2042 	struct statfs tmp;
2043 	int i;
2044 
2045 	/*
2046 	 * NOTE: iroot might not have validated the cluster yet.
2047 	 */
2048 	pmp = MPTOPMP(mp);
2049 
2050 	bzero(&tmp, sizeof(tmp));
2051 
2052 	for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2053 		hmp = pmp->pfs_hmps[i];
2054 		if (hmp == NULL)
2055 			continue;
2056 		if (pmp->iroot->cluster.array[i].chain)
2057 			bref = pmp->iroot->cluster.array[i].chain->bref;
2058 		else
2059 			bzero(&bref, sizeof(bref));
2060 
2061 		tmp.f_files = bref.embed.stats.inode_count;
2062 		tmp.f_ffree = 0;
2063 		tmp.f_blocks = hmp->voldata.allocator_size /
2064 			       mp->mnt_vstat.f_bsize;
2065 		tmp.f_bfree = hmp->voldata.allocator_free /
2066 			      mp->mnt_vstat.f_bsize;
2067 		tmp.f_bavail = tmp.f_bfree;
2068 
2069 		if (cred && cred->cr_uid != 0) {
2070 			uint64_t adj;
2071 
2072 			/* 5% */
2073 			adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2074 			tmp.f_blocks -= adj;
2075 			tmp.f_bfree -= adj;
2076 			tmp.f_bavail -= adj;
2077 		}
2078 
2079 		mp->mnt_stat.f_blocks = tmp.f_blocks;
2080 		mp->mnt_stat.f_bfree = tmp.f_bfree;
2081 		mp->mnt_stat.f_bavail = tmp.f_bavail;
2082 		mp->mnt_stat.f_files = tmp.f_files;
2083 		mp->mnt_stat.f_ffree = tmp.f_ffree;
2084 
2085 		*sbp = mp->mnt_stat;
2086 	}
2087 	return (0);
2088 }
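
/*
 * Worked example of the reservation adjustment above.  The volume and
 * block sizes here are hypothetical; only the 1/20 (5%) relationship
 * between free_reserved and allocator_size is taken from this file.
 * Assuming allocator_size = 100 GiB and f_bsize = 64 KiB:
 *
 *	f_blocks = 100 GiB / 64 KiB = 1,638,400 blocks
 *	adj      = free_reserved / f_bsize
 *	         = (100 GiB / 20) / 64 KiB = 81,920 blocks
 *
 * A non-root caller therefore sees f_blocks, f_bfree and f_bavail each
 * reduced by 81,920 blocks, while root sees the unadjusted figures.
 */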
2089 
2090 static
2091 int
2092 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2093 {
2094 	hammer2_pfs_t *pmp;
2095 	hammer2_dev_t *hmp;
2096 	hammer2_blockref_t bref;
2097 	struct statvfs tmp;
2098 	int i;
2099 
2100 	/*
2101 	 * NOTE: iroot might not have validated the cluster yet.
2102 	 */
2103 	pmp = MPTOPMP(mp);
2104 	bzero(&tmp, sizeof(tmp));
2105 
2106 	for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2107 		hmp = pmp->pfs_hmps[i];
2108 		if (hmp == NULL)
2109 			continue;
2110 		if (pmp->iroot->cluster.array[i].chain)
2111 			bref = pmp->iroot->cluster.array[i].chain->bref;
2112 		else
2113 			bzero(&bref, sizeof(bref));
2114 
2115 		tmp.f_files = bref.embed.stats.inode_count;
2116 		tmp.f_ffree = 0;
2117 		tmp.f_blocks = hmp->voldata.allocator_size /
2118 			       mp->mnt_vstat.f_bsize;
2119 		tmp.f_bfree = hmp->voldata.allocator_free /
2120 			      mp->mnt_vstat.f_bsize;
2121 		tmp.f_bavail = tmp.f_bfree;
2122 
2123 		if (cred && cred->cr_uid != 0) {
2124 			uint64_t adj;
2125 
2126 			/* 5% */
2127 			adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2128 			tmp.f_blocks -= adj;
2129 			tmp.f_bfree -= adj;
2130 			tmp.f_bavail -= adj;
2131 		}
2132 
2133 		mp->mnt_vstat.f_blocks = tmp.f_blocks;
2134 		mp->mnt_vstat.f_bfree = tmp.f_bfree;
2135 		mp->mnt_vstat.f_bavail = tmp.f_bavail;
2136 		mp->mnt_vstat.f_files = tmp.f_files;
2137 		mp->mnt_vstat.f_ffree = tmp.f_ffree;
2138 
2139 		*sbp = mp->mnt_vstat;
2140 	}
2141 	return (0);
2142 }
2143 
2144 /*
2145  * Mount-time recovery (RW mounts)
2146  *
2147  * Updates to the free block table are allowed to lag flushes by one
2148  * transaction.  If a crash occurred, then on a fresh mount we must do an
2149  * incremental scan of the last committed transaction id and make sure that
2150  * all related blocks have been marked allocated.
2151  *
2152  * The super-root topology and each PFS have their own transaction id domains,
2153  * so we must track PFS boundary transitions.
2154  */
2155 struct hammer2_recovery_elm {
2156 	TAILQ_ENTRY(hammer2_recovery_elm) entry;
2157 	hammer2_chain_t *chain;
2158 	hammer2_tid_t sync_tid;
2159 };
2160 
2161 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2162 
2163 struct hammer2_recovery_info {
2164 	struct hammer2_recovery_list list;
2165 	hammer2_tid_t	mtid;
2166 	int	depth;
2167 };
2168 
2169 static int hammer2_recovery_scan(hammer2_dev_t *hmp,
2170 			hammer2_chain_t *parent,
2171 			struct hammer2_recovery_info *info,
2172 			hammer2_tid_t sync_tid);
2173 
2174 #define HAMMER2_RECOVERY_MAXDEPTH	10
2175 
2176 static
2177 int
2178 hammer2_recovery(hammer2_dev_t *hmp)
2179 {
2180 	struct hammer2_recovery_info info;
2181 	struct hammer2_recovery_elm *elm;
2182 	hammer2_chain_t *parent;
2183 	hammer2_tid_t sync_tid;
2184 	hammer2_tid_t mirror_tid;
2185 	int error;
2186 
2187 	hammer2_trans_init(hmp->spmp, 0);
2188 
2189 	sync_tid = hmp->voldata.freemap_tid;
2190 	mirror_tid = hmp->voldata.mirror_tid;
2191 
2192 	kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2193 	if (sync_tid >= mirror_tid) {
2194 		kprintf(" no recovery needed\n");
2195 	} else {
2196 		kprintf(" freemap recovery %016jx-%016jx\n",
2197 			sync_tid + 1, mirror_tid);
2198 	}
2199 
2200 	TAILQ_INIT(&info.list);
2201 	info.depth = 0;
2202 	parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2203 	error = hammer2_recovery_scan(hmp, parent, &info, sync_tid);
2204 	hammer2_chain_lookup_done(parent);
2205 
2206 	while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2207 		TAILQ_REMOVE(&info.list, elm, entry);
2208 		parent = elm->chain;
2209 		sync_tid = elm->sync_tid;
2210 		kfree(elm, M_HAMMER2);
2211 
2212 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2213 		error |= hammer2_recovery_scan(hmp, parent, &info,
2214 					      hmp->voldata.freemap_tid);
2215 		hammer2_chain_unlock(parent);
2216 		hammer2_chain_drop(parent);	/* drop elm->chain ref */
2217 	}
2218 
2219 	hammer2_trans_done(hmp->spmp, 0);
2220 
2221 	return error;
2222 }
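
/*
 * Worked example of the recovery window (the TIDs are hypothetical).  If
 * the volume header records freemap_tid = 0x40 and mirror_tid = 0x43, the
 * freemap lagged the last flush, so the code above reports "freemap
 * recovery 0000000000000041-0000000000000043" and the scan re-marks as
 * allocated every block reachable through brefs whose mirror_tid exceeds
 * 0x40.  When the two TIDs are equal no recovery work is needed.
 */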
2223 
2224 static
2225 int
2226 hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
2227 		      struct hammer2_recovery_info *info,
2228 		      hammer2_tid_t sync_tid)
2229 {
2230 	const hammer2_inode_data_t *ripdata;
2231 	hammer2_chain_t *chain;
2232 	hammer2_blockref_t bref;
2233 	int tmp_error;
2234 	int rup_error;
2235 	int error;
2236 	int first;
2237 
2238 	/*
2239 	 * Adjust freemap to ensure that the block(s) are marked allocated.
2240 	 */
2241 	if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2242 		hammer2_freemap_adjust(hmp, &parent->bref,
2243 				       HAMMER2_FREEMAP_DORECOVER);
2244 	}
2245 
2246 	/*
2247 	 * Check type for recursive scan
2248 	 */
2249 	switch(parent->bref.type) {
2250 	case HAMMER2_BREF_TYPE_VOLUME:
2251 		/* data already instantiated */
2252 		break;
2253 	case HAMMER2_BREF_TYPE_INODE:
2254 		/*
2255 		 * Must instantiate data for DIRECTDATA test and also
2256 		 * for recursion.
2257 		 */
2258 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2259 		ripdata = &hammer2_chain_rdata(parent)->ipdata;
2260 		if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2261 			/* not applicable to recovery scan */
2262 			hammer2_chain_unlock(parent);
2263 			return 0;
2264 		}
2265 		hammer2_chain_unlock(parent);
2266 		break;
2267 	case HAMMER2_BREF_TYPE_INDIRECT:
2268 		/*
2269 		 * Must instantiate data for recursion
2270 		 */
2271 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2272 		hammer2_chain_unlock(parent);
2273 		break;
2274 	case HAMMER2_BREF_TYPE_DIRENT:
2275 	case HAMMER2_BREF_TYPE_DATA:
2276 	case HAMMER2_BREF_TYPE_FREEMAP:
2277 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2278 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2279 		/* not applicable to recovery scan */
2280 		return 0;
2281 		break;
2282 	default:
2283 		return HAMMER2_ERROR_BADBREF;
2284 	}
2285 
2286 	/*
2287 	 * Defer operation if depth limit reached or if we are crossing a
2288 	 * PFS boundary.
2289 	 */
2290 	if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2291 		struct hammer2_recovery_elm *elm;
2292 
2293 		elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2294 		elm->chain = parent;
2295 		elm->sync_tid = sync_tid;
2296 		hammer2_chain_ref(parent);
2297 		TAILQ_INSERT_TAIL(&info->list, elm, entry);
2298 		/* unlocked by caller */
2299 
2300 		return(0);
2301 	}
2302 
2303 
2304 	/*
2305 	 * Recursive scan of the last flushed transaction only.  We are
2306 	 * doing this without pmp assignments so don't leave the chains
2307 	 * hanging around after we are done with them.
2308 	 *
2309 	 * error	Cumulative error this level only
2310 	 * rup_error	Cumulative error for recursion
2311 	 * tmp_error	Specific non-cumulative recursion error
2312 	 */
2313 	chain = NULL;
2314 	first = 1;
2315 	rup_error = 0;
2316 	error = 0;
2317 
2318 	for (;;) {
2319 		error |= hammer2_chain_scan(parent, &chain, &bref,
2320 					    &first,
2321 					    HAMMER2_LOOKUP_NODATA);
2322 
2323 		/*
2324 		 * Problem during scan or EOF
2325 		 */
2326 		if (error)
2327 			break;
2328 
2329 		/*
2330 		 * If this is a leaf
2331 		 */
2332 		if (chain == NULL) {
2333 			if (bref.mirror_tid > sync_tid) {
2334 				hammer2_freemap_adjust(hmp, &bref,
2335 						     HAMMER2_FREEMAP_DORECOVER);
2336 			}
2337 			continue;
2338 		}
2339 
2340 		/*
2341 		 * This may or may not be a recursive node.
2342 		 */
2343 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2344 		if (bref.mirror_tid > sync_tid) {
2345 			++info->depth;
2346 			tmp_error = hammer2_recovery_scan(hmp, chain,
2347 							   info, sync_tid);
2348 			--info->depth;
2349 		} else {
2350 			tmp_error = 0;
2351 		}
2352 
2353 		/*
2354 		 * Flush the recovery at the PFS boundary to stage it for
2355 		 * the final flush of the super-root topology.
2356 		 */
2357 		if (tmp_error == 0 &&
2358 		    (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
2359 		    (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
2360 			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2361 					     HAMMER2_FLUSH_ALL);
2362 		}
2363 		rup_error |= tmp_error;
2364 	}
2365 	return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
2366 }
2367 
2368 /*
2369  * This fixes up an error introduced in earlier H2 implementations where
2370  * moving a PFS inode into an indirect block wound up causing the
2371  * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared.
2372  */
2373 static
2374 int
2375 hammer2_fixup_pfses(hammer2_dev_t *hmp)
2376 {
2377 	const hammer2_inode_data_t *ripdata;
2378 	hammer2_chain_t *parent;
2379 	hammer2_chain_t *chain;
2380 	hammer2_key_t key_next;
2381 	hammer2_pfs_t *spmp;
2382 	int error;
2383 
2384 	error = 0;
2385 
2386 	/*
2387 	 * Lookup mount point under the media-localized super-root.
2388 	 *
2389 	 * cluster->pmp will incorrectly point to spmp and must be fixed
2390 	 * up later on.
2391 	 */
2392 	spmp = hmp->spmp;
2393 	hammer2_inode_lock(spmp->iroot, 0);
2394 	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
2395 	chain = hammer2_chain_lookup(&parent, &key_next,
2396 					 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
2397 					 &error, 0);
2398 	while (chain) {
2399 		if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
2400 			/* only PFS inode chains are of interest here */
2401 		} else if (chain->error) {
2402 			kprintf("I/O error scanning PFS labels\n");
2403 			error |= chain->error;
2404 		} else if ((chain->bref.flags &
2405 			    HAMMER2_BREF_FLAG_PFSROOT) == 0) {
2406 			int error2;
2407 
2408 			ripdata = &chain->data->ipdata;
2409 			hammer2_trans_init(hmp->spmp, 0);
2410 			error2 = hammer2_chain_modify(chain,
2411 						      chain->bref.modify_tid,
2412 						      0, 0);
2413 			if (error2 == 0) {
2414 				kprintf("hammer2: Correct mis-flagged PFS %s\n",
2415 					ripdata->filename);
2416 				chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2417 			} else {
2418 				error |= error2;
2419 			}
2420 			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2421 					     HAMMER2_FLUSH_ALL);
2422 			hammer2_trans_done(hmp->spmp, 0);
2423 		}
2424 		chain = hammer2_chain_next(&parent, chain, &key_next,
2425 					   key_next, HAMMER2_KEY_MAX,
2426 					   &error, 0);
2427 	}
2428 	if (parent) {
2429 		hammer2_chain_unlock(parent);
2430 		hammer2_chain_drop(parent);
2431 	}
2432 	hammer2_inode_unlock(spmp->iroot);
2433 
2434 	return error;
2435 }
2436 
2437 /*
2438  * Sync a mount point; this is called periodically on a per-mount basis from
2439  * the filesystem syncer, and whenever a user issues a sync.
2440  */
2441 int
2442 hammer2_vfs_sync(struct mount *mp, int waitfor)
2443 {
2444 	int error;
2445 
2446 	error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor);
2447 
2448 	return error;
2449 }
2450 
2451 /*
2452  * Because frontend operations lock vnodes before we get a chance to
2453  * lock the related inode, we can't just acquire a vnode lock without
2454  * risking a deadlock.  The frontend may be holding a vnode lock while
2455  * also blocked on our SYNCQ flag while trying to get the inode lock.
2456  *
2457  * To deal with this we check the state of the vnode lock after
2458  * locking the inode and perform a work-around when necessary.
2459  */
2460 int
2461 hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
2462 {
2463 	struct mount *mp;
2464 	/*hammer2_xop_flush_t *xop;*/
2465 	/*struct hammer2_sync_info info;*/
2466 	hammer2_inode_t *ip;
2467 	hammer2_depend_t *depend;
2468 	hammer2_depend_t *depend_next;
2469 	struct vnode *vp;
2470 	uint32_t pass2;
2471 	int error;
2472 	int wakecount;
2473 	int dorestart;
2474 
2475 	mp = pmp->mp;
2476 
2477 	/*
2478 	 * Move all inodes on sideq to syncq.  This will clear sideq.
2479 	 * This should represent all flushable inodes.  These inodes
2480 	 * will already have refs due to being on syncq or sideq.  We
2481 	 * must do this all at once with the spinlock held to ensure that
2482 	 * all inode dependencies are part of the same flush.
2483 	 *
2484 	 * We should be able to do this asynchronously from frontend
2485 	 * operations because we will be locking the inodes later on
2486 	 * to actually flush them, and that will partition any frontend
2487 	 * op using the same inode.  Either it has already locked the
2488 	 * inode and we will block, or it has not yet locked the inode
2489 	 * and it will block until we are finished flushing that inode.
2490 	 *
2491 	 * When restarting, only move the inodes flagged as PASS2 from
2492 	 * SIDEQ to SYNCQ.  PASS2 propagation by inode_lock4() and
2493 	 * inode_depend() are atomic with the spin-lock.
2494 	 */
2495 	hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
2496 #ifdef HAMMER2_DEBUG_SYNC
2497 	kprintf("FILESYSTEM SYNC BOUNDARY\n");
2498 #endif
2499 	dorestart = 0;
2500 
2501 	/*
2502 	 * Move inodes from depq to syncq, releasing the related
2503 	 * depend structures.
2504 	 */
2505 restart:
2506 #ifdef HAMMER2_DEBUG_SYNC
2507 	kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart);
2508 #endif
2509 	hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/);
2510 	hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN);
2511 
2512 	/*
2513 	 * Move inodes from depq to syncq.  When restarting, only depq's
2514 	 * marked pass2 are moved.
2515 	 */
2516 	hammer2_spin_ex(&pmp->list_spin);
2517 	depend_next = TAILQ_FIRST(&pmp->depq);
2518 	wakecount = 0;
2519 
2520 	while ((depend = depend_next) != NULL) {
2521 		depend_next = TAILQ_NEXT(depend, entry);
2522 		if (dorestart && depend->pass2 == 0)
2523 			continue;
2524 		TAILQ_FOREACH(ip, &depend->sideq, entry) {
2525 			KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
2526 			atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
2527 			atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ);
2528 			ip->depend = NULL;
2529 		}
2530 
2531 		/*
2532 		 * NOTE: pmp->sideq_count includes both sideq and syncq
2533 		 */
2534 		TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry);
2535 
2536 		depend->count = 0;
2537 		depend->pass2 = 0;
2538 		TAILQ_REMOVE(&pmp->depq, depend, entry);
2539 	}
2540 
2541 	hammer2_spin_unex(&pmp->list_spin);
2542 	hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/
2543 				      HAMMER2_TRANS_WAITING);
2544 	dorestart = 0;
2545 
2546 	/*
2547 	 * sideq_count may have dropped enough to allow us to unstall
2548 	 * the frontend.
2549 	 */
2550 	hammer2_pfs_memory_wakeup(pmp, 0);
2551 
2552 	/*
2553 	 * Now run through all inodes on syncq.
2554 	 *
2555 	 * Flush transactions only interlock with other flush transactions.
2556 	 * Any conflicting frontend operations will block on the inode, but
2557 	 * may hold a vnode lock while doing so.
2558 	 */
2559 	hammer2_spin_ex(&pmp->list_spin);
2560 	while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
2561 		/*
2562 		 * Remove the inode from the SYNCQ, transfer the syncq ref
2563 		 * to us.  We must clear SYNCQ to allow any potential
2564 		 * front-end deadlock to proceed.  We must set PASS2 so
2565 		 * the dependency code knows what to do.
2566 		 */
2567 		pass2 = ip->flags;
2568 		cpu_ccfence();
2569 		if (atomic_cmpset_int(&ip->flags,
2570 			      pass2,
2571 			      (pass2 & ~(HAMMER2_INODE_SYNCQ |
2572 					 HAMMER2_INODE_SYNCQ_WAKEUP)) |
2573 			      HAMMER2_INODE_SYNCQ_PASS2) == 0) {
2574 			continue;
2575 		}
2576 		TAILQ_REMOVE(&pmp->syncq, ip, entry);
2577 		--pmp->sideq_count;
2578 		hammer2_spin_unex(&pmp->list_spin);
2579 
2580 		/*
2581 		 * Tickle anyone waiting on ip->flags or the hysteresis
2582 		 * on the dirty inode count.
2583 		 */
2584 		if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
2585 			wakeup(&ip->flags);
2586 		if (++wakecount >= hammer2_limit_dirty_inodes / 20 + 1) {
2587 			wakecount = 0;
2588 			hammer2_pfs_memory_wakeup(pmp, 0);
2589 		}
2590 
2591 		/*
2592 		 * Relock the inode, and we inherit a ref from the above.
2593 		 * We will check for a race after we acquire the vnode.
2594 		 */
2595 		hammer2_mtx_ex(&ip->lock);
2596 
2597 		/*
2598 		 * We need the vp in order to vfsync() dirty buffers, so if
2599 		 * one isn't attached we can skip it.
2600 		 *
2601 		 * Ordering the inode lock and then the vnode lock has the
2602 		 * potential to deadlock.  If we had left SYNCQ set that could
2603 		 * also deadlock us against the frontend even if we don't hold
2604 		 * any locks, but the latter is not a problem now since we
2605 		 * cleared it.  igetv will temporarily release the inode lock
2606 		 * in a safe manner to work-around the deadlock.
2607 		 *
2608 		 * Unfortunately it is still possible to deadlock when the
2609 		 * frontend obtains multiple inode locks, because all the
2610 		 * related vnodes are already locked (nor can the vnode locks
2611 		 * be released and reacquired without messing up RECLAIM and
2612 		 * INACTIVE sequencing).
2613 		 *
2614 		 * The solution for now is to move the vp back onto SIDEQ
2615 		 * and set dorestart, which will restart the flush after we
2616 		 * exhaust the current SYNCQ.  Note that additional
2617 		 * dependencies may build up, so we definitely need to move
2618 		 * the whole SIDEQ back to SYNCQ when we restart.
2619 		 */
2620 		vp = ip->vp;
2621 		if (vp) {
2622 			if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
2623 				/*
2624 				 * Failed to get the vnode, requeue the inode
2625 				 * (PASS2 is already set so it will be found
2626 				 * again on the restart).
2627 				 *
2628 				 * Then unlock, possibly sleep, and retry
2629 				 * later.  We sleep if PASS2 was *previously*
2630 				 * set, before we set it again above.
2631 				 */
2632 				vp = NULL;
2633 				dorestart = 1;
2634 #ifdef HAMMER2_DEBUG_SYNC
2635 				kprintf("inum %ld (sync delayed by vnode)\n",
2636 					(long)ip->meta.inum);
2637 #endif
2638 				hammer2_inode_delayed_sideq(ip);
2639 
2640 				hammer2_mtx_unlock(&ip->lock);
2641 				hammer2_inode_drop(ip);
2642 
2643 				if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
2644 					tsleep(&dorestart, 0, "h2syndel", 2);
2645 				}
2646 				hammer2_spin_ex(&pmp->list_spin);
2647 				continue;
2648 			}
2649 		} else {
2650 			vp = NULL;
2651 		}
2652 
2653 		/*
2654 		 * If the inode wound up on a SIDEQ again it will already be
2655 		 * prepped for another PASS2.  In this situation if we flush
2656 		 * it now we will just wind up flushing it again in the same
2657 		 * syncer run, so we might as well not flush it now.
2658 		 */
2659 		if (ip->flags & HAMMER2_INODE_SIDEQ) {
2660 			hammer2_mtx_unlock(&ip->lock);
2661 			hammer2_inode_drop(ip);
2662 			if (vp)
2663 				vput(vp);
2664 			dorestart = 1;
2665 			hammer2_spin_ex(&pmp->list_spin);
2666 			continue;
2667 		}
2668 
2669 		/*
2670 		 * Ok we have the inode exclusively locked and if vp is
2671 		 * not NULL that will also be exclusively locked.  Do the
2672 		 * meat of the flush.
2673 		 *
2674 		 * vp token needed for v_rbdirty_tree check / vclrisdirty
2675 		 * sequencing.  Though we hold the vnode exclusively so
2676 		 * we shouldn't need to hold the token also in this case.
2677 		 */
2678 		if (vp) {
2679 			vfsync(vp, MNT_WAIT, 1, NULL, NULL);
2680 			bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */
2681 		}
2682 
2683 		/*
2684 		 * If the inode has not yet been inserted into the tree
2685 		 * we must do so.  Then sync and flush it.  The flush should
2686 		 * update the parent.
2687 		 */
2688 		if (ip->flags & HAMMER2_INODE_DELETING) {
2689 #ifdef HAMMER2_DEBUG_SYNC
2690 			kprintf("inum %ld destroy\n", (long)ip->meta.inum);
2691 #endif
2692 			hammer2_inode_chain_des(ip);
2693 			atomic_add_long(&hammer2_iod_inode_deletes, 1);
2694 		} else if (ip->flags & HAMMER2_INODE_CREATING) {
2695 #ifdef HAMMER2_DEBUG_SYNC
2696 			kprintf("inum %ld insert\n", (long)ip->meta.inum);
2697 #endif
2698 			hammer2_inode_chain_ins(ip);
2699 			atomic_add_long(&hammer2_iod_inode_creates, 1);
2700 		}
2701 #ifdef HAMMER2_DEBUG_SYNC
2702 		kprintf("inum %ld chain-sync\n", (long)ip->meta.inum);
2703 #endif
2704 
2705 		/*
2706 		 * Because I kinda messed up the design and index the inodes
2707 		 * under the root inode, alongside the directory entries,
2708 		 * we can't flush the inode index under the iroot until the
2709 		 * end.  If we do it now we might miss effects created by
2710 		 * other inodes on the SYNCQ.
2711 		 *
2712 		 * Do a normal (non-FSSYNC) flush instead, which allows the
2713 		 * vnode code to work the same.  We don't want to force iroot
2714 		 * back onto the SIDEQ, and we also don't want the flush code
2715 		 * to update pfs_iroot_blocksets until the final flush later.
2716 		 *
2717 		 * XXX at the moment this will likely result in a double-flush
2718 		 * of the iroot chain.
2719 		 */
2720 		hammer2_inode_chain_sync(ip);
2721 		if (ip == pmp->iroot) {
2722 			hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
2723 		} else {
2724 			hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
2725 						      HAMMER2_XOP_FSSYNC);
2726 		}
2727 		if (vp) {
2728 			lwkt_gettoken(&vp->v_token);
2729 			if ((ip->flags & (HAMMER2_INODE_MODIFIED |
2730 					  HAMMER2_INODE_RESIZED |
2731 					  HAMMER2_INODE_DIRTYDATA)) == 0 &&
2732 			    RB_EMPTY(&vp->v_rbdirty_tree) &&
2733 			    !bio_track_active(&vp->v_track_write)) {
2734 				vclrisdirty(vp);
2735 			} else {
2736 				hammer2_inode_delayed_sideq(ip);
2737 			}
2738 			lwkt_reltoken(&vp->v_token);
2739 			vput(vp);
2740 			vp = NULL;	/* safety */
2741 		}
2742 		atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2);
2743 		hammer2_inode_unlock(ip);	/* unlock+drop */
2744 		/* ip pointer invalid */
2745 
2746 		/*
2747 		 * If the inode got dirtied after we dropped our locks,
2748 		 * it will have already been moved back to the SIDEQ.
2749 		 */
2750 		hammer2_spin_ex(&pmp->list_spin);
2751 	}
2752 	hammer2_spin_unex(&pmp->list_spin);
2753 	hammer2_pfs_memory_wakeup(pmp, 0);
2754 
2755 	if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) {
2756 #ifdef HAMMER2_DEBUG_SYNC
2757 		kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n");
2758 		/*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/
2759 #endif
2760 		dorestart = 1;
2761 		goto restart;
2762 	}
2763 #ifdef HAMMER2_DEBUG_SYNC
2764 	kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n");
2765 	/*tsleep(&dorestart, 0, "h2STG2", hz*20);*/
2766 #endif
2767 
2768 	/*
2769 	 * We have to flush the PFS root last, even if it does not appear to
2770 	 * be dirty, because all the inodes in the PFS are indexed under it.
2771 	 * The normal flushing of iroot above would only occur if directory
2772 	 * entries under the root were changed.
2773 	 *
2774 	 * Specifying VOLHDR will cause an additional flush of hmp->spmp
2775 	 * for the media making up the cluster.
2776 	 */
2777 	if ((ip = pmp->iroot) != NULL) {
2778 		hammer2_inode_ref(ip);
2779 		hammer2_mtx_ex(&ip->lock);
2780 		hammer2_inode_chain_sync(ip);
2781 		hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
2782 					      HAMMER2_XOP_FSSYNC |
2783 					      HAMMER2_XOP_VOLHDR);
2784 		hammer2_inode_unlock(ip);	/* unlock+drop */
2785 	}
2786 #ifdef HAMMER2_DEBUG_SYNC
2787 	kprintf("FILESYSTEM SYNC STAGE 2 DONE\n");
2788 #endif
2789 
2790 	/*
2791 	 * device bioq sync
2792 	 */
2793 	hammer2_bioq_sync(pmp);
2794 
2795 #if 0
2796 	info.pass = 1;
2797 	info.waitfor = MNT_WAIT;
2798 	vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2799 
2800 	info.pass = 2;
2801 	info.waitfor = MNT_WAIT;
2802 	vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2803 #endif
2804 #if 0
2805 	/*
2806 	 * Generally speaking we now want to flush the media topology from
2807 	 * the iroot through to the inodes.  The flush stops at any inode
2808 	 * boundary, which allows the frontend to continue running concurrent
2809 	 * modifying operations on inodes (including kernel flushes of
2810 	 * buffers) without interfering with the main sync.
2811 	 *
2812 	 * Use the XOP interface to concurrently flush all nodes to
2813 	 * synchronize the PFSROOT subtopology to the media.  A standard
2814 	 * end-of-scan ENOENT error indicates cluster sufficiency.
2815 	 *
2816 	 * Note that this flush will not be visible on crash recovery until
2817 	 * we flush the super-root topology in the next loop.
2818 	 *
2819 	 * XXX For now wait for all flushes to complete.
2820 	 */
2821 	if (mp && (ip = pmp->iroot) != NULL) {
2822 		/*
2823 		 * If unmounting try to flush everything including any
2824 		 * sub-trees under inodes, just in case there is dangling
2825 		 * modified data, as a safety.  Otherwise just flush up to
2826 		 * the inodes in this stage.
2827 		 */
2828 		kprintf("MP & IROOT\n");
2829 #ifdef HAMMER2_DEBUG_SYNC
2830 		kprintf("FILESYSTEM SYNC STAGE 3 IROOT BEGIN\n");
2831 #endif
2832 		if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
2833 			xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
2834 						    HAMMER2_XOP_VOLHDR |
2835 						    HAMMER2_XOP_FSSYNC |
2836 						    HAMMER2_XOP_INODE_STOP);
2837 		} else {
2838 			xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
2839 						    HAMMER2_XOP_INODE_STOP |
2840 						    HAMMER2_XOP_VOLHDR |
2841 						    HAMMER2_XOP_FSSYNC |
2842 						    HAMMER2_XOP_INODE_STOP);
2843 		}
2844 		hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc);
2845 		error = hammer2_xop_collect(&xop->head,
2846 					    HAMMER2_XOP_COLLECT_WAITALL);
2847 		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2848 #ifdef HAMMER2_DEBUG_SYNC
2849 		kprintf("FILESYSTEM SYNC STAGE 3 IROOT END\n");
2850 #endif
2851 		if (error == HAMMER2_ERROR_ENOENT)
2852 			error = 0;
2853 		else
2854 			error = hammer2_error_to_errno(error);
2855 	} else {
2856 		error = 0;
2857 	}
2858 #endif
2859 	error = 0;	/* XXX */
2860 	hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH);
2861 
2862 	return (error);
2863 }
2864 
2865 static
2866 int
2867 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2868 {
2869 	hammer2_inode_t *ip;
2870 
2871 	KKASSERT(MAXFIDSZ >= 16);
2872 	ip = VTOI(vp);
2873 	fhp->fid_len = offsetof(struct fid, fid_data[16]);
2874 	fhp->fid_ext = 0;
2875 	((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum;
2876 	((hammer2_tid_t *)fhp->fid_data)[1] = 0;
2877 
2878 	return 0;
2879 }
2880 
2881 static
2882 int
2883 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2884 	       struct fid *fhp, struct vnode **vpp)
2885 {
2886 	hammer2_pfs_t *pmp;
2887 	hammer2_tid_t inum;
2888 	int error;
2889 
2890 	pmp = MPTOPMP(mp);
2891 	inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK;
2892 	if (vpp) {
2893 		if (inum == 1)
2894 			error = hammer2_vfs_root(mp, vpp);
2895 		else
2896 			error = hammer2_vfs_vget(mp, NULL, inum, vpp);
2897 	} else {
2898 		error = 0;
2899 	}
2900 	if (error)
2901 		kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error);
2902 	return error;
2903 }
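
/*
 * Sketch of the NFS file handle produced by hammer2_vfs_vptofh() and
 * consumed by hammer2_vfs_fhtovp() above; the layout follows directly
 * from the two functions:
 *
 *	fid_len         = offsetof(struct fid, fid_data[16])
 *	fid_data[0..7]  = ip->meta.inum	(64-bit inode number)
 *	fid_data[8..15] = 0
 *
 * fhtovp() recovers the inode number by masking the first 64-bit word
 * with HAMMER2_DIRHASH_USERMSK and treats inum 1 as the PFS root,
 * resolving it via hammer2_vfs_root() rather than hammer2_vfs_vget().
 */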
2904 
2905 static
2906 int
2907 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2908 		 int *exflagsp, struct ucred **credanonp)
2909 {
2910 	hammer2_pfs_t *pmp;
2911 	struct netcred *np;
2912 	int error;
2913 
2914 	pmp = MPTOPMP(mp);
2915 	np = vfs_export_lookup(mp, &pmp->export, nam);
2916 	if (np) {
2917 		*exflagsp = np->netc_exflags;
2918 		*credanonp = &np->netc_anon;
2919 		error = 0;
2920 	} else {
2921 		error = EACCES;
2922 	}
2923 	return error;
2924 }
2925 
2926 /*
2927  * Support code for hammer2_vfs_mount().  Read, verify, and install the volume
2928  * header into the HMP
2929  *
2930  * XXX read four volhdrs and use the one with the highest TID whose CRC
2931  *     matches.
2932  *
2933  * XXX check iCRCs.
2934  *
2935  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
2936  *     nonexistent locations.
2937  *
2938  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2939  */
2940 static
2941 int
2942 hammer2_install_volume_header(hammer2_dev_t *hmp)
2943 {
2944 	hammer2_volume_data_t *vd;
2945 	struct buf *bp;
2946 	hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2947 	int error_reported;
2948 	int error;
2949 	int valid;
2950 	int i;
2951 
2952 	error_reported = 0;
2953 	error = 0;
2954 	valid = 0;
2955 	bp = NULL;
2956 
2957 	/*
2958 	 * There are up to 4 copies of the volume header (syncs iterate
2959 	 * between them so there is no single master).  We don't trust the
2960 	 * volu_size field so we don't know precisely how large the filesystem
2961 	 * is, so depend on the OS to return an error if we go beyond the
2962 	 * block device's EOF.
2963 	 */
2964 	for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2965 		error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2966 			      HAMMER2_VOLUME_BYTES, &bp);
2967 		if (error) {
2968 			brelse(bp);
2969 			bp = NULL;
2970 			continue;
2971 		}
2972 
2973 		vd = (struct hammer2_volume_data *) bp->b_data;
2974 		if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2975 		    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2976 			brelse(bp);
2977 			bp = NULL;
2978 			continue;
2979 		}
2980 
2981 		if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2982 			/* XXX: Reversed-endianness filesystem */
2983 			kprintf("hammer2: reverse-endian filesystem detected\n");
2984 			brelse(bp);
2985 			bp = NULL;
2986 			continue;
2987 		}
2988 
2989 		crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2990 		crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2991 				      HAMMER2_VOLUME_ICRC0_SIZE);
2992 		bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2993 		bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2994 				       HAMMER2_VOLUME_ICRC1_SIZE);
2995 		if ((crc0 != crc) || (bcrc0 != bcrc)) {
2996 			kprintf("hammer2 volume header crc "
2997 				"mismatch copy #%d %08x/%08x\n",
2998 				i, crc0, crc);
2999 			error_reported = 1;
3000 			brelse(bp);
3001 			bp = NULL;
3002 			continue;
3003 		}
3004 		if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
3005 			valid = 1;
3006 			hmp->voldata = *vd;
3007 			hmp->volhdrno = i;
3008 		}
3009 		brelse(bp);
3010 		bp = NULL;
3011 	}
3012 	if (valid) {
3013 		hmp->volsync = hmp->voldata;
3014 		hmp->free_reserved = hmp->voldata.allocator_size / 20;
3015 		error = 0;
3016 		if (error_reported || bootverbose || 1) { /* 1/DEBUG */
3017 			kprintf("hammer2: using volume header #%d\n",
3018 				hmp->volhdrno);
3019 		}
3020 	} else {
3021 		error = EINVAL;
3022 		kprintf("hammer2: no valid volume headers found!\n");
3023 	}
3024 	return (error);
3025 }
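
/*
 * Selection example for the loop above (which copies are valid is
 * hypothetical).  Copy #i is read from byte offset i * HAMMER2_ZONE_BYTES64.
 * If copies #0 and #2 pass both section CRCs while #1 is damaged and #3
 * lies beyond the device EOF, the copy with the larger mirror_tid of
 * {#0, #2} is installed into hmp->voldata and hmp->volhdrno records its
 * index.
 */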
3026 
3027 /*
3028  * This handles hysteresis on regular file flushes.  Because the BIOs are
3029  * routed to a thread it is possible for an excessive number to build up
3030  * and cause long front-end stalls long before the runningbuffspace limit
3031  * is hit, so we implement hammer2_flush_pipe to control the
3032  * hysteresis.
3033  *
3034  * This is a particular problem when compression is used.
3035  */
3036 void
3037 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
3038 {
3039 	atomic_add_int(&pmp->count_lwinprog, 1);
3040 }
3041 
3042 void
3043 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
3044 {
3045 	int lwinprog;
3046 
3047 	lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
3048 	if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
3049 	    (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
3050 		atomic_clear_int(&pmp->count_lwinprog,
3051 				 HAMMER2_LWINPROG_WAITING);
3052 		wakeup(&pmp->count_lwinprog);
3053 	}
3054 	if ((lwinprog & HAMMER2_LWINPROG_WAITING0) &&
3055 	    (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) {
3056 		atomic_clear_int(&pmp->count_lwinprog,
3057 				 HAMMER2_LWINPROG_WAITING0);
3058 		wakeup(&pmp->count_lwinprog);
3059 	}
3060 }
3061 
3062 void
3063 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
3064 {
3065 	int lwinprog;
3066 	int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING :
3067 				    HAMMER2_LWINPROG_WAITING0;
3068 
3069 	for (;;) {
3070 		lwinprog = pmp->count_lwinprog;
3071 		cpu_ccfence();
3072 		if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
3073 			break;
3074 		tsleep_interlock(&pmp->count_lwinprog, 0);
3075 		atomic_set_int(&pmp->count_lwinprog, lwflag);
3076 		lwinprog = pmp->count_lwinprog;
3077 		if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
3078 			break;
3079 		tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
3080 	}
3081 }
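
/*
 * Hysteresis example using the default hammer2_flush_pipe of 100 (the
 * caller shown is hypothetical).  A logical write path invoking
 * hammer2_lwinprog_wait(pmp, 100) blocks once more than 100 writes are
 * in progress and is woken by hammer2_lwinprog_drop() when the count
 * falls to 66 (2/3 of the pipe).  A caller passing flush_pipe == 0 uses
 * the WAITING0 flag instead and sleeps until the count drains to zero.
 */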
3082 
3083 /*
3084  * It is possible for an excessive number of dirty chains or dirty inodes
3085  * to build up.  When this occurs we start an asynchronous filesystem sync.
3086  * If the level continues to build up, we stall, waiting for it to drop,
3087  * with some hysteresis.
3088  *
3089  * This relies on the kernel calling hammer2_vfs_modifying() prior to
3090  * obtaining any vnode locks before making a modifying VOP call.
3091  */
3092 static int
3093 hammer2_vfs_modifying(struct mount *mp)
3094 {
3095 	if (mp->mnt_flag & MNT_RDONLY)
3096 		return EROFS;
3097 	hammer2_pfs_memory_wait(MPTOPMP(mp));
3098 
3099 	return 0;
3100 }
3101 
3102 /*
3103  * Initiate an asynchronous filesystem sync and, with hysteresis,
3104  * stall if the internal data structure count becomes too bloated.
3105  */
3106 void
3107 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
3108 {
3109 	uint32_t waiting;
3110 	int pcatch;
3111 	int error;
3112 
3113 	if (pmp == NULL || pmp->mp == NULL)
3114 		return;
3115 
3116 	for (;;) {
3117 		waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK;
3118 		cpu_ccfence();
3119 
3120 		/*
3121 		 * Start the syncer running at 1/2 the limit
3122 		 */
3123 		if (waiting > hammer2_limit_dirty_chains / 2 ||
3124 		    pmp->sideq_count > hammer2_limit_dirty_inodes / 2) {
3125 			trigger_syncer(pmp->mp);
3126 		}
3127 
3128 		/*
3129 		 * Stall at the limit waiting for the counts to drop.
3130 		 * This code will typically be woken up once the count
3131 		 * drops below 2/3 the limit, or in one second.
3132 		 */
3133 		if (waiting < hammer2_limit_dirty_chains &&
3134 		    pmp->sideq_count < hammer2_limit_dirty_inodes) {
3135 			break;
3136 		}
3137 
3138 		pcatch = curthread->td_proc ? PCATCH : 0;
3139 
3140 		tsleep_interlock(&pmp->inmem_dirty_chains, pcatch);
3141 		atomic_set_int(&pmp->inmem_dirty_chains,
3142 			       HAMMER2_DIRTYCHAIN_WAITING);
3143 		if (waiting < hammer2_limit_dirty_chains &&
3144 		    pmp->sideq_count < hammer2_limit_dirty_inodes) {
3145 			break;
3146 		}
3147 		trigger_syncer(pmp->mp);
3148 		error = tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED | pcatch,
3149 			       "h2memw", hz);
3150 		if (error == ERESTART)
3151 			break;
3152 	}
3153 }
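
/*
 * Threshold example for the loop above (the limit value is hypothetical).
 * With hammer2_limit_dirty_chains = 10000, a modifying frontend op kicks
 * the syncer once 5000 dirty chains have accumulated, stalls here at
 * 10000, and is woken by hammer2_pfs_memory_wakeup() once the count has
 * dropped to 6666 (2/3 of the limit).  The dirty inode count follows the
 * same pattern against hammer2_limit_dirty_inodes.
 */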
3154 
3155 /*
3156  * Wake up any stalled frontend ops waiting, with hysteresis, using
3157  * 2/3 of the limit.
3158  */
3159 void
3160 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp, int count)
3161 {
3162 	uint32_t waiting;
3163 
3164 	if (pmp) {
3165 		waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, count);
3166 		/* don't need --waiting to test flag */
3167 
3168 		if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) &&
3169 		    (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <=
3170 		    hammer2_limit_dirty_chains * 2 / 3 &&
3171 		    pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) {
3172 			atomic_clear_int(&pmp->inmem_dirty_chains,
3173 					 HAMMER2_DIRTYCHAIN_WAITING);
3174 			wakeup(&pmp->inmem_dirty_chains);
3175 		}
3176 	}
3177 }
3178 
3179 void
3180 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
3181 {
3182 	if (pmp) {
3183 		atomic_add_int(&pmp->inmem_dirty_chains, 1);
3184 	}
3185 }
3186 
3187 /*
3188  * Returns 0 if the filesystem has tons of free space
3189  * Returns 1 if the filesystem has less than 10% remaining
3190  * Returns 2 if the filesystem has less than 5%/2.5% (user/root) remaining.
3191  */
3192 int
3193 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred)
3194 {
3195 	hammer2_pfs_t *pmp;
3196 	hammer2_dev_t *hmp;
3197 	hammer2_off_t free_reserved;
3198 	hammer2_off_t free_nominal;
3199 	int i;
3200 
3201 	pmp = ip->pmp;
3202 
3203 	if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) {
3204 		free_reserved = HAMMER2_SEGSIZE;
3205 		free_nominal = 0x7FFFFFFFFFFFFFFFLLU;
3206 		for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
3207 			hmp = pmp->pfs_hmps[i];
3208 			if (hmp == NULL)
3209 				continue;
3210 			if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER &&
3211 			    pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER)
3212 				continue;
3213 
3214 			if (free_nominal > hmp->voldata.allocator_free)
3215 				free_nominal = hmp->voldata.allocator_free;
3216 			if (free_reserved < hmp->free_reserved)
3217 				free_reserved = hmp->free_reserved;
3218 		}
3219 
3220 		/*
3221 		 * SMP races ok
3222 		 */
3223 		pmp->free_reserved = free_reserved;
3224 		pmp->free_nominal = free_nominal;
3225 		pmp->free_ticks = ticks;
3226 	} else {
3227 		free_reserved = pmp->free_reserved;
3228 		free_nominal = pmp->free_nominal;
3229 	}
3230 	if (cred && cred->cr_uid != 0) {
3231 		if ((int64_t)(free_nominal - bytes) <
3232 		    (int64_t)free_reserved) {
3233 			return 2;
3234 		}
3235 	} else {
3236 		if ((int64_t)(free_nominal - bytes) <
3237 		    (int64_t)free_reserved / 2) {
3238 			return 2;
3239 		}
3240 	}
3241 	if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2)
3242 		return 1;
3243 	return 0;
3244 }
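
/*
 * Worked example of the thresholds above (sizes are hypothetical).  With
 * a single master whose allocator_free is 6 GiB and whose free_reserved
 * is 5 GiB (5% of a 100 GiB volume), a 2 GiB write by a regular user
 * projects 4 GiB remaining, which is below free_reserved, so the function
 * returns 2; root only hits that limit below 2.5 GiB (free_reserved / 2).
 * Otherwise either caller gets the return value of 1 once the projected
 * free space drops below 10 GiB (2 * free_reserved).
 */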
3245 
3246 /*
3247  * Debugging
3248  */
3249 void
3250 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
3251 		   u_int flags)
3252 {
3253 	hammer2_chain_t *scan;
3254 	hammer2_chain_t *parent;
3255 
3256 	--*countp;
3257 	if (*countp == 0) {
3258 		kprintf("%*.*s...\n", tab, tab, "");
3259 		return;
3260 	}
3261 	if (*countp < 0)
3262 		return;
3263 	kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
3264 		tab, tab, "", pfx,
3265 		chain, chain->bref.type,
3266 		chain->bref.key, chain->bref.keybits,
3267 		chain->bref.mirror_tid);
3268 
3269 	kprintf("%*.*s      [%08x] (%s) refs=%d",
3270 		tab, tab, "",
3271 		chain->flags,
3272 		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3273 		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3274 		chain->refs);
3275 
3276 	parent = chain->parent;
3277 	if (parent)
3278 		kprintf("\n%*.*s      p=%p [pflags %08x prefs %d",
3279 			tab, tab, "",
3280 			parent, parent->flags, parent->refs);
3281 	if (RB_EMPTY(&chain->core.rbtree)) {
3282 		kprintf("\n");
3283 	} else {
3284 		kprintf(" {\n");
3285 		RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
3286 			if ((scan->flags & flags) || flags == (u_int)-1) {
3287 				hammer2_dump_chain(scan, tab + 4, countp, 'a',
3288 						   flags);
3289 			}
3290 		}
3291 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3292 			kprintf("%*.*s}(%s)\n", tab, tab, "",
3293 				chain->data->ipdata.filename);
3294 		else
3295 			kprintf("%*.*s}\n", tab, tab, "");
3296 	}
3297 }
3298