xref: /dflybsd-src/sys/vfs/hammer2/hammer2_vfsops.c (revision 10cf3bfcde2ee9c50d77a153397b93d8026b03e1)
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48 
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54 
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57 
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61 #include "hammer2_lz4.h"
62 
63 #include "zlib/hammer2_zlib.h"
64 
65 #define REPORT_REFS_ERRORS 1	/* XXX remove me */
66 
67 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
68 
69 struct hammer2_sync_info {
70 	hammer2_trans_t trans;
71 	int error;
72 	int waitfor;
73 };
74 
75 TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
76 TAILQ_HEAD(hammer2_pfslist, hammer2_pfs);
77 static struct hammer2_mntlist hammer2_mntlist;
78 static struct hammer2_pfslist hammer2_pfslist;
79 static struct lock hammer2_mntlk;
80 
81 int hammer2_debug;
82 int hammer2_cluster_enable = 1;
83 int hammer2_hardlink_enable = 1;
84 int hammer2_flush_pipe = 100;
85 int hammer2_synchronous_flush = 1;
86 int hammer2_dio_count;
87 long hammer2_limit_dirty_chains;
88 long hammer2_iod_file_read;
89 long hammer2_iod_meta_read;
90 long hammer2_iod_indr_read;
91 long hammer2_iod_fmap_read;
92 long hammer2_iod_volu_read;
93 long hammer2_iod_file_write;
94 long hammer2_iod_meta_write;
95 long hammer2_iod_indr_write;
96 long hammer2_iod_fmap_write;
97 long hammer2_iod_volu_write;
98 long hammer2_ioa_file_read;
99 long hammer2_ioa_meta_read;
100 long hammer2_ioa_indr_read;
101 long hammer2_ioa_fmap_read;
102 long hammer2_ioa_volu_read;
103 long hammer2_ioa_fmap_write;
104 long hammer2_ioa_file_write;
105 long hammer2_ioa_meta_write;
106 long hammer2_ioa_indr_write;
107 long hammer2_ioa_volu_write;
108 
109 MALLOC_DECLARE(C_BUFFER);
110 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");
111 
112 MALLOC_DECLARE(D_BUFFER);
113 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");
114 
115 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
116 
117 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
118 	   &hammer2_debug, 0, "");
119 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
120 	   &hammer2_cluster_enable, 0, "");
121 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
122 	   &hammer2_hardlink_enable, 0, "");
123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
124 	   &hammer2_flush_pipe, 0, "");
125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
126 	   &hammer2_synchronous_flush, 0, "");
127 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
128 	   &hammer2_limit_dirty_chains, 0, "");
129 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
130 	   &hammer2_dio_count, 0, "");
131 
132 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
133 	   &hammer2_iod_file_read, 0, "");
134 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
135 	   &hammer2_iod_meta_read, 0, "");
136 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
137 	   &hammer2_iod_indr_read, 0, "");
138 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
139 	   &hammer2_iod_fmap_read, 0, "");
140 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
141 	   &hammer2_iod_volu_read, 0, "");
142 
143 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
144 	   &hammer2_iod_file_write, 0, "");
145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
146 	   &hammer2_iod_meta_write, 0, "");
147 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
148 	   &hammer2_iod_indr_write, 0, "");
149 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
150 	   &hammer2_iod_fmap_write, 0, "");
151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
152 	   &hammer2_iod_volu_write, 0, "");
153 
154 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
155 	   &hammer2_ioa_file_read, 0, "");
156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
157 	   &hammer2_ioa_meta_read, 0, "");
158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
159 	   &hammer2_ioa_indr_read, 0, "");
160 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
161 	   &hammer2_ioa_fmap_read, 0, "");
162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
163 	   &hammer2_ioa_volu_read, 0, "");
164 
165 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
166 	   &hammer2_ioa_file_write, 0, "");
167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
168 	   &hammer2_ioa_meta_write, 0, "");
169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
170 	   &hammer2_ioa_indr_write, 0, "");
171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
172 	   &hammer2_ioa_fmap_write, 0, "");
173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
174 	   &hammer2_ioa_volu_write, 0, "");
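/*
 * All of the knobs and statistics above are published under the
 * vfs.hammer2 sysctl tree and can be inspected or adjusted at runtime
 * with sysctl(8), e.g. "sysctl vfs.hammer2.debug=1" or
 * "sysctl vfs.hammer2 | grep iod_" (example invocations only).
 */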
175 
176 static int hammer2_vfs_init(struct vfsconf *conf);
177 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
178 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
179 				struct ucred *cred);
180 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
181 				struct vnode *, struct ucred *);
182 static int hammer2_recovery(hammer2_dev_t *hmp);
183 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
184 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
185 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
186 				struct ucred *cred);
187 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
188 				struct ucred *cred);
189 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
190 				ino_t ino, struct vnode **vpp);
191 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
192 				struct fid *fhp, struct vnode **vpp);
193 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
194 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
195 				int *exflagsp, struct ucred **credanonp);
196 
197 static int hammer2_install_volume_header(hammer2_dev_t *hmp);
198 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
199 
200 static void hammer2_update_pmps(hammer2_dev_t *hmp);
201 static void hammer2_write_thread(void *arg);
202 
203 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
204 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
205 				hammer2_dev_t *hmp);
206 
207 /*
208  * Functions for compression in threads,
209  * from hammer2_vnops.c
210  */
211 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
212 				hammer2_inode_t *ip,
213 				const hammer2_inode_data_t *ripdata,
214 				hammer2_cluster_t *cparent,
215 				hammer2_key_t lbase, int ioflag, int pblksize,
216 				int *errorp);
217 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
218 				hammer2_inode_t *ip,
219 				const hammer2_inode_data_t *ripdata,
220 				hammer2_cluster_t *cparent,
221 				hammer2_key_t lbase, int ioflag,
222 				int pblksize, int *errorp,
223 				int comp_algo, int check_algo);
224 static void hammer2_zero_check_and_write(struct buf *bp,
225 				hammer2_trans_t *trans, hammer2_inode_t *ip,
226 				const hammer2_inode_data_t *ripdata,
227 				hammer2_cluster_t *cparent,
228 				hammer2_key_t lbase,
229 				int ioflag, int pblksize, int *errorp,
230 				int check_algo);
231 static int test_block_zeros(const char *buf, size_t bytes);
232 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
233 				hammer2_inode_t *ip,
234 				const hammer2_inode_data_t *ripdata,
235 				hammer2_cluster_t *cparent,
236 				hammer2_key_t lbase,
237 				int *errorp);
238 static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
239 				int ioflag, int pblksize, int *errorp,
240 				int check_algo);
241 
242 /*
243  * HAMMER2 vfs operations.
244  */
245 static struct vfsops hammer2_vfsops = {
246 	.vfs_init	= hammer2_vfs_init,
247 	.vfs_uninit	= hammer2_vfs_uninit,
248 	.vfs_sync	= hammer2_vfs_sync,
249 	.vfs_mount	= hammer2_vfs_mount,
250 	.vfs_unmount	= hammer2_vfs_unmount,
251 	.vfs_root 	= hammer2_vfs_root,
252 	.vfs_statfs	= hammer2_vfs_statfs,
253 	.vfs_statvfs	= hammer2_vfs_statvfs,
254 	.vfs_vget	= hammer2_vfs_vget,
255 	.vfs_vptofh	= hammer2_vfs_vptofh,
256 	.vfs_fhtovp	= hammer2_vfs_fhtovp,
257 	.vfs_checkexp	= hammer2_vfs_checkexp
258 };
259 
260 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
261 
262 VFS_SET(hammer2_vfsops, hammer2, 0);
263 MODULE_VERSION(hammer2, 1);
264 
265 static
266 int
267 hammer2_vfs_init(struct vfsconf *conf)
268 {
269 	static struct objcache_malloc_args margs_read;
270 	static struct objcache_malloc_args margs_write;
271 
272 	int error;
273 
274 	error = 0;
275 
276 	if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
277 		error = EINVAL;
278 	if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
279 		error = EINVAL;
280 	if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
281 		error = EINVAL;
282 
283 	if (error)
284 		kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
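	/*
	 * These are run-time checks; an equivalent compile-time sketch,
	 * assuming a CTASSERT-style macro is available in this kernel,
	 * would be:
	 *
	 *	CTASSERT(HAMMER2_BLOCKREF_BYTES ==
	 *		 sizeof(struct hammer2_blockref));
	 *	CTASSERT(HAMMER2_INODE_BYTES ==
	 *		 sizeof(struct hammer2_inode_data));
	 *	CTASSERT(HAMMER2_VOLUME_BYTES ==
	 *		 sizeof(struct hammer2_volume_data));
	 */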
285 
286 	margs_read.objsize = 65536;
287 	margs_read.mtype = D_BUFFER;
288 
289 	margs_write.objsize = 32768;
290 	margs_write.mtype = C_BUFFER;
291 
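	/*
	 * The read-side (decompression) buffers must hold a fully expanded
	 * 64KB physical block, while the write-side (compression) buffers
	 * only ever receive up to pblksize / 2 bytes of compressed output
	 * (see the KKASSERT in hammer2_compress_and_write()).
	 */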
292 	cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
293 				0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
294 				objcache_malloc_free, &margs_read);
295 	cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
296 				0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
297 				objcache_malloc_free, &margs_write);
298 
299 	lockinit(&hammer2_mntlk, "mntlk", 0, 0);
300 	TAILQ_INIT(&hammer2_mntlist);
301 	TAILQ_INIT(&hammer2_pfslist);
302 
303 	hammer2_limit_dirty_chains = desiredvnodes / 10;
304 
305 	hammer2_trans_manage_init();
306 
307 	return (error);
308 }
309 
310 static
311 int
312 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
313 {
314 	objcache_destroy(cache_buffer_read);
315 	objcache_destroy(cache_buffer_write);
316 	return 0;
317 }
318 
319 /*
320  * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
321  * mounts and the spmp structure for media (hmp) structures.
322  *
323  * pmp->modify_tid tracks new modify_tid transaction ids for front-end
324  * transactions.  Note that synchronization does not use this field.
325  * (typically frontend operations and synchronization cannot run on the
326  * same PFS node at the same time).
327  *
328  * XXX check locking
329  */
330 hammer2_pfs_t *
331 hammer2_pfsalloc(hammer2_cluster_t *cluster,
332 		 const hammer2_inode_data_t *ripdata,
333 		 hammer2_tid_t modify_tid)
334 {
335 	hammer2_chain_t *rchain;
336 	hammer2_pfs_t *pmp;
337 	int i;
338 	int j;
339 
340 	/*
341 	 * Locate or create the PFS based on the cluster id.  If ripdata
342 	 * is NULL this is a spmp which is unique and is always allocated.
343 	 */
344 	if (ripdata) {
345 		TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
346 			if (bcmp(&pmp->pfs_clid, &ripdata->pfs_clid,
347 				 sizeof(pmp->pfs_clid)) == 0) {
348 					break;
349 			}
350 		}
351 	} else {
352 		pmp = NULL;
353 	}
354 
355 	if (pmp == NULL) {
356 		pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
357 		kmalloc_create(&pmp->minode, "HAMMER2-inodes");
358 		kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
359 		lockinit(&pmp->lock, "pfslk", 0, 0);
360 		spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
361 		RB_INIT(&pmp->inum_tree);
362 		TAILQ_INIT(&pmp->unlinkq);
363 		spin_init(&pmp->list_spin, "hm2pfsalloc_list");
364 
365 		/*
366 		 * Save last media transaction id for flusher.
367 		 */
368 		pmp->modify_tid = modify_tid;
369 		if (ripdata) {
370 			pmp->inode_tid = ripdata->pfs_inum + 1;
371 			pmp->pfs_clid = ripdata->pfs_clid;
372 		}
373 		hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr");
374 		bioq_init(&pmp->wthread_bioq);
375 		TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
376 
377 		/*
 378 		 * The synchronization thread may start too early; make
 379 		 * sure it stays frozen until we are ready to let it go.
380 		 * XXX
381 		 */
382 		/*
383 		pmp->primary_thr.flags = HAMMER2_SYNCTHR_FROZEN |
384 					 HAMMER2_SYNCTHR_REMASTER;
385 		*/
386 	}
387 
388 	/*
389 	 * Create the PFS's root inode.
390 	 */
391 	if (pmp->iroot == NULL) {
392 		pmp->iroot = hammer2_inode_get(pmp, NULL, NULL);
393 		hammer2_inode_ref(pmp->iroot);
394 		hammer2_inode_unlock(pmp->iroot, NULL);
395 	}
396 
397 	/*
398 	 * Create a primary synchronizer thread for the PFS if necessary.
399 	 * Single-node masters (including snapshots) have nothing to
400 	 * synchronize and do not require this thread.
401 	 *
402 	 * Multi-node masters or any number of soft masters, slaves, copy,
403 	 * or other PFS types need the thread.
404 	 */
405 	if (cluster && ripdata &&
406 	    (ripdata->pfs_type != HAMMER2_PFSTYPE_MASTER ||
407 	     ripdata->pfs_nmasters > 1) &&
408 	    pmp->primary_thr.td == NULL) {
409 		hammer2_syncthr_create(&pmp->primary_thr, pmp,
410 				       hammer2_syncthr_primary);
411 	}
412 
413 	/*
414 	 * Update nmasters from any PFS which is part of the cluster.
415 	 * It is possible that this will result in a value which is too
416 	 * high.  MASTER PFSs are authoritative for pfs_nmasters and will
417 	 * override this value later on.
418 	 */
419 	if (ripdata && pmp->pfs_nmasters < ripdata->pfs_nmasters) {
420 		pmp->pfs_nmasters = ripdata->pfs_nmasters;
421 	}
422 
423 	/*
424 	 * When a cluster is passed in we must add the cluster's chains
425 	 * to the PFS's root inode and update pmp->pfs_types[].
426 	 *
427 	 * At the moment empty spots can develop due to removals or failures.
428 	 * Ultimately we want to re-fill these spots. XXX
429 	 */
430 	if (cluster) {
431 		hammer2_inode_ref(pmp->iroot);
432 		hammer2_mtx_ex(&pmp->iroot->lock);
433 		j = pmp->iroot->cluster.nchains;
434 
435 		kprintf("add PFS to pmp %p[%d]\n", pmp, j);
436 
437 		for (i = 0; i < cluster->nchains; ++i) {
438 			if (j == HAMMER2_MAXCLUSTER)
439 				break;
440 			rchain = cluster->array[i].chain;
441 			KKASSERT(rchain->pmp == NULL);
442 			rchain->pmp = pmp;
443 			hammer2_chain_ref(rchain);
444 			pmp->iroot->cluster.array[j].chain = rchain;
445 			pmp->pfs_types[j] = ripdata->pfs_type;
446 
447 			/*
448 			 * If the PFS is already mounted we must account
449 			 * for the mount_count here.
450 			 */
451 			if (pmp->mp)
452 				++rchain->hmp->mount_count;
453 
454 			/*
455 			 * May have to fixup dirty chain tracking.  Previous
456 			 * pmp was NULL so nothing to undo.
457 			 */
458 			if (rchain->flags & HAMMER2_CHAIN_MODIFIED)
459 				hammer2_pfs_memory_inc(pmp);
460 			++j;
461 		}
462 		pmp->iroot->cluster.nchains = j;
463 		hammer2_mtx_unlock(&pmp->iroot->lock);
464 		hammer2_inode_drop(pmp->iroot);
465 
466 		if (i != cluster->nchains) {
467 			kprintf("hammer2_mount: cluster full!\n");
468 			/* XXX fatal error? */
469 		}
470 	}
471 
472 	return pmp;
473 }
474 
475 /*
 476  * Destroy a PFS.  This typically only occurs after the last mount on a
 477  * device has gone away.
478  */
479 static void
480 hammer2_pfsfree(hammer2_pfs_t *pmp)
481 {
482 	/*
 483 	 * Clean up our reference on iroot.  iroot is not (and should not be)
 484 	 * needed by the flush code.
485 	 */
486 	TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);
487 
488 	hammer2_syncthr_delete(&pmp->primary_thr);
489 
490 	if (pmp->iroot) {
491 #if REPORT_REFS_ERRORS
492 		if (pmp->iroot->refs != 1)
493 			kprintf("PMP->IROOT %p REFS WRONG %d\n",
494 				pmp->iroot, pmp->iroot->refs);
495 #else
496 		KKASSERT(pmp->iroot->refs == 1);
497 #endif
498 		/* ref for pmp->iroot */
499 		hammer2_inode_drop(pmp->iroot);
500 		pmp->iroot = NULL;
501 	}
502 
503 	kmalloc_destroy(&pmp->mmsg);
504 	kmalloc_destroy(&pmp->minode);
505 
506 	kfree(pmp, M_HAMMER2);
507 }
508 
509 /*
510  * Remove all references to hmp from the pfs list.  Any PFS which becomes
511  * empty is terminated and freed.
512  *
513  * XXX inefficient.
514  */
515 static void
516 hammer2_pfsfree_scan(hammer2_dev_t *hmp)
517 {
518 	hammer2_pfs_t *pmp;
519 	hammer2_cluster_t *cluster;
520 	hammer2_chain_t *rchain;
521 	int didfreeze;
522 	int i;
523 
524 again:
525 	TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
526 		if (pmp->iroot == NULL)
527 			continue;
528 		if (hmp->spmp == pmp) {
529 			kprintf("unmount hmp %p remove spmp %p\n",
530 				hmp, pmp);
531 			hmp->spmp = NULL;
532 		}
533 
534 		/*
535 		 * Determine if this PFS is affected.  If it is we must
536 		 * freeze all management threads and lock its iroot.
537 		 *
 538 		 * Freezing a management thread forces it idle; any operation
 539 		 * in progress is aborted and the thread will have to start
 540 		 * over again when unfrozen, or exit if told to exit.
541 		 */
542 		cluster = &pmp->iroot->cluster;
543 		for (i = 0; i < cluster->nchains; ++i) {
544 			rchain = cluster->array[i].chain;
545 			if (rchain == NULL || rchain->hmp != hmp)
546 				continue;
547 			break;
548 		}
549 		if (i != cluster->nchains) {
550 			hammer2_syncthr_freeze(&pmp->primary_thr);
551 
552 			/*
553 			 * Lock the inode and clean out matching chains.
554 			 * Note that we cannot use hammer2_inode_lock_*()
555 			 * here because that would attempt to validate the
556 			 * cluster that we are in the middle of ripping
557 			 * apart.
558 			 *
 559 			 * WARNING! We are working directly on the inode's
 560 			 *	    embedded cluster.
561 			 */
562 			hammer2_mtx_ex(&pmp->iroot->lock);
563 
564 			/*
565 			 * Remove the chain from matching elements of the PFS.
566 			 */
567 			for (i = 0; i < cluster->nchains; ++i) {
568 				rchain = cluster->array[i].chain;
569 				if (rchain == NULL || rchain->hmp != hmp)
570 					continue;
571 
572 				cluster->array[i].chain = NULL;
573 				pmp->pfs_types[i] = 0;
574 				hammer2_chain_drop(rchain);
575 
576 				/* focus hint */
577 				if (cluster->focus == rchain)
578 					cluster->focus = NULL;
579 			}
580 			hammer2_mtx_unlock(&pmp->iroot->lock);
581 			didfreeze = 1;	/* remaster, unfreeze down below */
582 		} else {
583 			didfreeze = 0;
584 		}
585 
586 		/*
587 		 * Cleanup trailing chains.  Do not reorder chains (for now).
588 		 * XXX might remove more than we intended.
589 		 */
590 		while (i > 0) {
591 			if (cluster->array[i - 1].chain)
592 				break;
593 			--i;
594 		}
595 		cluster->nchains = i;
596 
597 		/*
598 		 * If the PMP has no elements remaining we can destroy it.
599 		 * (this will transition management threads from frozen->exit).
600 		 */
601 		if (cluster->nchains == 0) {
602 			kprintf("unmount hmp %p last ref to PMP=%p\n",
603 				hmp, pmp);
604 			hammer2_pfsfree(pmp);
605 			goto again;
606 		}
607 
608 		/*
609 		 * If elements still remain we need to set the REMASTER
610 		 * flag and unfreeze it.
611 		 */
612 		if (didfreeze) {
613 			hammer2_syncthr_remaster(&pmp->primary_thr);
614 			hammer2_syncthr_unfreeze(&pmp->primary_thr);
615 		}
616 	}
617 }
618 
619 /*
 620  * Mount or remount HAMMER2 filesystem from physical media
621  *
622  *	mountroot
623  *		mp		mount point structure
624  *		path		NULL
625  *		data		<unused>
626  *		cred		<unused>
627  *
628  *	mount
629  *		mp		mount point structure
630  *		path		path to mount point
631  *		data		pointer to argument structure in user space
632  *			volume	volume path (device@LABEL form)
633  *			hflags	user mount flags
634  *		cred		user credentials
635  *
636  * RETURNS:	0	Success
637  *		!0	error number
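 *
 * For example, a non-root mount might pass a volume string such as
 * "/dev/serno/<serial>.s1a@DATA" (the device path and PFS label here are
 * purely illustrative); the string is split at the '@' below into the
 * device path and the PFS label.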
638  */
639 static
640 int
641 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
642 		  struct ucred *cred)
643 {
644 	struct hammer2_mount_info info;
645 	hammer2_pfs_t *pmp;
646 	hammer2_pfs_t *spmp;
647 	hammer2_dev_t *hmp;
648 	hammer2_key_t key_next;
649 	hammer2_key_t key_dummy;
650 	hammer2_key_t lhc;
651 	struct vnode *devvp;
652 	struct nlookupdata nd;
653 	hammer2_chain_t *parent;
654 	hammer2_cluster_t *cluster;
655 	hammer2_cluster_t *cparent;
656 	const hammer2_inode_data_t *ripdata;
657 	hammer2_blockref_t bref;
658 	struct file *fp;
659 	char devstr[MNAMELEN];
660 	size_t size;
661 	size_t done;
662 	char *dev;
663 	char *label;
664 	int ronly = 1;
665 	int error;
666 	int cache_index;
667 	int i;
668 
669 	hmp = NULL;
670 	pmp = NULL;
671 	dev = NULL;
672 	label = NULL;
673 	devvp = NULL;
674 	cache_index = -1;
675 
676 	kprintf("hammer2_mount\n");
677 
678 	if (path == NULL) {
679 		/*
680 		 * Root mount
681 		 */
682 		bzero(&info, sizeof(info));
683 		info.cluster_fd = -1;
684 		return (EOPNOTSUPP);
685 	} else {
686 		/*
687 		 * Non-root mount or updating a mount
688 		 */
689 		error = copyin(data, &info, sizeof(info));
690 		if (error)
691 			return (error);
692 
693 		error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
694 		if (error)
695 			return (error);
696 
697 		/* Extract device and label */
698 		dev = devstr;
699 		label = strchr(devstr, '@');
700 		if (label == NULL ||
701 		    ((label + 1) - dev) > done) {
702 			return (EINVAL);
703 		}
704 		*label = '\0';
705 		label++;
706 		if (*label == '\0')
707 			return (EINVAL);
708 
709 		if (mp->mnt_flag & MNT_UPDATE) {
710 			/*
711 			 * Update mount.  Note that pmp->iroot->cluster is
712 			 * an inode-embedded cluster and thus cannot be
713 			 * directly locked.
714 			 *
715 			 * XXX HAMMER2 needs to implement NFS export via
716 			 *     mountctl.
717 			 */
718 			pmp = MPTOPMP(mp);
719 			cluster = &pmp->iroot->cluster;
720 			for (i = 0; i < cluster->nchains; ++i) {
721 				if (cluster->array[i].chain == NULL)
722 					continue;
723 				hmp = cluster->array[i].chain->hmp;
724 				devvp = hmp->devvp;
725 				error = hammer2_remount(hmp, mp, path,
726 							devvp, cred);
727 				if (error)
728 					break;
729 			}
730 			/*hammer2_inode_install_hidden(pmp);*/
731 
732 			return error;
733 		}
734 	}
735 
736 	/*
737 	 * HMP device mount
738 	 *
739 	 * Lookup name and verify it refers to a block device.
740 	 */
741 	error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
742 	if (error == 0)
743 		error = nlookup(&nd);
744 	if (error == 0)
745 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
746 	nlookup_done(&nd);
747 
748 	if (error == 0) {
749 		if (vn_isdisk(devvp, &error))
750 			error = vfs_mountedon(devvp);
751 	}
752 
753 	/*
754 	 * Determine if the device has already been mounted.  After this
 755 	 * check hmp will be non-NULL if we are doing the second or subsequent
 756 	 * hammer2 mount from the same device.
757 	 */
758 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
759 	TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
760 		if (hmp->devvp == devvp)
761 			break;
762 	}
763 
764 	/*
765 	 * Open the device if this isn't a secondary mount and construct
766 	 * the H2 device mount (hmp).
767 	 */
768 	if (hmp == NULL) {
769 		hammer2_chain_t *schain;
770 		hammer2_xid_t xid;
771 
772 		if (error == 0 && vcount(devvp) > 0)
773 			error = EBUSY;
774 
775 		/*
776 		 * Now open the device
777 		 */
778 		if (error == 0) {
779 			ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
780 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
781 			error = vinvalbuf(devvp, V_SAVE, 0, 0);
782 			if (error == 0) {
783 				error = VOP_OPEN(devvp,
784 						 ronly ? FREAD : FREAD | FWRITE,
785 						 FSCRED, NULL);
786 			}
787 			vn_unlock(devvp);
788 		}
789 		if (error && devvp) {
790 			vrele(devvp);
791 			devvp = NULL;
792 		}
793 		if (error) {
794 			lockmgr(&hammer2_mntlk, LK_RELEASE);
795 			return error;
796 		}
797 		hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
798 		ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
799 		hmp->ronly = ronly;
800 		hmp->devvp = devvp;
801 		kmalloc_create(&hmp->mchain, "HAMMER2-chains");
802 		TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
803 		RB_INIT(&hmp->iotree);
804 		spin_init(&hmp->io_spin, "hm2mount_io");
805 		spin_init(&hmp->list_spin, "hm2mount_list");
806 		TAILQ_INIT(&hmp->flushq);
807 
808 		lockinit(&hmp->vollk, "h2vol", 0, 0);
809 
810 		/*
811 		 * vchain setup. vchain.data is embedded.
812 		 * vchain.refs is initialized and will never drop to 0.
813 		 *
814 		 * NOTE! voldata is not yet loaded.
815 		 */
816 		hmp->vchain.hmp = hmp;
817 		hmp->vchain.refs = 1;
818 		hmp->vchain.data = (void *)&hmp->voldata;
819 		hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
820 		hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
821 		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
822 
823 		hammer2_chain_core_init(&hmp->vchain);
824 		/* hmp->vchain.u.xxx is left NULL */
825 
826 		/*
827 		 * fchain setup.  fchain.data is embedded.
828 		 * fchain.refs is initialized and will never drop to 0.
829 		 *
830 		 * The data is not used but needs to be initialized to
831 		 * pass assertion muster.  We use this chain primarily
832 		 * as a placeholder for the freemap's top-level RBTREE
833 		 * so it does not interfere with the volume's topology
834 		 * RBTREE.
835 		 */
836 		hmp->fchain.hmp = hmp;
837 		hmp->fchain.refs = 1;
838 		hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
839 		hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
840 		hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
841 		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
842 		hmp->fchain.bref.methods =
843 			HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
844 			HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
845 
846 		hammer2_chain_core_init(&hmp->fchain);
847 		/* hmp->fchain.u.xxx is left NULL */
848 
849 		/*
850 		 * Install the volume header and initialize fields from
851 		 * voldata.
852 		 */
853 		error = hammer2_install_volume_header(hmp);
854 		if (error) {
855 			hammer2_unmount_helper(mp, NULL, hmp);
856 			lockmgr(&hammer2_mntlk, LK_RELEASE);
857 			hammer2_vfs_unmount(mp, MNT_FORCE);
858 			return error;
859 		}
860 
861 		/*
 862 		 * Really important to get these right or the flush code will
 863 		 * get confused.
864 		 */
865 		hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0);
866 		kprintf("alloc spmp %p tid %016jx\n",
867 			hmp->spmp, hmp->voldata.mirror_tid);
868 		spmp = hmp->spmp;
869 		spmp->inode_tid = 1;
870 
871 		/*
872 		 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
873 		 * is inherited from the volume header.
874 		 */
875 		xid = 0;
876 		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
877 		hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
878 		hmp->vchain.pmp = spmp;
879 		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
880 		hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
881 		hmp->fchain.pmp = spmp;
882 
883 		/*
884 		 * First locate the super-root inode, which is key 0
885 		 * relative to the volume header's blockset.
886 		 *
887 		 * Then locate the root inode by scanning the directory keyspace
888 		 * represented by the label.
889 		 */
890 		parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
891 		schain = hammer2_chain_lookup(&parent, &key_dummy,
892 				      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
893 				      &cache_index, 0);
894 		hammer2_chain_lookup_done(parent);
895 		if (schain == NULL) {
896 			kprintf("hammer2_mount: invalid super-root\n");
897 			hammer2_unmount_helper(mp, NULL, hmp);
898 			lockmgr(&hammer2_mntlk, LK_RELEASE);
899 			hammer2_vfs_unmount(mp, MNT_FORCE);
900 			return EINVAL;
901 		}
902 		if (schain->error) {
903 			kprintf("hammer2_mount: error %s reading super-root\n",
904 				hammer2_error_str(schain->error));
905 			hammer2_chain_unlock(schain);
906 			hammer2_chain_drop(schain);
907 			schain = NULL;
908 			hammer2_unmount_helper(mp, NULL, hmp);
909 			lockmgr(&hammer2_mntlk, LK_RELEASE);
910 			hammer2_vfs_unmount(mp, MNT_FORCE);
911 			return EINVAL;
912 		}
913 		spmp->modify_tid = schain->bref.modify_tid;
914 
915 		/*
916 		 * Sanity-check schain's pmp and finish initialization.
917 		 * Any chain belonging to the super-root topology should
918 		 * have a NULL pmp (not even set to spmp).
919 		 */
920 		ripdata = &hammer2_chain_rdata(schain)->ipdata;
921 		KKASSERT(schain->pmp == NULL);
922 		spmp->pfs_clid = ripdata->pfs_clid;
923 
924 		/*
925 		 * Replace the dummy spmp->iroot with a real one.  It's
926 		 * easier to just do a wholesale replacement than to try
927 		 * to update the chain and fixup the iroot fields.
928 		 *
929 		 * The returned inode is locked with the supplied cluster.
930 		 */
931 		cluster = hammer2_cluster_from_chain(schain);
932 		hammer2_inode_drop(spmp->iroot);
933 		spmp->iroot = NULL;
934 		spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
935 		spmp->spmp_hmp = hmp;
936 		spmp->pfs_types[0] = ripdata->pfs_type;
937 		hammer2_inode_ref(spmp->iroot);
938 		hammer2_inode_unlock(spmp->iroot, cluster);
939 		schain = NULL;
940 		/* leave spmp->iroot with one ref */
941 
942 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
943 			error = hammer2_recovery(hmp);
944 			/* XXX do something with error */
945 		}
946 		hammer2_update_pmps(hmp);
947 		hammer2_iocom_init(hmp);
948 
949 		/*
950 		 * Ref the cluster management messaging descriptor.  The mount
951 		 * program deals with the other end of the communications pipe.
952 		 */
953 		fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
954 		if (fp) {
955 			hammer2_cluster_reconnect(hmp, fp);
956 		} else {
957 			kprintf("hammer2_mount: bad cluster_fd!\n");
958 		}
959 	} else {
960 		spmp = hmp->spmp;
961 	}
962 
963 	/*
964 	 * Lookup the mount point under the media-localized super-root.
965 	 * Scanning hammer2_pfslist doesn't help us because it represents
966 	 * PFS cluster ids which can aggregate several named PFSs together.
967 	 *
968 	 * cluster->pmp will incorrectly point to spmp and must be fixed
969 	 * up later on.
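	 *
	 * The label is located via its directory hash: hammer2_dirhash()
	 * yields the base key and every entry in the collision range
	 * lhc .. lhc + HAMMER2_DIRHASH_LOMASK is scanned, comparing the
	 * stored filename against the label.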
970 	 */
971 	cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
972 	lhc = hammer2_dirhash(label, strlen(label));
973 	cluster = hammer2_cluster_lookup(cparent, &key_next,
974 				      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
975 				      0);
976 	while (cluster) {
977 		if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
978 		    strcmp(label,
979 		       hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
980 			break;
981 		}
982 		cluster = hammer2_cluster_next(cparent, cluster, &key_next,
983 					    key_next,
984 					    lhc + HAMMER2_DIRHASH_LOMASK, 0);
985 	}
986 	hammer2_inode_unlock(spmp->iroot, cparent);
987 
988 	/*
989 	 * PFS could not be found?
990 	 */
991 	if (cluster == NULL) {
992 		kprintf("hammer2_mount: PFS label not found\n");
993 		hammer2_unmount_helper(mp, NULL, hmp);
994 		lockmgr(&hammer2_mntlk, LK_RELEASE);
995 		hammer2_vfs_unmount(mp, MNT_FORCE);
996 
997 		return EINVAL;
998 	}
999 
1000 	/*
1001 	 * Acquire the pmp structure (it should have already been allocated
1002 	 * via hammer2_update_pmps() so do not pass cluster in to add to
1003 	 * available chains).
1004 	 *
1005 	 * Check if the cluster has already been mounted.  A cluster can
 1006 	 * only be mounted once; use null mounts to mount additional copies.
1007 	 */
1008 	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1009 	hammer2_cluster_bref(cluster, &bref);
1010 	pmp = hammer2_pfsalloc(NULL, ripdata, bref.modify_tid);
1011 	hammer2_cluster_unlock(cluster);
1012 	hammer2_cluster_drop(cluster);
1013 
1014 	if (pmp->mp) {
1015 		kprintf("hammer2_mount: PFS already mounted!\n");
1016 		hammer2_unmount_helper(mp, NULL, hmp);
1017 		lockmgr(&hammer2_mntlk, LK_RELEASE);
1018 		hammer2_vfs_unmount(mp, MNT_FORCE);
1019 
1020 		return EBUSY;
1021 	}
1022 
1023 	/*
1024 	 * Finish the mount
1025 	 */
1026         kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);
1027 
1028         mp->mnt_flag = MNT_LOCAL;
1029         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
1030         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
1031 
1032         /*
1033          * required mount structure initializations
1034          */
1035         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
1036         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
1037 
1038         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
1039         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1040 
1041         /*
1042          * Optional fields
1043          */
1044         mp->mnt_iosize_max = MAXPHYS;
1045 
1046 	/*
1047 	 * Connect up mount pointers.
1048 	 */
1049 	hammer2_mount_helper(mp, pmp);
1050 
1051         lockmgr(&hammer2_mntlk, LK_RELEASE);
1052 
1053 	/*
1054 	 * A mounted PFS needs a write thread for logical buffers and
1055 	 * a hidden directory for deletions of open files.  These features
1056 	 * are not used by unmounted PFSs.
1057 	 *
1058 	 * The logical file buffer bio write thread handles things like
1059 	 * physical block assignment and compression.
1060 	 */
1061 	pmp->wthread_destroy = 0;
1062 	lwkt_create(hammer2_write_thread, pmp,
1063 		    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);
1064 
1065 	/*
1066 	 * With the cluster operational install ihidden.
1067 	 * (only applicable to pfs mounts, not applicable to spmp)
1068 	 */
1069 	hammer2_inode_install_hidden(pmp);
1070 
1071 	/*
1072 	 * Finish setup
1073 	 */
1074 	vfs_getnewfsid(mp);
1075 	vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
1076 	vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
1077 	vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
1078 
1079 	copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
1080 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
1081 	bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
1082 	copyinstr(path, mp->mnt_stat.f_mntonname,
1083 		  sizeof(mp->mnt_stat.f_mntonname) - 1,
1084 		  &size);
1085 
1086 	/*
1087 	 * Initial statfs to prime mnt_stat.
1088 	 */
1089 	hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
1090 
1091 	return 0;
1092 }
1093 
1094 /*
1095  * Scan PFSs under the super-root and create hammer2_pfs structures.
1096  */
1097 static
1098 void
1099 hammer2_update_pmps(hammer2_dev_t *hmp)
1100 {
1101 	const hammer2_inode_data_t *ripdata;
1102 	hammer2_cluster_t *cparent;
1103 	hammer2_cluster_t *cluster;
1104 	hammer2_blockref_t bref;
1105 	hammer2_pfs_t *spmp;
1106 	hammer2_pfs_t *pmp;
1107 	hammer2_key_t key_next;
1108 
1109 	/*
1110 	 * Lookup mount point under the media-localized super-root.
1111 	 *
1112 	 * cluster->pmp will incorrectly point to spmp and must be fixed
1113 	 * up later on.
1114 	 */
1115 	spmp = hmp->spmp;
1116 	cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
1117 	cluster = hammer2_cluster_lookup(cparent, &key_next,
1118 					 HAMMER2_KEY_MIN,
1119 					 HAMMER2_KEY_MAX,
1120 					 0);
1121 	while (cluster) {
1122 		if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE)
1123 			continue;
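		/*
		 * NOTE: the continue above does not advance the iteration;
		 *	 only inode brefs are expected directly under the
		 *	 super-root, so this path should normally not be
		 *	 taken.
		 */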
1124 		ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1125 		hammer2_cluster_bref(cluster, &bref);
1126 		kprintf("ADD LOCAL PFS: %s\n", ripdata->filename);
1127 
1128 		pmp = hammer2_pfsalloc(cluster, ripdata, bref.modify_tid);
1129 		cluster = hammer2_cluster_next(cparent, cluster,
1130 					       &key_next,
1131 					       key_next,
1132 					       HAMMER2_KEY_MAX,
1133 					       0);
1134 	}
1135 	hammer2_inode_unlock(spmp->iroot, cparent);
1136 }
1137 
1138 /*
1139  * Handle bioq for strategy write
1140  */
1141 static
1142 void
1143 hammer2_write_thread(void *arg)
1144 {
1145 	hammer2_pfs_t *pmp;
1146 	struct bio *bio;
1147 	struct buf *bp;
1148 	hammer2_trans_t trans;
1149 	struct vnode *vp;
1150 	hammer2_inode_t *ip;
1151 	hammer2_cluster_t *cparent;
1152 	const hammer2_inode_data_t *ripdata;
1153 	hammer2_key_t lbase;
1154 	int lblksize;
1155 	int pblksize;
1156 	int error;
1157 
1158 	pmp = arg;
1159 
1160 	hammer2_mtx_ex(&pmp->wthread_mtx);
1161 	while (pmp->wthread_destroy == 0) {
1162 		if (bioq_first(&pmp->wthread_bioq) == NULL) {
1163 			mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
1164 				 0, "h2bioqw", 0);
1165 		}
1166 		cparent = NULL;
1167 
1168 		hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);
1169 
1170 		while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
1171 			/*
1172 			 * dummy bio for synchronization.  The transaction
1173 			 * must be reinitialized.
1174 			 */
1175 			if (bio->bio_buf == NULL) {
1176 				bio->bio_flags |= BIO_DONE;
1177 				wakeup(bio);
1178 				hammer2_trans_done(&trans);
1179 				hammer2_trans_init(&trans, pmp,
1180 						   HAMMER2_TRANS_BUFCACHE);
1181 				continue;
1182 			}
1183 
1184 			/*
1185 			 * else normal bio processing
1186 			 */
1187 			hammer2_mtx_unlock(&pmp->wthread_mtx);
1188 
1189 			hammer2_lwinprog_drop(pmp);
1190 
1191 			error = 0;
1192 			bp = bio->bio_buf;
1193 			vp = bp->b_vp;
1194 			ip = VTOI(vp);
1195 
 1196 			 * Inode is modified; flush size and mtime changes
1197 			 * Inode is modified, flush size and mtime changes
1198 			 * to ensure that the file size remains consistent
1199 			 * with the buffers being flushed.
1200 			 *
1201 			 * NOTE: The inode_fsync() call only flushes the
1202 			 *	 inode's meta-data state, it doesn't try
1203 			 *	 to flush underlying buffers or chains.
1204 			 *
1205 			 * NOTE: hammer2_write_file_core() may indirectly
1206 			 *	 modify and modsync the inode.
1207 			 */
1208 			cparent = hammer2_inode_lock(ip,
1209 						     HAMMER2_RESOLVE_ALWAYS);
1210 			if (ip->flags & (HAMMER2_INODE_RESIZED |
1211 					 HAMMER2_INODE_MTIME)) {
1212 				hammer2_inode_fsync(&trans, ip, cparent);
1213 			}
1214 			ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
1215 			lblksize = hammer2_calc_logical(ip, bio->bio_offset,
1216 							&lbase, NULL);
1217 			pblksize = hammer2_calc_physical(ip, ripdata, lbase);
1218 			hammer2_write_file_core(bp, &trans, ip, ripdata,
1219 						cparent,
1220 						lbase, IO_ASYNC,
1221 						pblksize, &error);
1222 			/* ripdata can be invalid after call */
1223 			hammer2_inode_unlock(ip, cparent);
1224 			if (error) {
1225 				kprintf("hammer2: error in buffer write\n");
1226 				bp->b_flags |= B_ERROR;
1227 				bp->b_error = EIO;
1228 			}
1229 			biodone(bio);
1230 			hammer2_mtx_ex(&pmp->wthread_mtx);
1231 		}
1232 		hammer2_trans_done(&trans);
1233 	}
1234 	pmp->wthread_destroy = -1;
1235 	wakeup(&pmp->wthread_destroy);
1236 
1237 	hammer2_mtx_unlock(&pmp->wthread_mtx);
1238 }
1239 
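/*
 * Wait for the PFS's strategy write thread to drain its bioq.  A dummy
 * bio with no bio_buf is appended; the write thread recognizes it, marks
 * it BIO_DONE and wakes us up (see hammer2_write_thread() above).
 */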
1240 void
1241 hammer2_bioq_sync(hammer2_pfs_t *pmp)
1242 {
1243 	struct bio sync_bio;
1244 
1245 	bzero(&sync_bio, sizeof(sync_bio));	/* dummy with no bio_buf */
1246 	hammer2_mtx_ex(&pmp->wthread_mtx);
1247 	if (pmp->wthread_destroy == 0 &&
1248 	    TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
1249 		bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
1250 		while ((sync_bio.bio_flags & BIO_DONE) == 0)
1251 			mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
1252 	}
1253 	hammer2_mtx_unlock(&pmp->wthread_mtx);
1254 }
1255 
1256 /*
1257  * Return a chain suitable for I/O, creating the chain if necessary
1258  * and assigning its physical block.
1259  *
1260  * cparent can wind up being anything.
1261  */
1262 static
1263 hammer2_cluster_t *
1264 hammer2_assign_physical(hammer2_trans_t *trans,
1265 			hammer2_inode_t *ip, hammer2_cluster_t *cparent,
1266 			hammer2_key_t lbase, int pblksize, int *errorp)
1267 {
1268 	hammer2_cluster_t *cluster;
1269 	hammer2_cluster_t *dparent;
1270 	hammer2_key_t key_dummy;
1271 	int pradix = hammer2_getradix(pblksize);
1272 
1273 	/*
 1274 	 * Locate the chain associated with lbase and return a locked chain.
1275 	 * However, do not instantiate any data reference (which utilizes a
1276 	 * device buffer) because we will be using direct IO via the
1277 	 * logical buffer cache buffer.
1278 	 */
1279 	*errorp = 0;
1280 	KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
1281 retry:
1282 	dparent = hammer2_cluster_lookup_init(cparent, 0);
1283 	cluster = hammer2_cluster_lookup(dparent, &key_dummy,
1284 				     lbase, lbase,
1285 				     HAMMER2_LOOKUP_NODATA);
1286 
1287 	if (cluster == NULL) {
1288 		/*
1289 		 * We found a hole, create a new chain entry.
1290 		 *
1291 		 * NOTE: DATA chains are created without device backing
1292 		 *	 store (nor do we want any).
1293 		 */
1294 		*errorp = hammer2_cluster_create(trans, dparent, &cluster,
1295 					       lbase, HAMMER2_PBUFRADIX,
1296 					       HAMMER2_BREF_TYPE_DATA,
1297 					       pblksize, 0);
1298 		if (cluster == NULL) {
1299 			hammer2_cluster_lookup_done(dparent);
1300 			panic("hammer2_cluster_create: par=%p error=%d\n",
1301 				dparent->focus, *errorp);
1302 			goto retry;
1303 		}
1304 		/*ip->delta_dcount += pblksize;*/
1305 	} else {
1306 		switch (hammer2_cluster_type(cluster)) {
1307 		case HAMMER2_BREF_TYPE_INODE:
1308 			/*
1309 			 * The data is embedded in the inode.  The
1310 			 * caller is responsible for marking the inode
1311 			 * modified and copying the data to the embedded
1312 			 * area.
1313 			 */
1314 			break;
1315 		case HAMMER2_BREF_TYPE_DATA:
1316 			if (hammer2_cluster_need_resize(cluster, pblksize)) {
1317 				hammer2_cluster_resize(trans, ip,
1318 						     dparent, cluster,
1319 						     pradix,
1320 						     HAMMER2_MODIFY_OPTDATA);
1321 			}
1322 
1323 			/*
1324 			 * DATA buffers must be marked modified whether the
1325 			 * data is in a logical buffer or not.  We also have
1326 			 * to make this call to fixup the chain data pointers
1327 			 * after resizing in case this is an encrypted or
1328 			 * compressed buffer.
1329 			 */
1330 			hammer2_cluster_modify(trans, cluster,
1331 					       HAMMER2_MODIFY_OPTDATA);
1332 			break;
1333 		default:
1334 			panic("hammer2_assign_physical: bad type");
1335 			/* NOT REACHED */
1336 			break;
1337 		}
1338 	}
1339 
1340 	/*
1341 	 * Cleanup.  If cluster wound up being the inode itself, i.e.
1342 	 * the DIRECTDATA case for offset 0, then we need to update cparent.
1343 	 * The caller expects cparent to not become stale.
1344 	 */
1345 	hammer2_cluster_lookup_done(dparent);
1346 	/* dparent = NULL; safety */
1347 	return (cluster);
1348 }
1349 
1350 /*
1351  * bio queued from hammer2_vnops.c.
1352  *
1353  * The core write function which determines which path to take
1354  * depending on compression settings.  We also have to locate the
1355  * related clusters so we can calculate and set the check data for
1356  * the blockref.
1357  */
1358 static
1359 void
1360 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
1361 			hammer2_inode_t *ip,
1362 			const hammer2_inode_data_t *ripdata,
1363 			hammer2_cluster_t *cparent,
1364 			hammer2_key_t lbase, int ioflag, int pblksize,
1365 			int *errorp)
1366 {
1367 	hammer2_cluster_t *cluster;
1368 
1369 	switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) {
1370 	case HAMMER2_COMP_NONE:
1371 		/*
1372 		 * We have to assign physical storage to the buffer
1373 		 * we intend to dirty or write now to avoid deadlocks
1374 		 * in the strategy code later.
1375 		 *
1376 		 * This can return NOOFFSET for inode-embedded data.
1377 		 * The strategy code will take care of it in that case.
1378 		 */
1379 		cluster = hammer2_assign_physical(trans, ip, cparent,
1380 						lbase, pblksize,
1381 						errorp);
1382 		hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1383 				 ripdata->check_algo);
1384 		/* ripdata can become invalid */
1385 		if (cluster) {
1386 			hammer2_cluster_unlock(cluster);
1387 			hammer2_cluster_drop(cluster);
1388 		}
1389 		break;
1390 	case HAMMER2_COMP_AUTOZERO:
1391 		/*
1392 		 * Check for zero-fill only
1393 		 */
1394 		hammer2_zero_check_and_write(bp, trans, ip,
1395 				    ripdata, cparent, lbase,
1396 				    ioflag, pblksize, errorp,
1397 				    ripdata->check_algo);
1398 		break;
1399 	case HAMMER2_COMP_LZ4:
1400 	case HAMMER2_COMP_ZLIB:
1401 	default:
1402 		/*
1403 		 * Check for zero-fill and attempt compression.
1404 		 */
1405 		hammer2_compress_and_write(bp, trans, ip,
1406 					   ripdata, cparent,
1407 					   lbase, ioflag,
1408 					   pblksize, errorp,
1409 					   ripdata->comp_algo,
1410 					   ripdata->check_algo);
1411 		break;
1412 	}
1413 }
1414 
1415 /*
 1416  * Generic function that performs the compression in the compressed
 1417  * write path.  The compression algorithm is determined by the settings
 1418  * obtained from the inode.
1419  */
1420 static
1421 void
1422 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1423 	hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1424 	hammer2_cluster_t *cparent,
1425 	hammer2_key_t lbase, int ioflag, int pblksize,
1426 	int *errorp, int comp_algo, int check_algo)
1427 {
1428 	hammer2_cluster_t *cluster;
1429 	hammer2_chain_t *chain;
1430 	int comp_size;
1431 	int comp_block_size;
1432 	int i;
1433 	char *comp_buffer;
1434 
1435 	if (test_block_zeros(bp->b_data, pblksize)) {
1436 		zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1437 		return;
1438 	}
1439 
1440 	comp_size = 0;
1441 	comp_buffer = NULL;
1442 
1443 	KKASSERT(pblksize / 2 <= 32768);
1444 
1445 	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1446 		z_stream strm_compress;
1447 		int comp_level;
1448 		int ret;
1449 
1450 		switch(HAMMER2_DEC_ALGO(comp_algo)) {
1451 		case HAMMER2_COMP_LZ4:
1452 			comp_buffer = objcache_get(cache_buffer_write,
1453 						   M_INTWAIT);
1454 			comp_size = LZ4_compress_limitedOutput(
1455 					bp->b_data,
1456 					&comp_buffer[sizeof(int)],
1457 					pblksize,
1458 					pblksize / 2 - sizeof(int));
1459 			/*
 1460 			 * We need to prefix with the size; LZ4
1461 			 * doesn't do it for us.  Add the related
1462 			 * overhead.
1463 			 */
1464 			*(int *)comp_buffer = comp_size;
1465 			if (comp_size)
1466 				comp_size += sizeof(int);
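			/*
			 * The resulting media layout for an LZ4 block is a
			 * native-endian int payload length followed by the
			 * compressed stream; comp_size now also accounts for
			 * the length prefix itself.
			 */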
1467 			break;
1468 		case HAMMER2_COMP_ZLIB:
1469 			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1470 			if (comp_level == 0)
1471 				comp_level = 6;	/* default zlib compression */
1472 			else if (comp_level < 6)
1473 				comp_level = 6;
1474 			else if (comp_level > 9)
1475 				comp_level = 9;
1476 			ret = deflateInit(&strm_compress, comp_level);
1477 			if (ret != Z_OK) {
1478 				kprintf("HAMMER2 ZLIB: fatal error "
1479 					"on deflateInit.\n");
1480 			}
1481 
1482 			comp_buffer = objcache_get(cache_buffer_write,
1483 						   M_INTWAIT);
1484 			strm_compress.next_in = bp->b_data;
1485 			strm_compress.avail_in = pblksize;
1486 			strm_compress.next_out = comp_buffer;
1487 			strm_compress.avail_out = pblksize / 2;
1488 			ret = deflate(&strm_compress, Z_FINISH);
1489 			if (ret == Z_STREAM_END) {
1490 				comp_size = pblksize / 2 -
1491 					    strm_compress.avail_out;
1492 			} else {
1493 				comp_size = 0;
1494 			}
1495 			ret = deflateEnd(&strm_compress);
1496 			break;
1497 		default:
1498 			kprintf("Error: Unknown compression method.\n");
1499 			kprintf("Comp_method = %d.\n", comp_algo);
1500 			break;
1501 		}
1502 	}
1503 
1504 	if (comp_size == 0) {
1505 		/*
1506 		 * compression failed or turned off
1507 		 */
1508 		comp_block_size = pblksize;	/* safety */
1509 		if (++ip->comp_heuristic > 128)
1510 			ip->comp_heuristic = 8;
1511 	} else {
1512 		/*
1513 		 * compression succeeded
1514 		 */
1515 		ip->comp_heuristic = 0;
1516 		if (comp_size <= 1024) {
1517 			comp_block_size = 1024;
1518 		} else if (comp_size <= 2048) {
1519 			comp_block_size = 2048;
1520 		} else if (comp_size <= 4096) {
1521 			comp_block_size = 4096;
1522 		} else if (comp_size <= 8192) {
1523 			comp_block_size = 8192;
1524 		} else if (comp_size <= 16384) {
1525 			comp_block_size = 16384;
1526 		} else if (comp_size <= 32768) {
1527 			comp_block_size = 32768;
1528 		} else {
1529 			panic("hammer2: WRITE PATH: "
1530 			      "Weird comp_size value.");
1531 			/* NOT REACHED */
1532 			comp_block_size = pblksize;
1533 		}
1534 	}
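	/*
	 * In the successful-compression case the ladder above simply rounds
	 * comp_size up to the smallest supported power-of-two block size
	 * (1024..32768 bytes).  An equivalent formulation, shown only as an
	 * illustrative sketch:
	 *
	 *	comp_block_size = 1024;
	 *	while (comp_block_size < comp_size)
	 *		comp_block_size <<= 1;
	 */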
1535 
1536 	cluster = hammer2_assign_physical(trans, ip, cparent,
1537 					  lbase, comp_block_size,
1538 					  errorp);
1539 	ripdata = NULL;
1540 
1541 	if (*errorp) {
1542 		kprintf("WRITE PATH: An error occurred while "
1543 			"assigning physical space.\n");
1544 		KKASSERT(cluster == NULL);
1545 		goto done;
1546 	}
1547 
1548 	if (cluster->ddflag) {
1549 		hammer2_inode_data_t *wipdata;
1550 
1551 		wipdata = hammer2_cluster_modify_ip(trans, ip, cluster, 0);
1552 		KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1553 		KKASSERT(bp->b_loffset == 0);
1554 		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1555 		hammer2_cluster_modsync(cluster);
1556 	} else
1557 	for (i = 0; i < cluster->nchains; ++i) {
1558 		hammer2_io_t *dio;
1559 		char *bdata;
1560 
 1561 		/* XXX hack */
1562 
1563 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1564 			continue;
1565 		chain = cluster->array[i].chain;	/* XXX */
1566 		if (chain == NULL)
1567 			continue;
1568 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1569 
1570 		switch(chain->bref.type) {
1571 		case HAMMER2_BREF_TYPE_INODE:
1572 			panic("hammer2_write_bp: unexpected inode\n");
1573 			break;
1574 		case HAMMER2_BREF_TYPE_DATA:
1575 			/*
1576 			 * Optimize out the read-before-write
1577 			 * if possible.
1578 			 */
1579 			*errorp = hammer2_io_newnz(chain->hmp,
1580 						   chain->bref.data_off,
1581 						   chain->bytes,
1582 						   &dio);
1583 			if (*errorp) {
1584 				hammer2_io_brelse(&dio);
1585 				kprintf("hammer2: WRITE PATH: "
1586 					"dbp bread error\n");
1587 				break;
1588 			}
1589 			bdata = hammer2_io_data(dio, chain->bref.data_off);
1590 
1591 			/*
1592 			 * When loading the block make sure we don't
1593 			 * leave garbage after the compressed data.
1594 			 */
1595 			if (comp_size) {
1596 				chain->bref.methods =
1597 					HAMMER2_ENC_COMP(comp_algo) +
1598 					HAMMER2_ENC_CHECK(check_algo);
1599 				bcopy(comp_buffer, bdata, comp_size);
1600 				if (comp_size != comp_block_size) {
1601 					bzero(bdata + comp_size,
1602 					      comp_block_size - comp_size);
1603 				}
1604 			} else {
1605 				chain->bref.methods =
1606 					HAMMER2_ENC_COMP(
1607 						HAMMER2_COMP_NONE) +
1608 					HAMMER2_ENC_CHECK(check_algo);
1609 				bcopy(bp->b_data, bdata, pblksize);
1610 			}
1611 
1612 			/*
1613 			 * The flush code doesn't calculate check codes for
1614 			 * file data (doing so can result in excessive I/O),
1615 			 * so we do it here.
1616 			 */
1617 			hammer2_chain_setcheck(chain, bdata);
1618 
1619 			/*
1620 			 * Device buffer is now valid, chain is no longer in
1621 			 * the initial state.
1622 			 *
1623 			 * (No blockref table worries with file data)
1624 			 */
1625 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1626 
1627 			/* Now write the related bdp. */
1628 			if (ioflag & IO_SYNC) {
1629 				/*
1630 				 * Synchronous I/O requested.
1631 				 */
1632 				hammer2_io_bwrite(&dio);
1633 			/*
1634 			} else if ((ioflag & IO_DIRECT) &&
1635 				   loff + n == pblksize) {
1636 				hammer2_io_bdwrite(&dio);
1637 			*/
1638 			} else if (ioflag & IO_ASYNC) {
1639 				hammer2_io_bawrite(&dio);
1640 			} else {
1641 				hammer2_io_bdwrite(&dio);
1642 			}
1643 			break;
1644 		default:
1645 			panic("hammer2_write_bp: bad chain type %d\n",
1646 				chain->bref.type);
1647 			/* NOT REACHED */
1648 			break;
1649 		}
1650 	}
1651 done:
1652 	if (cluster) {
1653 		hammer2_cluster_unlock(cluster);
1654 		hammer2_cluster_drop(cluster);
1655 	}
1656 	if (comp_buffer)
1657 		objcache_put(cache_buffer_write, comp_buffer);
1658 }
1659 
1660 /*
 1661  * Function that performs zero-checking and writing without compression;
 1662  * it corresponds to the default zero-checking path.
1663  */
1664 static
1665 void
1666 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1667 	hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1668 	hammer2_cluster_t *cparent,
1669 	hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
1670 	int check_algo)
1671 {
1672 	hammer2_cluster_t *cluster;
1673 
1674 	if (test_block_zeros(bp->b_data, pblksize)) {
1675 		zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1676 		/* ripdata can become invalid */
1677 	} else {
1678 		cluster = hammer2_assign_physical(trans, ip, cparent,
1679 						  lbase, pblksize, errorp);
1680 		hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1681 				 check_algo);
1682 		/* ripdata can become invalid */
1683 		if (cluster) {
1684 			hammer2_cluster_unlock(cluster);
1685 			hammer2_cluster_drop(cluster);
1686 		}
1687 	}
1688 }
1689 
1690 /*
 1691  * A function to test whether a block of data contains only zeros.
 1692  * Returns TRUE (non-zero) if the block is all zeros.
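 *
 * The scan strides by sizeof(long); this assumes bytes is a multiple of
 * sizeof(long) and that buf is suitably aligned, which holds for the
 * physical block sizes HAMMER2 passes in here.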
1693  */
1694 static
1695 int
1696 test_block_zeros(const char *buf, size_t bytes)
1697 {
1698 	size_t i;
1699 
1700 	for (i = 0; i < bytes; i += sizeof(long)) {
1701 		if (*(const long *)(buf + i) != 0)
1702 			return (0);
1703 	}
1704 	return (1);
1705 }
1706 
1707 /*
1708  * Function to "write" a block that contains only zeros.
1709  */
1710 static
1711 void
1712 zero_write(struct buf *bp, hammer2_trans_t *trans,
1713 	   hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1714 	   hammer2_cluster_t *cparent,
1715 	   hammer2_key_t lbase, int *errorp __unused)
1716 {
1717 	hammer2_cluster_t *cluster;
1718 	hammer2_key_t key_dummy;
1719 
1720 	cparent = hammer2_cluster_lookup_init(cparent, 0);
1721 	cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1722 				     HAMMER2_LOOKUP_NODATA);
1723 	if (cluster) {
1724 		if (cluster->ddflag) {
1725 			hammer2_inode_data_t *wipdata;
1726 
1727 			wipdata = hammer2_cluster_modify_ip(trans, ip,
1728 							    cluster, 0);
1729 			KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1730 			KKASSERT(bp->b_loffset == 0);
1731 			bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1732 			hammer2_cluster_modsync(cluster);
1733 		} else {
1734 			hammer2_cluster_delete(trans, cparent, cluster,
1735 					       HAMMER2_DELETE_PERMANENT);
1736 		}
1737 		hammer2_cluster_unlock(cluster);
1738 		hammer2_cluster_drop(cluster);
1739 	}
1740 	hammer2_cluster_lookup_done(cparent);
1741 }
1742 
1743 /*
 1744  * Function to write the data as-is, without performing any sort of
 1745  * compression.  This function is used in the no-compression path and
 1746  * in the default zero-checking path.
1747  */
1748 static
1749 void
1750 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1751 				int pblksize, int *errorp, int check_algo)
1752 {
1753 	hammer2_chain_t *chain;
1754 	hammer2_inode_data_t *wipdata;
1755 	hammer2_io_t *dio;
1756 	char *bdata;
1757 	int error;
1758 	int i;
1759 
1760 	error = 0;	/* XXX TODO below */
1761 
1762 	for (i = 0; i < cluster->nchains; ++i) {
1763 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1764 			continue;
1765 		chain = cluster->array[i].chain;	/* XXX */
1766 		if (chain == NULL)
1767 			continue;
1768 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1769 
1770 		switch(chain->bref.type) {
1771 		case HAMMER2_BREF_TYPE_INODE:
1772 			wipdata = &hammer2_chain_wdata(chain)->ipdata;
1773 			KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1774 			KKASSERT(bp->b_loffset == 0);
1775 			bcopy(bp->b_data, wipdata->u.data,
1776 			      HAMMER2_EMBEDDED_BYTES);
1777 			error = 0;
1778 			break;
1779 		case HAMMER2_BREF_TYPE_DATA:
1780 			error = hammer2_io_newnz(chain->hmp,
1781 						 chain->bref.data_off,
1782 						 chain->bytes, &dio);
1783 			if (error) {
1784 				hammer2_io_bqrelse(&dio);
1785 				kprintf("hammer2: WRITE PATH: "
1786 					"dbp bread error\n");
1787 				break;
1788 			}
1789 			bdata = hammer2_io_data(dio, chain->bref.data_off);
1790 
1791 			chain->bref.methods = HAMMER2_ENC_COMP(
1792 							HAMMER2_COMP_NONE) +
1793 					      HAMMER2_ENC_CHECK(check_algo);
1794 			bcopy(bp->b_data, bdata, chain->bytes);
1795 
1796 			/*
1797 			 * The flush code doesn't calculate check codes for
1798 			 * file data (doing so can result in excessive I/O),
1799 			 * so we do it here.
1800 			 */
1801 			hammer2_chain_setcheck(chain, bdata);
1802 
1803 			/*
1804 			 * Device buffer is now valid, chain is no longer in
1805 			 * the initial state.
1806 			 *
1807 			 * (No blockref table worries with file data)
1808 			 */
1809 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1810 
1811 			if (ioflag & IO_SYNC) {
1812 				/*
1813 				 * Synchronous I/O requested.
1814 				 */
1815 				hammer2_io_bwrite(&dio);
1816 			/*
1817 			} else if ((ioflag & IO_DIRECT) &&
1818 				   loff + n == pblksize) {
1819 				hammer2_io_bdwrite(&dio);
1820 			*/
1821 			} else if (ioflag & IO_ASYNC) {
1822 				hammer2_io_bawrite(&dio);
1823 			} else {
1824 				hammer2_io_bdwrite(&dio);
1825 			}
1826 			break;
1827 		default:
1828 			panic("hammer2_write_bp: bad chain type %d\n",
1829 			      chain->bref.type);
1830 			/* NOT REACHED */
1831 			error = 0;
1832 			break;
1833 		}
1834 		KKASSERT(error == 0);	/* XXX TODO */
1835 	}
1836 	*errorp = error;
1837 }
1838 
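/*
 * Remount support.  If a read-only mount is being updated to read-write
 * (MNTK_WANTRDWR), run the mount-time recovery pass.
 */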
1839 static
1840 int
1841 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path,
1842 		struct vnode *devvp, struct ucred *cred)
1843 {
1844 	int error;
1845 
1846 	if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1847 		error = hammer2_recovery(hmp);
1848 	} else {
1849 		error = 0;
1850 	}
1851 	return error;
1852 }
1853 
1854 static
1855 int
1856 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1857 {
1858 	hammer2_pfs_t *pmp;
1859 	int flags;
1860 	int error = 0;
1861 
1862 	pmp = MPTOPMP(mp);
1863 
1864 	if (pmp == NULL)
1865 		return(0);
1866 
1867 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1868 
1869 	/*
1870 	 * If mount initialization proceeded far enough we must flush
1871 	 * its vnodes and sync the underlying mount points.  Three syncs
1872 	 * are required to fully flush the filesystem (freemap updates lag
1873 	 * by one flush, and one extra for safety).
1874 	 */
1875 	if (mntflags & MNT_FORCE)
1876 		flags = FORCECLOSE;
1877 	else
1878 		flags = 0;
1879 	if (pmp->iroot) {
1880 		error = vflush(mp, 0, flags);
1881 		if (error)
1882 			goto failed;
1883 		hammer2_vfs_sync(mp, MNT_WAIT);
1884 		hammer2_vfs_sync(mp, MNT_WAIT);
1885 		hammer2_vfs_sync(mp, MNT_WAIT);
1886 	}
1887 
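	/*
	 * Shut down the PFS write thread if one exists.  Signal it via
	 * wthread_destroy, wake it up, and wait for it to acknowledge by
	 * setting wthread_destroy to -1.
	 */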
1888 	if (pmp->wthread_td) {
1889 		hammer2_mtx_ex(&pmp->wthread_mtx);
1890 		pmp->wthread_destroy = 1;
1891 		wakeup(&pmp->wthread_bioq);
1892 		while (pmp->wthread_destroy != -1) {
1893 			mtxsleep(&pmp->wthread_destroy,
1894 				&pmp->wthread_mtx, 0,
1895 				"umount-sleep",	0);
1896 		}
1897 		hammer2_mtx_unlock(&pmp->wthread_mtx);
1898 		pmp->wthread_td = NULL;
1899 	}
1900 
1901 	/*
1902 	 * Cleanup our reference on ihidden.
1903 	 */
1904 	if (pmp->ihidden) {
1905 		hammer2_inode_drop(pmp->ihidden);
1906 		pmp->ihidden = NULL;
1907 	}
1908 	if (pmp->mp)
1909 		hammer2_unmount_helper(mp, pmp, NULL);
1910 
1911 	error = 0;
1912 failed:
1913 	lockmgr(&hammer2_mntlk, LK_RELEASE);
1914 
1915 	return (error);
1916 }
1917 
1918 /*
1919  * Mount helper, hook the system mount into our PFS.
1920  * The mount lock is held.
1921  *
1922  * We must bump the mount_count on related devices for any
1923  * mounted PFSs.
1924  */
1925 static
1926 void
1927 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1928 {
1929 	hammer2_cluster_t *cluster;
1930 	hammer2_chain_t *rchain;
1931 	int i;
1932 
1933         mp->mnt_data = (qaddr_t)pmp;
1934 	pmp->mp = mp;
1935 
1936 	/*
1937 	 * After pmp->mp is set we have to adjust hmp->mount_count.
1938 	 */
1939 	cluster = &pmp->iroot->cluster;
1940 	for (i = 0; i < cluster->nchains; ++i) {
1941 		rchain = cluster->array[i].chain;
1942 		if (rchain == NULL)
1943 			continue;
1944 		++rchain->hmp->mount_count;
1945 		kprintf("hammer2_mount hmp=%p ++mount_count=%d\n",
1946 			rchain->hmp, rchain->hmp->mount_count);
1947 	}
1948 }
1949 
1950 /*
1951  * Mount helper, unhook the system mount from our PFS.
1952  * The mount lock is held.
1953  *
1954  * If hmp is supplied, the mount responsible for being the first to open
1955  * the block device failed, and the block device and all PFSs using the
1956  * block device must be cleaned up.
1957  *
1958  * If pmp is supplied, multiple devices might be backing the PFS and each
1959  * must be disconnected.  This might not be the last PFS using some of the
1960  * underlying devices.  Also, we have to adjust our hmp->mount_count
1961  * accounting for the devices backing the pmp which is now undergoing an
1962  * unmount.
1963  */
1964 static
1965 void
1966 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1967 {
1968 	hammer2_cluster_t *cluster;
1969 	hammer2_chain_t *rchain;
1970 	struct vnode *devvp;
1971 	int dumpcnt;
1972 	int ronly = 0;
1973 	int i;
1974 
1975 	/*
1976 	 * If no device is supplied this is a high-level unmount and we have
1977 	 * to disconnect the mount, adjust mount_count, and locate devices
1978 	 * that might now have no mounts.
1979 	 */
1980 	if (pmp) {
1981 		KKASSERT(hmp == NULL);
1982 		KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
1983 		pmp->mp = NULL;
1984 		mp->mnt_data = NULL;
1985 
1986 		/*
1987 		 * After pmp->mp is cleared we have to account for
1988 		 * mount_count.
1989 		 */
1990 		cluster = &pmp->iroot->cluster;
1991 		for (i = 0; i < cluster->nchains; ++i) {
1992 			rchain = cluster->array[i].chain;
1993 			if (rchain == NULL)
1994 				continue;
1995 			--rchain->hmp->mount_count;
1996 			kprintf("hammer2_unmount hmp=%p --mount_count=%d\n",
1997 				rchain->hmp, rchain->hmp->mount_count);
1998 			/* scrapping hmp now may invalidate the pmp */
1999 		}
2000 again:
2001 		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
2002 			if (hmp->mount_count == 0) {
2003 				hammer2_unmount_helper(NULL, NULL, hmp);
2004 				goto again;
2005 			}
2006 		}
2007 		return;
2008 	}
2009 
2010 	/*
2011 	 * Try to terminate the block device.  We can't terminate it if
2012 	 * there are still PFSs referencing it.
2013 	 */
2014 	kprintf("hammer2_unmount hmp=%p mount_count=%d\n",
2015 		hmp, hmp->mount_count);
2016 	if (hmp->mount_count)
2017 		return;
2018 
2019 	hammer2_pfsfree_scan(hmp);
2020 	hammer2_dev_exlock(hmp);	/* XXX order */
2021 
2022 	/*
2023 	 * Cycle the volume data lock as a safety (probably not needed any
2024 	 * more).  To ensure everything is out we need to flush at least
2025 	 * three times.  (1) The running of the unlinkq can dirty the
2026 	 * three times: (1) the running of the unlinkq can dirty the
2027 	 * filesystem, (2) a normal flush can dirty the freemap, and
2028 	 * (3) a final flush ensures the freemap itself is fully synchronized.
2029 	 * The next mount's recovery scan can clean everything up but we want
2030 	 * to leave the filesystem in a 100% clean state on a normal unmount.
2031 	 */
2032 #if 0
2033 	hammer2_voldata_lock(hmp);
2034 	hammer2_voldata_unlock(hmp);
2035 #endif
2036 	hammer2_iocom_uninit(hmp);
2037 
2038 	if ((hmp->vchain.flags | hmp->fchain.flags) &
2039 	    HAMMER2_CHAIN_FLUSH_MASK) {
2040 		kprintf("hammer2_unmount: chains left over "
2041 			"after final sync\n");
2042 		kprintf("    vchain %08x\n", hmp->vchain.flags);
2043 		kprintf("    fchain %08x\n", hmp->fchain.flags);
2044 
2045 		if (hammer2_debug & 0x0010)
2046 			Debugger("entered debugger");
2047 	}
2048 
2049 	KKASSERT(hmp->spmp == NULL);
2050 
2051 	/*
2052 	 * Finish up with the device vnode
2053 	 */
2054 	if ((devvp = hmp->devvp) != NULL) {
2055 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2056 		vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
2057 		hmp->devvp = NULL;
2058 		VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
2059 		vn_unlock(devvp);
2060 		vrele(devvp);
2061 		devvp = NULL;
2062 	}
2063 
2064 	/*
2065 	 * Clear vchain/fchain flags that might prevent final cleanup
2066 	 * of these chains.
2067 	 */
2068 	if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
2069 		atomic_clear_int(&hmp->vchain.flags,
2070 				 HAMMER2_CHAIN_MODIFIED);
2071 		hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
2072 		hammer2_chain_drop(&hmp->vchain);
2073 	}
2074 	if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
2075 		atomic_clear_int(&hmp->vchain.flags,
2076 				 HAMMER2_CHAIN_UPDATE);
2077 		hammer2_chain_drop(&hmp->vchain);
2078 	}
2079 
2080 	if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
2081 		atomic_clear_int(&hmp->fchain.flags,
2082 				 HAMMER2_CHAIN_MODIFIED);
2083 		hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
2084 		hammer2_chain_drop(&hmp->fchain);
2085 	}
2086 	if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
2087 		atomic_clear_int(&hmp->fchain.flags,
2088 				 HAMMER2_CHAIN_UPDATE);
2089 		hammer2_chain_drop(&hmp->fchain);
2090 	}
2091 
2092 	/*
2093 	 * Final drop of embedded freemap root chain to
2094 	 * clean up fchain.core (fchain structure is not
2095 	 * flagged ALLOCATED so it is cleaned out and then
2096 	 * left to rot).
2097 	 */
2098 	hammer2_chain_drop(&hmp->fchain);
2099 
2100 	/*
2101 	 * Final drop of embedded volume root chain to clean
2102 	 * up vchain.core (vchain structure is not flagged
2103 	 * ALLOCATED so it is cleaned out and then left to
2104 	 * rot).
2105 	 */
2106 	dumpcnt = 50;
2107 	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
2108 	dumpcnt = 50;
2109 	hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
2110 	hammer2_dev_unlock(hmp);
2111 	hammer2_chain_drop(&hmp->vchain);
2112 
2113 	hammer2_io_cleanup(hmp, &hmp->iotree);
2114 	if (hmp->iofree_count) {
2115 		kprintf("io_cleanup: %d I/O's left hanging\n",
2116 			hmp->iofree_count);
2117 	}
2118 
2119 	TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
2120 	kmalloc_destroy(&hmp->mchain);
2121 	kfree(hmp, M_HAMMER2);
2122 }
2123 
2124 static
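/*
 * VFS_VGET is not currently supported by hammer2.
 */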
2125 int
2126 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
2127 	     ino_t ino, struct vnode **vpp)
2128 {
2129 	kprintf("hammer2_vget\n");
2130 	return (EOPNOTSUPP);
2131 }
2132 
2133 static
2134 int
2135 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
2136 {
2137 	hammer2_pfs_t *pmp;
2138 	hammer2_cluster_t *cparent;
2139 	int error;
2140 	struct vnode *vp;
2141 
2142 	pmp = MPTOPMP(mp);
2143 	if (pmp->iroot == NULL) {
2144 		*vpp = NULL;
2145 		error = EINVAL;
2146 	} else {
2147 		cparent = hammer2_inode_lock(pmp->iroot,
2148 						HAMMER2_RESOLVE_ALWAYS |
2149 					        HAMMER2_RESOLVE_SHARED);
2150 		vp = hammer2_igetv(pmp->iroot, cparent, &error);
2151 		hammer2_inode_unlock(pmp->iroot, cparent);
2152 		*vpp = vp;
2153 		if (vp == NULL)
2154 			kprintf("vnodefail\n");
2155 	}
2156 
2157 	return (error);
2158 }
2159 
2160 /*
2161  * Filesystem status
2162  *
2163  * XXX incorporate ipdata->inode_quota and data_quota
2164  */
2165 static
2166 int
2167 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2168 {
2169 	hammer2_pfs_t *pmp;
2170 	hammer2_dev_t *hmp;
2171 
2172 	pmp = MPTOPMP(mp);
2173 	KKASSERT(pmp->iroot->cluster.nchains >= 1);
2174 	hmp = pmp->iroot->cluster.focus->hmp;	/* XXX */
2175 
2176 	mp->mnt_stat.f_files = pmp->inode_count;
2177 	mp->mnt_stat.f_ffree = 0;
2178 	mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2179 	mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2180 	mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
2181 
2182 	*sbp = mp->mnt_stat;
2183 	return (0);
2184 }
2185 
2186 static
2187 int
2188 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2189 {
2190 	hammer2_pfs_t *pmp;
2191 	hammer2_dev_t *hmp;
2192 
2193 	pmp = MPTOPMP(mp);
2194 	KKASSERT(pmp->iroot->cluster.nchains >= 1);
2195 	hmp = pmp->iroot->cluster.focus->hmp;	/* XXX */
2196 
2197 	mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
2198 	mp->mnt_vstat.f_files = pmp->inode_count;
2199 	mp->mnt_vstat.f_ffree = 0;
2200 	mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2201 	mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2202 	mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
2203 
2204 	*sbp = mp->mnt_vstat;
2205 	return (0);
2206 }
2207 
2208 /*
2209  * Mount-time recovery (RW mounts)
2210  *
2211  * Updates to the free block table are allowed to lag flushes by one
2212  * transaction.  In case of a crash, then on a fresh mount we must do an
2213  * incremental scan of the last committed transaction id and make sure that
2214  * all related blocks have been marked allocated.
2215  *
2216  * The super-root topology and each PFS have their own transaction id domains,
2217  * so we must track PFS boundary transitions.
2218  */
2219 struct hammer2_recovery_elm {
2220 	TAILQ_ENTRY(hammer2_recovery_elm) entry;
2221 	hammer2_chain_t *chain;
2222 	hammer2_tid_t sync_tid;
2223 };
2224 
2225 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2226 
2227 struct hammer2_recovery_info {
2228 	struct hammer2_recovery_list list;
2229 	int	depth;
2230 };
2231 
2232 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2233 			hammer2_chain_t *parent,
2234 			struct hammer2_recovery_info *info,
2235 			hammer2_tid_t sync_tid);
2236 
2237 #define HAMMER2_RECOVERY_MAXDEPTH	10
2238 
2239 static
2240 int
2241 hammer2_recovery(hammer2_dev_t *hmp)
2242 {
2243 	hammer2_trans_t trans;
2244 	struct hammer2_recovery_info info;
2245 	struct hammer2_recovery_elm *elm;
2246 	hammer2_chain_t *parent;
2247 	hammer2_tid_t sync_tid;
2248 	hammer2_tid_t mirror_tid;
2249 	int error;
2250 	int cumulative_error = 0;
2251 
2252 	hammer2_trans_init(&trans, hmp->spmp, 0);
2253 
2254 	sync_tid = hmp->voldata.freemap_tid;
2255 	mirror_tid = hmp->voldata.mirror_tid;
2256 
2257 	kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2258 	if (sync_tid >= mirror_tid) {
2259 		kprintf(" no recovery needed\n");
2260 	} else {
2261 		kprintf(" freemap recovery %016jx-%016jx\n",
2262 			sync_tid + 1, mirror_tid);
2263 	}
2264 
2265 	TAILQ_INIT(&info.list);
2266 	info.depth = 0;
2267 	parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2268 	cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
2269 						 &info, sync_tid);
2270 	hammer2_chain_lookup_done(parent);
2271 
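	/*
	 * Process parents deferred by the recovery scan when the recursion
	 * depth limit was hit.  Each deferred chain is re-locked and
	 * rescanned, and may in turn defer deeper chains onto the list.
	 */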
2272 	while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2273 		TAILQ_REMOVE(&info.list, elm, entry);
2274 		parent = elm->chain;
2275 		sync_tid = elm->sync_tid;
2276 		kfree(elm, M_HAMMER2);
2277 
2278 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2279 		error = hammer2_recovery_scan(&trans, hmp, parent,
2280 					      &info,
2281 					      hmp->voldata.freemap_tid);
2282 		hammer2_chain_unlock(parent);
2283 		hammer2_chain_drop(parent);	/* drop elm->chain ref */
2284 		if (error)
2285 			cumulative_error = error;
2286 	}
2287 	hammer2_trans_done(&trans);
2288 
2289 	return cumulative_error;
2290 }
2291 
2292 static
2293 int
2294 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2295 		      hammer2_chain_t *parent,
2296 		      struct hammer2_recovery_info *info,
2297 		      hammer2_tid_t sync_tid)
2298 {
2299 	const hammer2_inode_data_t *ripdata;
2300 	hammer2_chain_t *chain;
2301 	int cache_index;
2302 	int cumulative_error = 0;
2303 	int error;
2304 
2305 	/*
2306 	 * Adjust freemap to ensure that the block(s) are marked allocated.
2307 	 */
2308 	if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2309 		hammer2_freemap_adjust(trans, hmp, &parent->bref,
2310 				       HAMMER2_FREEMAP_DORECOVER);
2311 	}
2312 
2313 	/*
2314 	 * Check type for recursive scan
2315 	 */
2316 	switch(parent->bref.type) {
2317 	case HAMMER2_BREF_TYPE_VOLUME:
2318 		/* data already instantiated */
2319 		break;
2320 	case HAMMER2_BREF_TYPE_INODE:
2321 		/*
2322 		 * Must instantiate data for DIRECTDATA test and also
2323 		 * for recursion.
2324 		 */
2325 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2326 		ripdata = &hammer2_chain_rdata(parent)->ipdata;
2327 		if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2328 			/* not applicable to recovery scan */
2329 			hammer2_chain_unlock(parent);
2330 			return 0;
2331 		}
2332 		hammer2_chain_unlock(parent);
2333 		break;
2334 	case HAMMER2_BREF_TYPE_INDIRECT:
2335 		/*
2336 		 * Must instantiate data for recursion
2337 		 */
2338 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2339 		hammer2_chain_unlock(parent);
2340 		break;
2341 	case HAMMER2_BREF_TYPE_DATA:
2342 	case HAMMER2_BREF_TYPE_FREEMAP:
2343 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2344 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2345 		/* not applicable to recovery scan */
2346 		return 0;
2347 		break;
2348 	default:
2349 		return EDOM;
2350 	}
2351 
2352 	/*
2353 	 * Defer operation if depth limit reached or if we are crossing a
2354 	 * PFS boundary.
2355 	 */
2356 	if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2357 		struct hammer2_recovery_elm *elm;
2358 
2359 		elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2360 		elm->chain = parent;
2361 		elm->sync_tid = sync_tid;
2362 		hammer2_chain_ref(parent);
2363 		TAILQ_INSERT_TAIL(&info->list, elm, entry);
2364 		/* unlocked by caller */
2365 
2366 		return(0);
2367 	}
2368 
2369 
2370 	/*
2371 	 * Recursive scan of the last flushed transaction only.  We are
2372 	 * doing this without pmp assignments so don't leave the chains
2373 	 * hanging around after we are done with them.
2374 	 */
2375 	cache_index = 0;
2376 	chain = hammer2_chain_scan(parent, NULL, &cache_index,
2377 				   HAMMER2_LOOKUP_NODATA);
2378 	while (chain) {
2379 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2380 		if (chain->bref.mirror_tid > sync_tid) {
2381 			++info->depth;
2382 			error = hammer2_recovery_scan(trans, hmp, chain,
2383 						      info, sync_tid);
2384 			--info->depth;
2385 			if (error)
2386 				cumulative_error = error;
2387 		}
2388 		chain = hammer2_chain_scan(parent, chain, &cache_index,
2389 					   HAMMER2_LOOKUP_NODATA);
2390 	}
2391 
2392 	return cumulative_error;
2393 }
2394 
2395 /*
2396  * Sync the entire filesystem; this is called from the filesystem syncer
2397  * process periodically and whenever a user calls sync(1) on the hammer
2398  * mountpoint.
2399  *
2400  * Currently this is actually called from the syncer! \o/
2401  *
2402  * This task will have to snapshot the state of the dirty inode chain.
2403  * From that, it will have to make sure all of the inodes on the dirty
2404  * chain have I/O initiated.  We make sure that I/O is initiated for the
2405  * root block.
2406  *
2407  * If waitfor is set, we wait for media to acknowledge the new rootblock.
2408  *
2409  * THINKS: side A vs side B, to have sync not stall all I/O?
2410  */
2411 int
2412 hammer2_vfs_sync(struct mount *mp, int waitfor)
2413 {
2414 	struct hammer2_sync_info info;
2415 	hammer2_inode_t *iroot;
2416 	hammer2_chain_t *chain;
2417 	hammer2_chain_t *parent;
2418 	hammer2_pfs_t *pmp;
2419 	hammer2_dev_t *hmp;
2420 	int flags;
2421 	int error;
2422 	int total_error;
2423 	int force_fchain;
2424 	int i;
2425 	int j;
2426 
2427 	pmp = MPTOPMP(mp);
2428 	iroot = pmp->iroot;
2429 	KKASSERT(iroot);
2430 	KKASSERT(iroot->pmp == pmp);
2431 
2432 	/*
2433 	 * We can't acquire locks on existing vnodes while in a transaction
2434 	 * without risking a deadlock.  This assumes that vfsync() can be
2435 	 * called without the vnode locked (which it can in DragonFly).
2436 	 * Otherwise we'd have to implement a multi-pass or flag the lock
2437 	 * failures and retry.
2438 	 *
2439 	 * The reclamation code interlocks with the sync list's token
2440 	 * (by removing the vnode from the scan list) before unlocking
2441 	 * the inode, giving us time to ref the inode.
2442 	 */
2443 	/*flags = VMSC_GETVP;*/
2444 	flags = 0;
2445 	if (waitfor & MNT_LAZY)
2446 		flags |= VMSC_ONEPASS;
2447 
2448 	/*
2449 	 * Start our flush transaction.  This does not return until all
2450 	 * concurrent transactions have completed and will prevent any
2451 	 * new transactions from running concurrently, except for the
2452 	 * buffer cache transactions.
2453 	 *
2454 	 * For efficiency do an async pass before making sure with a
2455 	 * synchronous pass on all related buffer cache buffers.  It
2456 	 * should theoretically not be possible for any new file buffers
2457 	 * to be instantiated during this sequence.
2458 	 */
2459 	hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2460 					     HAMMER2_TRANS_PREFLUSH);
2461 	hammer2_run_unlinkq(&info.trans, pmp);
2462 
2463 	info.error = 0;
2464 	info.waitfor = MNT_NOWAIT;
2465 	vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2466 	info.waitfor = MNT_WAIT;
2467 	vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2468 
2469 	/*
2470 	 * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2471 	 * buffer cache flushes which occur during the flush.  Device buffers
2472 	 * are not affected.
2473 	 */
2474 
2475 #if 0
2476 	if (info.error == 0 && (waitfor & MNT_WAIT)) {
2477 		info.waitfor = waitfor;
2478 		    vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2479 
2480 	}
2481 #endif
2482 	hammer2_bioq_sync(info.trans.pmp);
2483 	atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2484 
2485 	total_error = 0;
2486 
2487 #if 0
2488 	/*
2489 	 * Flush all nodes making up the cluster
2490 	 *
2491 	 * We must also flush any deleted siblings because the super-root
2492 	 * flush won't do it for us.  They all must be staged or the
2493 	 * super-root flush will not be able to update its block table
2494 	 * properly.
2495 	 *
2496 	 * XXX currently done serially instead of concurrently
2497 	 */
2498 	for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2499 		chain = iroot->cluster.array[i].chain;
2500 		if (chain) {
2501 			hmp = chain->hmp;
2502 			hammer2_chain_ref(chain);    /* prevent destruction */
2503 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2504 			hammer2_flush(&info.trans, chain);
2505 			hammer2_chain_unlock(chain);
2506 			hammer2_chain_drop(chain);
2507 		}
2508 	}
2509 #endif
2510 #if 0
2511 	hammer2_trans_done(&info.trans);
2512 #endif
2513 
2514 	/*
2515 	 * Flush all volume roots to synchronize PFS flushes with the
2516 	 * storage media.  Use a super-root transaction for each one.
2517 	 *
2518 	 * The flush code will detect super-root -> pfs-root chain
2519 	 * transitions using the last pfs-root flush.
2520 	 */
2521 	for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2522 		hammer2_chain_t *tmp;
2523 
2524 		chain = iroot->cluster.array[i].chain;
2525 		if (chain == NULL)
2526 			continue;
2527 
2528 		hmp = chain->hmp;
2529 
2530 		/*
2531 		 * We only have to flush each hmp once
2532 		 */
2533 		for (j = i - 1; j >= 0; --j) {
2534 			if ((tmp = iroot->cluster.array[j].chain) != NULL) {
2535 				if (tmp->hmp == hmp)
2536 					break;
2537 			}
2538 		}
2539 		if (j >= 0)
2540 			continue;
2541 #if 0
2542 		hammer2_trans_spmp(&info.trans, hmp->spmp);
2543 #endif
2544 
2545 		/*
2546 		 * Force an update of the XID from the PFS root to the
2547 		 * topology root.  We couldn't do this from the PFS
2548 		 * transaction because a SPMP transaction is needed.
2549 		 * This does not modify blocks, instead what it does is
2550 		 * allow the flush code to find the transition point and
2551 		 * then update on the way back up.
2552 		 */
2553 		parent = chain->parent;
2554 		KKASSERT(chain->pmp != parent->pmp);
2555 		hammer2_chain_setflush(&info.trans, parent);
2556 
2557 		/*
2558 		 * Media mounts have two 'roots', vchain for the topology
2559 		 * and fchain for the free block table.  Flush both.
2560 		 *
2561 		 * Note that the topology and free block table are handled
2562 		 * independently, so the free block table can wind up being
2563 		 * ahead of the topology.  We depend on the bulk free scan
2564 		 * code to deal with any loose ends.
2565 		 */
2566 		hammer2_chain_ref(&hmp->vchain);
2567 		hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2568 		hammer2_chain_ref(&hmp->fchain);
2569 		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2570 		if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2571 			/*
2572 			 * This will also modify vchain as a side effect,
2573 			 * mark vchain as modified now.
2574 			 */
2575 			hammer2_voldata_modify(hmp);
2576 			chain = &hmp->fchain;
2577 			hammer2_flush(&info.trans, chain);
2578 			KKASSERT(chain == &hmp->fchain);
2579 		}
2580 		hammer2_chain_unlock(&hmp->fchain);
2581 		hammer2_chain_unlock(&hmp->vchain);
2582 		hammer2_chain_drop(&hmp->fchain);
2583 		/* vchain dropped down below */
2584 
2585 		hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2586 		if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2587 			chain = &hmp->vchain;
2588 			hammer2_flush(&info.trans, chain);
2589 			KKASSERT(chain == &hmp->vchain);
2590 			force_fchain = 1;
2591 		} else {
2592 			force_fchain = 0;
2593 		}
2594 		hammer2_chain_unlock(&hmp->vchain);
2595 		hammer2_chain_drop(&hmp->vchain);
2596 
2597 #if 0
2598 		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2599 		if ((hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) ||
2600 		    force_fchain) {
2601 			/* this will also modify vchain as a side effect */
2602 			chain = &hmp->fchain;
2603 			hammer2_flush(&info.trans, chain);
2604 			KKASSERT(chain == &hmp->fchain);
2605 		}
2606 		hammer2_chain_unlock(&hmp->fchain);
2607 #endif
2608 
2609 		error = 0;
2610 
2611 		/*
2612 		 * We can't safely flush the volume header until we have
2613 		 * flushed any device buffers which have built up.
2614 		 *
2615 		 * XXX this isn't being incremental
2616 		 */
2617 		vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2618 		error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2619 		vn_unlock(hmp->devvp);
2620 
2621 		/*
2622 		 * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2623 		 * volume header needs synchronization via hmp->volsync.
2624 		 *
2625 		 * XXX synchronize the flag & data with only this flush XXX
2626 		 */
2627 		if (error == 0 &&
2628 		    (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2629 			struct buf *bp;
2630 
2631 			/*
2632 			 * Synchronize the disk before flushing the volume
2633 			 * header.
2634 			 */
2635 			bp = getpbuf(NULL);
2636 			bp->b_bio1.bio_offset = 0;
2637 			bp->b_bufsize = 0;
2638 			bp->b_bcount = 0;
2639 			bp->b_cmd = BUF_CMD_FLUSH;
2640 			bp->b_bio1.bio_done = biodone_sync;
2641 			bp->b_bio1.bio_flags |= BIO_SYNC;
2642 			vn_strategy(hmp->devvp, &bp->b_bio1);
2643 			biowait(&bp->b_bio1, "h2vol");
2644 			relpbuf(bp, NULL);
2645 
2646 			/*
2647 			 * Then we can safely flush the version of the
2648 			 * volume header synchronized by the flush code.
2649 			 */
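			/*
			 * Rotate to the next of the (up to) HAMMER2_NUM_VOLHDRS
			 * volume header slots, wrapping around and falling back
			 * to slot 0 if the next slot would lie beyond the end
			 * of a small volume.
			 */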
2650 			i = hmp->volhdrno + 1;
2651 			if (i >= HAMMER2_NUM_VOLHDRS)
2652 				i = 0;
2653 			if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2654 			    hmp->volsync.volu_size) {
2655 				i = 0;
2656 			}
2657 			kprintf("sync volhdr %d %jd\n",
2658 				i, (intmax_t)hmp->volsync.volu_size);
2659 			bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2660 				    HAMMER2_PBUFSIZE, 0, 0);
2661 			atomic_clear_int(&hmp->vchain.flags,
2662 					 HAMMER2_CHAIN_VOLUMESYNC);
2663 			bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2664 			bawrite(bp);
2665 			hmp->volhdrno = i;
2666 		}
2667 		if (error)
2668 			total_error = error;
2669 
2670 #if 0
2671 		hammer2_trans_done(&info.trans);
2672 #endif
2673 	}
2674 	hammer2_trans_done(&info.trans);
2675 
2676 	return (total_error);
2677 }
2678 
2679 /*
2680  * Sync passes.
2681  */
2682 static int
2683 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2684 {
2685 	struct hammer2_sync_info *info = data;
2686 	hammer2_inode_t *ip;
2687 	int error;
2688 
2689 	/*
2690 	 * Filter out vnodes with no inode, dead vnodes, and clean inodes.
2691 	 */
2692 	ip = VTOI(vp);
2693 	if (ip == NULL)
2694 		return(0);
2695 	if (vp->v_type == VNON || vp->v_type == VBAD) {
2696 		vclrisdirty(vp);
2697 		return(0);
2698 	}
2699 	if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2700 	    RB_EMPTY(&vp->v_rbdirty_tree)) {
2701 		vclrisdirty(vp);
2702 		return(0);
2703 	}
2704 
2705 	/*
2706 	 * VOP_FSYNC will start a new transaction so replicate some code
2707 	 * here to do it inline (see hammer2_vop_fsync()).
2708 	 *
2709 	 * WARNING: The vfsync interacts with the buffer cache and might
2710 	 *          block, we can't hold the inode lock at that time.
2711 	 *	    However, we MUST ref ip before blocking to ensure that
2712 	 *	    it isn't ripped out from under us (since we do not
2713 	 *	    hold a lock on the vnode).
2714 	 */
2715 	hammer2_inode_ref(ip);
2716 	atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
2717 	if (vp)
2718 		vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
2719 
2720 	hammer2_inode_drop(ip);
2721 #if 1
2722 	error = 0;
2723 	if (error)
2724 		info->error = error;
2725 #endif
2726 	return(0);
2727 }
2728 
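/*
 * NFS export related entry points.  These are currently trivial stubs
 * that report success without doing anything.
 */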
2729 static
2730 int
2731 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2732 {
2733 	return (0);
2734 }
2735 
2736 static
2737 int
2738 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2739 	       struct fid *fhp, struct vnode **vpp)
2740 {
2741 	return (0);
2742 }
2743 
2744 static
2745 int
2746 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2747 		 int *exflagsp, struct ucred **credanonp)
2748 {
2749 	return (0);
2750 }
2751 
2752 /*
2753  * Support code for hammer2_vfs_mount().  Read, verify, and install the volume
2754  * header into the HMP
2755  *
2756  * XXX read four volhdrs and use the one with the highest TID whose CRC
2757  *     matches.
2758  *
2759  * XXX check iCRCs.
2760  *
2761  * XXX For filesystems w/ fewer than 4 volhdrs, make sure not to write to
2762  *     nonexistent locations.
2763  *
2764  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2765  */
2766 static
2767 int
2768 hammer2_install_volume_header(hammer2_dev_t *hmp)
2769 {
2770 	hammer2_volume_data_t *vd;
2771 	struct buf *bp;
2772 	hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2773 	int error_reported;
2774 	int error;
2775 	int valid;
2776 	int i;
2777 
2778 	error_reported = 0;
2779 	error = 0;
2780 	valid = 0;
2781 	bp = NULL;
2782 
2783 	/*
2784 	 * There are up to 4 copies of the volume header (syncs iterate
2785 	 * between them so there is no single master).  We don't trust the
2786 	 * volu_size field so we don't know precisely how large the filesystem
2787 	 * volu_size field, so we don't know precisely how large the filesystem
2788 	 * is; instead we depend on the OS to return an error if we go beyond the
2789 	 */
2790 	for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2791 		error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2792 			      HAMMER2_VOLUME_BYTES, &bp);
2793 		if (error) {
2794 			brelse(bp);
2795 			bp = NULL;
2796 			continue;
2797 		}
2798 
2799 		vd = (struct hammer2_volume_data *) bp->b_data;
2800 		if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2801 		    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2802 			brelse(bp);
2803 			bp = NULL;
2804 			continue;
2805 		}
2806 
2807 		if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2808 			/* XXX: Reversed-endianness filesystem */
2809 			kprintf("hammer2: reverse-endian filesystem detected\n");
2810 			brelse(bp);
2811 			bp = NULL;
2812 			continue;
2813 		}
2814 
2815 		crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2816 		crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2817 				      HAMMER2_VOLUME_ICRC0_SIZE);
2818 		bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2819 		bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2820 				       HAMMER2_VOLUME_ICRC1_SIZE);
2821 		if ((crc0 != crc) || (bcrc0 != bcrc)) {
2822 			kprintf("hammer2 volume header crc "
2823 				"mismatch copy #%d %08x/%08x\n",
2824 				i, crc0, crc);
2825 			error_reported = 1;
2826 			brelse(bp);
2827 			bp = NULL;
2828 			continue;
2829 		}
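		/*
		 * Among the copies that pass the CRC checks, keep the one
		 * with the highest mirror_tid.
		 */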
2830 		if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2831 			valid = 1;
2832 			hmp->voldata = *vd;
2833 			hmp->volhdrno = i;
2834 		}
2835 		brelse(bp);
2836 		bp = NULL;
2837 	}
2838 	if (valid) {
2839 		hmp->volsync = hmp->voldata;
2840 		error = 0;
2841 		if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2842 			kprintf("hammer2: using volume header #%d\n",
2843 				hmp->volhdrno);
2844 		}
2845 	} else {
2846 		error = EINVAL;
2847 		kprintf("hammer2: no valid volume headers found!\n");
2848 	}
2849 	return (error);
2850 }
2851 
2852 /*
2853  * This handles hysteresis on regular file flushes.  Because the BIOs are
2854  * routed to a thread it is possible for an excessive number to build up
2855  * and cause long front-end stalls long before the runningbuffspace limit
2856  * is hit, so we implement hammer2_flush_pipe to control the
2857  * hysteresis.
2858  *
2859  * This is a particular problem when compression is used.
2860  */
2861 void
2862 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
2863 {
2864 	atomic_add_int(&pmp->count_lwinprog, 1);
2865 }
2866 
2867 void
2868 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
2869 {
2870 	int lwinprog;
2871 
2872 	lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2873 	if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2874 	    (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2875 		atomic_clear_int(&pmp->count_lwinprog,
2876 				 HAMMER2_LWINPROG_WAITING);
2877 		wakeup(&pmp->count_lwinprog);
2878 	}
2879 }
2880 
2881 void
2882 hammer2_lwinprog_wait(hammer2_pfs_t *pmp)
2883 {
2884 	int lwinprog;
2885 
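	/*
	 * Standard interlocked sleep: sample the counter, register the
	 * sleep with tsleep_interlock(), set the WAITING flag, re-sample
	 * to close the race, then sleep until hammer2_lwinprog_drop()
	 * wakes us.
	 */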
2886 	for (;;) {
2887 		lwinprog = pmp->count_lwinprog;
2888 		cpu_ccfence();
2889 		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2890 			break;
2891 		tsleep_interlock(&pmp->count_lwinprog, 0);
2892 		atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2893 		lwinprog = pmp->count_lwinprog;
2894 		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2895 			break;
2896 		tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2897 	}
2898 }
2899 
2900 /*
2901  * Manage excessive memory resource use for chain and related
2902  * structures.
2903  */
2904 void
2905 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
2906 {
2907 	uint32_t waiting;
2908 	uint32_t count;
2909 	uint32_t limit;
2910 #if 0
2911 	static int zzticks;
2912 #endif
2913 
2914 	/*
2915 	 * Atomic check condition and wait.  Also do an early speedup of
2916 	 * the syncer to try to avoid hitting the wait.
2917 	 */
2918 	for (;;) {
2919 		waiting = pmp->inmem_dirty_chains;
2920 		cpu_ccfence();
2921 		count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2922 
2923 		limit = pmp->mp->mnt_nvnodelistsize / 10;
2924 		if (limit < hammer2_limit_dirty_chains)
2925 			limit = hammer2_limit_dirty_chains;
2926 		if (limit < 1000)
2927 			limit = 1000;
2928 
2929 #if 0
2930 		if ((int)(ticks - zzticks) > hz) {
2931 			zzticks = ticks;
2932 			kprintf("count %ld %ld\n", count, limit);
2933 		}
2934 #endif
2935 
2936 		/*
2937 		 * Block if there are too many dirty chains present, wait
2938 		 * for the flush to clean some out.
2939 		 */
2940 		if (count > limit) {
2941 			tsleep_interlock(&pmp->inmem_dirty_chains, 0);
2942 			if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2943 					       waiting,
2944 				       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
2945 				speedup_syncer(pmp->mp);
2946 				tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
2947 				       "chnmem", hz);
2948 			}
2949 			continue;	/* loop on success or fail */
2950 		}
2951 
2952 		/*
2953 		 * Try to start an early flush before we are forced to block.
2954 		 */
2955 		if (count > limit * 7 / 10)
2956 			speedup_syncer(pmp->mp);
2957 		break;
2958 	}
2959 }
2960 
2961 void
2962 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
2963 {
2964 	if (pmp) {
2965 		atomic_add_int(&pmp->inmem_dirty_chains, 1);
2966 	}
2967 }
2968 
2969 void
2970 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
2971 {
2972 	uint32_t waiting;
2973 
2974 	if (pmp == NULL)
2975 		return;
2976 
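	/*
	 * Atomically decrement the dirty chain count and clear the
	 * WAITING flag in a single cmpset, then wake any waiter that
	 * was flagged.
	 */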
2977 	for (;;) {
2978 		waiting = pmp->inmem_dirty_chains;
2979 		cpu_ccfence();
2980 		if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2981 				       waiting,
2982 				       (waiting - 1) &
2983 					~HAMMER2_DIRTYCHAIN_WAITING)) {
2984 			break;
2985 		}
2986 	}
2987 
2988 	if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
2989 		wakeup(&pmp->inmem_dirty_chains);
2990 }
2991 
2992 /*
2993  * Debugging
2994  */
2995 void
2996 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
2997 {
2998 	hammer2_chain_t *scan;
2999 	hammer2_chain_t *parent;
3000 
3001 	--*countp;
3002 	if (*countp == 0) {
3003 		kprintf("%*.*s...\n", tab, tab, "");
3004 		return;
3005 	}
3006 	if (*countp < 0)
3007 		return;
3008 	kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
3009 		tab, tab, "", pfx,
3010 		chain, chain->bref.type,
3011 		chain->bref.key, chain->bref.keybits,
3012 		chain->bref.mirror_tid);
3013 
3014 	kprintf("%*.*s      [%08x] (%s) refs=%d\n",
3015 		tab, tab, "",
3016 		chain->flags,
3017 		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3018 		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3019 		chain->refs);
3020 
3021 	kprintf("%*.*s      core [%08x]",
3022 		tab, tab, "",
3023 		chain->core.flags);
3024 
3025 	parent = chain->parent;
3026 	if (parent)
3027 		kprintf("\n%*.*s      p=%p [pflags %08x prefs %d",
3028 			tab, tab, "",
3029 			parent, parent->flags, parent->refs);
3030 	if (RB_EMPTY(&chain->core.rbtree)) {
3031 		kprintf("\n");
3032 	} else {
3033 		kprintf(" {\n");
3034 		RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
3035 			hammer2_dump_chain(scan, tab + 4, countp, 'a');
3036 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3037 			kprintf("%*.*s}(%s)\n", tab, tab, "",
3038 				chain->data->ipdata.filename);
3039 		else
3040 			kprintf("%*.*s}\n", tab, tab, "");
3041 	}
3042 }
3043