xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision a413fe45675e6d132bfaa7a0a089b5a6e670bb07)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54 
55 #include "hammer2.h"
56 
57 /*
58  * Returns TRUE if any chain in the cluster needs to be resized.
59  */
60 int
61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
62 {
63 	hammer2_chain_t *chain;
64 	int i;
65 
66 	for (i = 0; i < cluster->nchains; ++i) {
67 		chain = cluster->array[i].chain;
68 		if (chain && chain->bytes != bytes)
69 			return 1;
70 	}
71 	return 0;
72 }
73 
74 uint8_t
75 hammer2_cluster_type(hammer2_cluster_t *cluster)
76 {
77 	return(cluster->focus->bref.type);
78 }
79 
80 int
81 hammer2_cluster_modified(hammer2_cluster_t *cluster)
82 {
83 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
84 }
85 
86 /*
87  * Return a bref representative of the cluster.  Any data offset is removed
88  * (since it would only be applicable to a particular chain in the cluster).
89  *
90  * However, the radix portion of data_off is used for many purposes and will
91  * be retained.
92  */
93 void
94 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
95 {
96 	*bref = cluster->focus->bref;
97 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
98 }
99 
100 /*
101  * Return non-zero if the chain representing an inode has been flagged
102  * as having been unlinked.  Allows the vnode reclaim to avoid loading
103  * the inode data from disk e.g. when unmount or recycling old, clean
104  * vnodes.
105  */
106 int
107 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
108 {
109 	hammer2_chain_t *chain;
110 	int flags;
111 	int i;
112 
113 	flags = 0;
114 	for (i = 0; i < cluster->nchains; ++i) {
115 		chain = cluster->array[i].chain;
116 		if (chain)
117 			flags |= chain->flags;
118 	}
119 	return (flags & HAMMER2_CHAIN_UNLINKED);
120 }
121 
122 void
123 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
124 {
125 	hammer2_chain_t *chain;
126 	int i;
127 
128 	for (i = 0; i < cluster->nchains; ++i) {
129 		chain = cluster->array[i].chain;
130 		if (chain)
131 			atomic_set_int(&chain->flags, flags);
132 	}
133 }
134 
135 void
136 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
137 {
138 	hammer2_chain_t *chain;
139 	int i;
140 
141 	for (i = 0; i < cluster->nchains; ++i) {
142 		chain = cluster->array[i].chain;
143 		if (chain)
144 			atomic_clear_int(&chain->flags, flags);
145 	}
146 }
147 
148 void
149 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
150 {
151 	hammer2_chain_t *chain;
152 	int i;
153 
154 	for (i = 0; i < cluster->nchains; ++i) {
155 		chain = cluster->array[i].chain;
156 		if (chain)
157 			hammer2_chain_setflush(trans, chain);
158 	}
159 }
160 
161 void
162 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
163 				hammer2_cluster_t *cluster,
164 				int check_algo)
165 {
166 	hammer2_chain_t *chain;
167 	int i;
168 
169 	for (i = 0; i < cluster->nchains; ++i) {
170 		chain = cluster->array[i].chain;
171 		if (chain) {
172 			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
173 			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
174 			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
175 		}
176 	}
177 }
178 
179 /*
180  * Create a cluster with one ref from the specified chain.  The chain
181  * is not further referenced.  The caller typically supplies a locked
182  * chain and transfers ownership to the cluster.
183  */
184 hammer2_cluster_t *
185 hammer2_cluster_from_chain(hammer2_chain_t *chain)
186 {
187 	hammer2_cluster_t *cluster;
188 
189 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
190 	cluster->array[0].chain = chain;
191 	cluster->nchains = 1;
192 	cluster->focus = chain;
193 	cluster->pmp = chain->pmp;
194 	cluster->refs = 1;
195 
196 	return cluster;
197 }
198 
199 /*
200  * Allocates a cluster and its underlying chain structures.  The underlying
201  * chains will be locked.  The cluster and underlying chains will have one
202  * ref.
203  */
204 hammer2_cluster_t *
205 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
206 		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
207 {
208 	hammer2_cluster_t *cluster;
209 	hammer2_cluster_t *rcluster;
210 	hammer2_chain_t *chain;
211 	hammer2_chain_t *rchain;
212 #if 0
213 	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
214 #endif
215 	int i;
216 
217 	KKASSERT(pmp != NULL);
218 
219 	/*
220 	 * Construct the appropriate system structure.
221 	 */
222 	switch(bref->type) {
223 	case HAMMER2_BREF_TYPE_INODE:
224 	case HAMMER2_BREF_TYPE_INDIRECT:
225 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
226 	case HAMMER2_BREF_TYPE_DATA:
227 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
228 		/*
229 		 * Chain's are really only associated with the hmp but we
230 		 * maintain a pmp association for per-mount memory tracking
231 		 * purposes.  The pmp can be NULL.
232 		 */
233 		break;
234 	case HAMMER2_BREF_TYPE_VOLUME:
235 	case HAMMER2_BREF_TYPE_FREEMAP:
236 		chain = NULL;
237 		panic("hammer2_cluster_alloc volume type illegal for op");
238 	default:
239 		chain = NULL;
240 		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
241 		      bref->type);
242 	}
243 
244 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
245 	cluster->refs = 1;
246 
247 	rcluster = &pmp->iroot->cluster;
248 	for (i = 0; i < rcluster->nchains; ++i) {
249 		rchain = rcluster->array[i].chain;
250 		chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref);
251 #if 0
252 		chain->hmp = rchain->hmp;
253 		chain->bref = *bref;
254 		chain->bytes = bytes;
255 		chain->refs = 1;
256 		chain->flags = HAMMER2_CHAIN_ALLOCATED;
257 #endif
258 
259 		/*
260 		 * NOTE: When loading a chain from backing store or creating a
261 		 *	 snapshot, trans will be NULL and the caller is
262 		 *	 responsible for setting these fields.
263 		 */
264 		cluster->array[i].chain = chain;
265 	}
266 	cluster->nchains = i;
267 	cluster->pmp = pmp;
268 	cluster->focus = cluster->array[0].chain;
269 
270 	return (cluster);
271 }
272 
273 /*
274  * Add a reference to a cluster.
275  *
276  * We must also ref the underlying chains in order to allow ref/unlock
277  * sequences to later re-lock.
278  */
279 void
280 hammer2_cluster_ref(hammer2_cluster_t *cluster)
281 {
282 	hammer2_chain_t *chain;
283 	int i;
284 
285 	atomic_add_int(&cluster->refs, 1);
286 	for (i = 0; i < cluster->nchains; ++i) {
287 		chain = cluster->array[i].chain;
288 		if (chain)
289 			hammer2_chain_ref(chain);
290 	}
291 }
292 
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * refs == 1 here means the caller holds the last
			 * ref (checked before the atomic decrement below),
			 * so the cluster is about to be freed; NULL the
			 * slot out so no stale chain pointer survives.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;
		}
	}
	/* free the cluster structure itself on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
322 
/*
 * Briefly sleep (one tick) using the cluster's focus chain as the wait
 * channel.  Used as a cheap backoff when the cluster is busy.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
328 
329 /*
330  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
331  * and then locks them.
332  */
333 int
334 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
335 {
336 	hammer2_chain_t *chain;
337 	hammer2_chain_t *tmp;
338 	int i;
339 	int error;
340 
341 	error = 0;
342 	atomic_add_int(&cluster->refs, 1);
343 	for (i = 0; i < cluster->nchains; ++i) {
344 		chain = cluster->array[i].chain;
345 		if (chain) {
346 			error = hammer2_chain_lock(chain, how);
347 			if (error) {
348 				while (--i >= 0) {
349 					tmp = cluster->array[i].chain;
350 					hammer2_chain_unlock(tmp);
351 				}
352 				atomic_add_int(&cluster->refs, -1);
353 				break;
354 			}
355 		}
356 	}
357 	return error;
358 }
359 
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Transfer src's chains into dst slot-by-slot.  The new chain is
	 * ref'd BEFORE the old occupant of the slot is unlocked, so the
	 * slot never transiently points at an unreferenced chain.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i].chain;
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains &&
			    (tmp = dst->array[i].chain) != NULL) {
				hammer2_chain_unlock(tmp);
			}
			dst->array[i].chain = chain;
			/* focus becomes the first non-NULL chain */
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the trailing slots.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i].chain = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
401 
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Same slot-by-slot transfer as hammer2_cluster_replace(), except
	 * the incoming chain is LOCKED (not just ref'd) before the old
	 * occupant of the slot is unlocked.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i].chain;
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains &&
			    (tmp = dst->array[i].chain) != NULL) {
				hammer2_chain_unlock(tmp);
			}
			dst->array[i].chain = chain;
			/* focus becomes the first non-NULL chain */
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/* unlock and clear any trailing slots if dst was wider than src */
	while (i < dst->nchains) {
		chain = dst->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i].chain = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
443 
444 /*
445  * Copy a cluster, returned a ref'd cluster.  All underlying chains
446  * are also ref'd, but not locked.
447  *
448  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
449  * to the new cluster and a reference is nominally added to them and to
450  * the cluster.  The cluster will have 1 ref.
451  *
452  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
453  * are copied but no additional references are made and the cluster will have
454  * 0 refs.  Callers must ref the cluster and the chains before dropping it
455  * (typically by locking it).
456  *
457  * If flags are passed as 0 the copy is setup as if it contained the chains
458  * but the chains will not be copied over, and the cluster will have 0 refs.
459  * Callers must ref the cluster before dropping it (typically by locking it).
460  */
461 hammer2_cluster_t *
462 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
463 {
464 	hammer2_pfsmount_t *pmp = ocluster->pmp;
465 	hammer2_cluster_t *ncluster;
466 	hammer2_chain_t *chain;
467 	int i;
468 
469 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
470 	ncluster->pmp = pmp;
471 	ncluster->nchains = ocluster->nchains;
472 	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
473 	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
474 		ncluster->focus = ocluster->focus;
475 		for (i = 0; i < ocluster->nchains; ++i) {
476 			chain = ocluster->array[i].chain;
477 			ncluster->array[i].chain = chain;
478 			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
479 			    chain) {
480 				hammer2_chain_ref(chain);
481 			}
482 		}
483 	}
484 	return (ncluster);
485 }
486 
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * refs == 1 (checked before the decrement below)
			 * means this is the last ref and the cluster is
			 * about to be freed; clear the slot so no stale
			 * chain pointer survives.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;	/* safety */
		}
	}
	/* free the cluster structure itself on the 1->0 transition */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
512 
513 /*
514  * Resize the cluster's physical storage allocation in-place.  This may
515  * replace the cluster's chains.
516  */
517 void
518 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
519 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
520 		       int nradix, int flags)
521 {
522 	hammer2_chain_t *chain;
523 	int i;
524 
525 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
526 	KKASSERT(cparent->nchains == cluster->nchains);
527 
528 	cluster->focus = NULL;
529 	for (i = 0; i < cluster->nchains; ++i) {
530 		chain = cluster->array[i].chain;
531 		if (chain) {
532 			KKASSERT(cparent->array[i].chain);
533 			hammer2_chain_resize(trans, ip,
534 					     cparent->array[i].chain, chain,
535 					     nradix, flags);
536 			if (cluster->focus == NULL)
537 				cluster->focus = chain;
538 		}
539 	}
540 }
541 
/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in chain is a localized copy of the chain previously acquired
 * when the inode was locked (and possibly replaced in the mean time), and
 * must also be updated.  In fact, we update it first and then synchronize
 * the inode's cluster cache.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	/* flag the inode modified before touching the media chains */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/* repoint the inode's cached cluster at the (possibly new) chains */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);	/* ensure the syncer sees the vnode */
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}
563 
564 /*
565  * Adjust the cluster's chains to allow modification and adjust the
566  * focus.  Data will be accessible on return.
567  */
568 void
569 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
570 		       int flags)
571 {
572 	hammer2_chain_t *chain;
573 	int i;
574 
575 	cluster->focus = NULL;
576 	for (i = 0; i < cluster->nchains; ++i) {
577 		chain = cluster->array[i].chain;
578 		if (chain) {
579 			hammer2_chain_modify(trans, chain, flags);
580 			if (cluster->focus == NULL)
581 				cluster->focus = chain;
582 		}
583 	}
584 }
585 
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		/* every target must be writable and homologous to the focus */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Indirect inode: copy only the meta-data
				 * portion, the u-area holds a blockset
				 * which must NOT be blindly copied.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through - DIRECTDATA inode copies like DATA */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
643 
644 /*
645  * Lookup initialization/completion API
646  */
647 hammer2_cluster_t *
648 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
649 {
650 	hammer2_cluster_t *cluster;
651 	int i;
652 
653 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
654 	cluster->pmp = cparent->pmp;			/* can be NULL */
655 	/* cluster->focus = NULL; already null */
656 
657 	for (i = 0; i < cparent->nchains; ++i) {
658 		cluster->array[i].chain = cparent->array[i].chain;
659 		if (cluster->focus == NULL)
660 			cluster->focus = cluster->array[i].chain;
661 	}
662 	cluster->nchains = cparent->nchains;
663 
664 	/*
665 	 * Independently lock (this will also give cluster 1 ref)
666 	 */
667 	if (flags & HAMMER2_LOOKUP_SHARED) {
668 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
669 					      HAMMER2_RESOLVE_SHARED);
670 	} else {
671 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
672 	}
673 	return (cluster);
674 }
675 
/*
 * Terminate a lookup sequence, unlocking and dropping the parent cluster
 * obtained from hammer2_cluster_lookup_init() (or from iteration).  A
 * NULL cparent is a no-op.
 */
void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
	if (cparent)
		hammer2_cluster_unlock(cparent);
}
682 
/*
 * Locate first match or overlap under parent, return a new cluster.
 *
 * Runs a per-chain hammer2_chain_lookup() across all elements of cparent
 * and assembles the per-chain results into a new cluster with one ref.
 * Returns NULL (after dropping the new cluster) if no chain produced a
 * result.  *key_nextp is set to the minimum next-key over all chains so
 * iteration advances conservatively.  *ddflagp reflects the (asserted
 * identical) direct-data flag of the results.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i].chain == NULL) {
			++null_count;
			continue;
		}
		/*
		 * Per-chain lookup.  May replace cparent->array[i].chain
		 * (indirect block traversal), hence the pointer argument.
		 */
		chain = hammer2_chain_lookup(&cparent->array[i].chain,
					     &key_next,
					     key_beg, key_end,
					     &cparent->array[i].cache_index,
					     flags, &ddflag);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit establishes the expected bref identity;
			 * later hits must match it exactly (the cluster
			 * elements represent the same logical entity).
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* keep the minimum next-key over all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* no chain produced a result: return NULL instead of an empty cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
763 
/*
 * Locate next match or overlap under parent, replace cluster.
 *
 * Advances each element of (cluster) via hammer2_chain_next() against the
 * corresponding element of (cparent), in-place.  Returns the updated
 * cluster, or NULL (after dropping it) once every element is exhausted.
 * *key_nextp is reduced to the minimum next-key over all chains.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			/* element already exhausted in a prior iteration */
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/*
			 * Parent slot is gone; dispose of the child per the
			 * lock mode it was obtained with (NOLOCK = ref only).
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		/* may replace cparent->array[i].chain (indirect traversal) */
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* keep the minimum next-key over all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* all elements exhausted: drop the cluster and return NULL */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
822 
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 *
 * NOTE: entire function is compiled out (#if 0) pending the rework above.
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i].chain, chain,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL)
			++null_count;
	}

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#endif
877 
/*
 * Create a new cluster using the specified key.
 *
 * If *clusterp is NULL a fresh cluster is allocated and a chain is
 * created under every element of cparent.  If *clusterp is supplied
 * (re-creation), NULL slots in the existing cluster are skipped.
 * Returns 0 (chain creation failures assert).
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		     hammer2_cluster_t **clusterp,
		     hammer2_key_t key, int keybits,
		     int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfsmount_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i].chain == NULL) {
			/* pre-existing cluster with a hole: leave it alone */
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i].chain,
					     &cluster->array[i].chain, pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		/* both focuses become the first non-NULL chain seen */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i].chain;
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}
929 
930 /*
931  * Rename a cluster to a new parent.
932  *
933  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
934  *	    are used from a passed-in non-NULL bref pointer.  All other fields
935  *	    are extracted from the original chain for each chain in the
936  *	    iteration.
937  */
938 void
939 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
940 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
941 		       int flags)
942 {
943 	hammer2_chain_t *chain;
944 	hammer2_blockref_t xbref;
945 	int i;
946 
947 	cluster->focus = NULL;
948 	cparent->focus = NULL;
949 
950 	for (i = 0; i < cluster->nchains; ++i) {
951 		chain = cluster->array[i].chain;
952 		if (chain) {
953 			if (bref) {
954 				xbref = chain->bref;
955 				xbref.key = bref->key;
956 				xbref.keybits = bref->keybits;
957 				hammer2_chain_rename(trans, &xbref,
958 						     &cparent->array[i].chain,
959 						     chain, flags);
960 			} else {
961 				hammer2_chain_rename(trans, NULL,
962 						     &cparent->array[i].chain,
963 						     chain, flags);
964 			}
965 			cluster->array[i].chain = chain;
966 			if (cluster->focus == NULL)
967 				cluster->focus = chain;
968 			if (cparent->focus == NULL)
969 				cparent->focus = cparent->array[i].chain;
970 		} else {
971 			if (cparent->focus == NULL)
972 				cparent->focus = cparent->array[i].chain;
973 		}
974 	}
975 }
976 
977 /*
978  * Mark a cluster deleted
979  */
980 void
981 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
982 		       hammer2_cluster_t *cluster, int flags)
983 {
984 	hammer2_chain_t *chain;
985 	hammer2_chain_t *parent;
986 	int i;
987 
988 	if (cparent == NULL) {
989 		kprintf("cparent is NULL\n");
990 		return;
991 	}
992 
993 	for (i = 0; i < cluster->nchains; ++i) {
994 		parent = (i < cparent->nchains) ?
995 			 cparent->array[i].chain : NULL;
996 		chain = cluster->array[i].chain;
997 		if (chain == NULL)
998 			continue;
999 		if (chain->parent != parent) {
1000 			kprintf("hammer2_cluster_delete: parent "
1001 				"mismatch chain=%p parent=%p against=%p\n",
1002 				chain, chain->parent, parent);
1003 		} else {
1004 			hammer2_chain_delete(trans, parent, chain, flags);
1005 		}
1006 	}
1007 }
1008 
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *nchain;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
	opfs_clid = ripdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		/* make the new inode writable and stamp its PFS identity */
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		/* mark every underlying chain as a PFS root blockref */
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		/* propagate the modified inode data to all cluster elements */
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				hammer2_flush(trans, nchain);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}
1102 
1103 /*
1104  * Return locked parent cluster given a locked child.  The child remains
1105  * locked on return.
1106  */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	/*
	 * Start with an empty shell cluster (no chains) and fill each
	 * slot with the locked parent of the corresponding child chain.
	 */
	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
	for (i = 0; i < cluster->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		/*
		 * Extra ref keeps the child stable while we temporarily
		 * drop its lock below.
		 */
		hammer2_chain_ref(chain);
		while ((rchain = chain->parent) != NULL) {
			/*
			 * Locks must be acquired top-down (parent before
			 * child) to avoid deadlock, so unlock the child,
			 * lock the parent, then relock the child.  The
			 * ref on rchain keeps it from being freed while
			 * the child is unlocked.
			 */
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			/*
			 * The child's parent can change while the child
			 * was unlocked; if it did, drop the stale parent
			 * lock and retry with the current parent.
			 */
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		hammer2_chain_drop(chain);
		/*
		 * rchain is NULL if the child had no parent, otherwise it
		 * is the locked parent (the loop exits via break with the
		 * lock held and the temporary ref already dropped).
		 */
		cparent->array[i].chain = rchain;
	}
	return cparent;
}
1137 
1138 /************************************************************************
1139  *			        CLUSTER I/O 				*
1140  ************************************************************************
1141  *
1142  *
1143  * WARNING! blockref[] array data is not universal.  These functions should
1144  *	    only be used to access universal data.
1145  *
1146  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1147  *	    complete if necessary.  The I/O's should have already been
1148  *	    initiated by the cluster_lock/chain_lock operation.
1149  *
1150  *	    The cluster must already be in a modified state before wdata
1151  *	    is called.  The data will already be available for this case.
1152  */
1153 const hammer2_media_data_t *
1154 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1155 {
1156 	return(cluster->focus->data);
1157 }
1158 
1159 hammer2_media_data_t *
1160 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1161 {
1162 	KKASSERT(hammer2_cluster_modified(cluster));
1163 	return(cluster->focus->data);
1164 }
1165 
1166 /*
1167  * Load async into independent buffer - used to load logical buffers from
1168  * underlying device data.  The callback is made for the first validated
1169  * data found, or NULL if no valid data is available.
1170  *
1171  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1172  *	 in the inode with an exclusive lock held), the chain structure may be
1173  *	 shared.
1174  */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whos data is already resolved.  If none can
	 * be found, start with the first chain.
	 *
	 * NOTE(review): if every array slot is NULL, chain ends up NULL
	 * here and the chain->data test below would dereference it —
	 * presumably callers guarantee at least one populated chain;
	 * verify against callers.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		chain = cluster->array[0].chain;
		i = 0;
	}

	/*
	 * Set up the embedded iocb.  This is safe because the cluster is
	 * unique or serialized per the function's contract (see header
	 * comment).  lbase carries the cluster array index of the chosen
	 * chain for the callback's benefit, not an actual logical offset.
	 */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;		/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated: invoke the callback synchronously with
	 * no dio attached.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read.  hammer2_io_getblk() takes over the
	 * iocb and will invoke the callback when the I/O completes.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}
1256 
1257 /************************************************************************
1258  *			    NODE FAILURES 				*
1259  ************************************************************************
1260  *
1261  * A node failure can occur for numerous reasons.
1262  *
1263  *	- A read I/O may fail
1264  *	- A write I/O may fail
1265  *	- An unexpected chain might be found (or be missing)
1266  *	- A node might disconnect temporarily and reconnect later
1267  *	  (for example, a USB stick could get pulled, or a node might
1268  *	  be programmatically disconnected).
1269  *	- A node might run out of space during a modifying operation.
1270  *
1271  * When a read failure or an unexpected chain state is found, the chain and
1272  * parent chain at the failure point for the nodes involved (the nodes
1273  * which we determine to be in error) are flagged as failed and removed
1274  * from the cluster.  The node itself is allowed to remain active.  The
1275  * highest common point (usually a parent chain) is queued to the
1276  * resynchronization thread for action.
1277  *
1278  * When a write I/O fails or a node runs out of space, we first adjust
1279  * as if a read failure occurs but we further disable flushes on the
1280  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1281  * but any new modifying transactions will automatically remove the node
1282  * from consideration in all related cluster structures and not generate
1283  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1284  * to the resynchronization thread for action.
1285  *
1286  * A temporary disconnect is handled as if a write failure occurred.
1287  *
1288  * Any of these failures might or might not stall related high level VNOPS,
1289  * depending on what has failed, what nodes remain, the type of cluster,
1290  * and the operating state of the cluster.
1291  *
1292  *			    FLUSH ON WRITE-DISABLED NODES
1293  *
1294  * A flush on a write-disabled node is not allowed to write anything because
1295  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1296  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1297  * Dirty meta-data related to the failed node is thrown away.
1298  *
1299  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1300  * retired... that is, if the filesystem still has enough nodes to complete
1301  * the operation.
1302  */
1303 
1304 /************************************************************************
1305  *			SYNCHRONIZATION THREAD				*
1306  ************************************************************************
1307  *
1308  * This thread is responsible for [re]synchronizing the cluster representing
1309  * a PFS.  Any out-of-sync or failed node starts this thread on a
1310  * node-by-node basis when the failure is detected.
1311  *
1312  * Clusters needing resynchronization are queued at the highest point
1313  * where the parent on the failed node is still valid, or a special
1314  * incremental scan from the ROOT is queued if no parent exists.  This
1315  * thread is also responsible for waiting for reconnections of the failed
1316  * node if the cause was due to a disconnect, and waiting for space to be
1317  * freed up if the cause was due to running out of space.
1318  *
1319  * If the cause is due to a node running out of space, this thread will also
1320  * remove older (unlocked) snapshots to make new space, recover space, and
1321  * then start resynchronization.
1322  *
1323  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1324  * and synchronizes using that snapshot against the target node.  This
1325  * ensures a consistent chain topology and also avoids interference between
1326  * the resynchronization thread and frontend operations.
1327  *
1328  * Since these are per-node threads it is possible to resynchronize several
1329  * nodes at once.
1330  */
1331