xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision 874e15d007943a40fad40d5e25620e2bf00235a1)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
 *	- Lookup and iteration ops must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns TRUE if any chain in the cluster needs to be resized.
120  */
121 int
122 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
123 {
124 	hammer2_chain_t *chain;
125 	int i;
126 
127 	for (i = 0; i < cluster->nchains; ++i) {
128 		chain = cluster->array[i].chain;
129 		if (chain && chain->bytes != bytes)
130 			return 1;
131 	}
132 	return 0;
133 }
134 
135 uint8_t
136 hammer2_cluster_type(hammer2_cluster_t *cluster)
137 {
138 	return(cluster->focus->bref.type);
139 }
140 
141 int
142 hammer2_cluster_modified(hammer2_cluster_t *cluster)
143 {
144 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
145 }
146 
147 /*
148  * Return a bref representative of the cluster.  Any data offset is removed
149  * (since it would only be applicable to a particular chain in the cluster).
150  *
151  * However, the radix portion of data_off is used for many purposes and will
152  * be retained.
153  */
154 void
155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
156 {
157 	*bref = cluster->focus->bref;
158 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
159 }
160 
161 /*
162  * Return non-zero if the chain representing an inode has been flagged
163  * as having been unlinked.  Allows the vnode reclaim to avoid loading
164  * the inode data from disk e.g. when unmount or recycling old, clean
165  * vnodes.
166  */
167 int
168 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
169 {
170 	hammer2_chain_t *chain;
171 	int flags;
172 	int i;
173 
174 	flags = 0;
175 	for (i = 0; i < cluster->nchains; ++i) {
176 		chain = cluster->array[i].chain;
177 		if (chain)
178 			flags |= chain->flags;
179 	}
180 	return (flags & HAMMER2_CHAIN_UNLINKED);
181 }
182 
183 void
184 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
185 {
186 	hammer2_chain_t *chain;
187 	int i;
188 
189 	for (i = 0; i < cluster->nchains; ++i) {
190 		chain = cluster->array[i].chain;
191 		if (chain)
192 			atomic_set_int(&chain->flags, flags);
193 	}
194 }
195 
196 void
197 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
198 {
199 	hammer2_chain_t *chain;
200 	int i;
201 
202 	for (i = 0; i < cluster->nchains; ++i) {
203 		chain = cluster->array[i].chain;
204 		if (chain)
205 			atomic_clear_int(&chain->flags, flags);
206 	}
207 }
208 
209 void
210 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
211 {
212 	hammer2_chain_t *chain;
213 	int i;
214 
215 	for (i = 0; i < cluster->nchains; ++i) {
216 		chain = cluster->array[i].chain;
217 		if (chain)
218 			hammer2_chain_setflush(trans, chain);
219 	}
220 }
221 
222 void
223 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
224 				hammer2_cluster_t *cluster,
225 				int check_algo)
226 {
227 	hammer2_chain_t *chain;
228 	int i;
229 
230 	for (i = 0; i < cluster->nchains; ++i) {
231 		chain = cluster->array[i].chain;
232 		if (chain) {
233 			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
234 			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
235 			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
236 		}
237 	}
238 }
239 
240 /*
241  * Create a cluster with one ref from the specified chain.  The chain
242  * is not further referenced.  The caller typically supplies a locked
243  * chain and transfers ownership to the cluster.
244  *
245  * The returned cluster will be focused on the chain (strictly speaking,
246  * the focus should be NULL if the chain is not locked but we do not check
247  * for this condition).
248  *
249  * We fake the flags.
250  */
251 hammer2_cluster_t *
252 hammer2_cluster_from_chain(hammer2_chain_t *chain)
253 {
254 	hammer2_cluster_t *cluster;
255 
256 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
257 	cluster->array[0].chain = chain;
258 	cluster->nchains = 1;
259 	cluster->focus = chain;
260 	cluster->pmp = chain->pmp;
261 	cluster->refs = 1;
262 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
263 			 HAMMER2_CLUSTER_WRHARD |
264 			 HAMMER2_CLUSTER_RDHARD |
265 			 HAMMER2_CLUSTER_MSYNCED |
266 			 HAMMER2_CLUSTER_SSYNCED;
267 
268 	return cluster;
269 }
270 
271 /*
272  * Add a reference to a cluster.
273  *
274  * We must also ref the underlying chains in order to allow ref/unlock
275  * sequences to later re-lock.
276  */
277 void
278 hammer2_cluster_ref(hammer2_cluster_t *cluster)
279 {
280 	hammer2_chain_t *chain;
281 	int i;
282 
283 	atomic_add_int(&cluster->refs, 1);
284 	for (i = 0; i < cluster->nchains; ++i) {
285 		chain = cluster->array[i].chain;
286 		if (chain)
287 			hammer2_chain_ref(chain);
288 	}
289 }
290 
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * If the caller holds the last cluster ref, also
			 * clear the array entry so a stale chain pointer
			 * cannot be reused after the drop above.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;
		}
	}
	/* last ref: free the cluster structure itself */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;		/* safety XXX chg to assert */
		kfree(cluster, M_HAMMER2);
		/* cluster is invalid */
	}
}
320 
/*
 * Briefly sleep on the cluster's focus chain.  The one-tick timeout means
 * this returns even if no explicit wakeup occurs, so callers effectively
 * use it as a polling/backoff delay.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
326 
327 /*
328  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
329  * and then locks them, modified by various RESOLVE flags.
330  *
331  * The act of locking a cluster sets its focus.
332  *
333  * The chains making up the cluster may be narrowed down based on quorum
334  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
335  * narrowed down to a single chain as long as the entire subtopology is known
336  * to be intact.  So, for example, we can narrow a read-only op to a single
337  * fast SLAVE but if we focus a CACHE chain we must still retain at least
338  * a SLAVE to ensure that the subtopology can be accessed.
339  *
340  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
341  * to be maintained once the topology is validated as-of the top level of
342  * the operation.
343  *
344  * If a failure occurs the operation must be aborted by higher-level code and
345  * retried. XXX
346  */
347 void
348 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
349 {
350 	hammer2_chain_t *chain;
351 	int i;
352 
353 	/* cannot be on inode-embedded cluster template, must be on copy */
354 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
355 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
356 		kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
357 			cluster);
358 	} else {
359 		KKASSERT(cluster->focus == NULL);
360 	}
361 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
362 
363 	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
364 		atomic_add_int(&cluster->refs, 1);
365 
366 	/*
367 	 * Lock chains and resolve state.
368 	 */
369 	for (i = 0; i < cluster->nchains; ++i) {
370 		chain = cluster->array[i].chain;
371 		if (chain == NULL)
372 			continue;
373 		hammer2_chain_lock(chain, how);
374 	}
375 
376 	hammer2_cluster_resolve(cluster);
377 }
378 
/*
 * Resolve the cluster's composite state from the current state of its
 * chains: clears then recomputes cluster->error, selects the focus chain,
 * and recalculates the WR/RD HARD/SOFT and MSYNCED/SSYNCED flag bits.
 *
 * Focus preference as implemented below: a directly-mounted soft master
 * always wins, then a soft slave, then a quorum-validated master, then a
 * quorum-validated slave (the super-root degenerate case sets the focus
 * directly in pass 1).
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int i;

	cluster->error = 0;

	quorum_tid = 0;
	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum: a majority of the configured masters.
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;

	/*
	 * Pass 1: count masters and slaves.  quorum_tid tracks the highest
	 * master mirror_tid seen and nmasters counts how many masters are
	 * at that tid.  Soft nodes and the super-root case only set flags.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			if (quorum_tid < chain->bref.mirror_tid ||
			    nmasters == 0) {
				/* new highest tid, restart the count */
				nmasters = 1;
				quorum_tid = chain->bref.mirror_tid;
			} else if (quorum_tid == chain->bref.mirror_tid) {
				++nmasters;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus = chain;
			cluster->error = chain->error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2: pick the focus and accumulate hard read/write flags now
	 * that the quorum tid and master count are known.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master mirror_tid must match
			 * the quorum's mirror_tid.
			 *
			 * Do not select an errored master.
			 */
			if (nmasters >= nquorum &&
			    chain->error == 0 &&
			    quorum_tid == chain->bref.mirror_tid) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave mirror_tid must match the
			 * quorum's mirror_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (nmasters >= nquorum &&
			    chain->error == 0 &&
			    quorum_tid == chain->bref.mirror_tid) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus = chain;
			cluster->error = chain->error;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus = chain;
				cluster->error = chain->error;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 *
	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
	 * to determine if reading or writing is possible.  If writing, the
	 * cluster still requires a call to hammer2_cluster_modify() first.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}
555 
556 /*
557  * Copy a cluster, returned a ref'd cluster.  All underlying chains
558  * are also ref'd, but not locked.
559  *
560  * The cluster focus is not set because the cluster is not yet locked
561  * (and the originating cluster does not have to be locked either).
562  */
563 hammer2_cluster_t *
564 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
565 {
566 	hammer2_pfs_t *pmp = ocluster->pmp;
567 	hammer2_cluster_t *ncluster;
568 	hammer2_chain_t *chain;
569 	int i;
570 
571 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
572 	ncluster->pmp = pmp;
573 	ncluster->nchains = ocluster->nchains;
574 	ncluster->refs = 1;
575 	ncluster->flags = 0;	/* cluster not locked */
576 
577 	for (i = 0; i < ocluster->nchains; ++i) {
578 		chain = ocluster->array[i].chain;
579 		ncluster->array[i].chain = chain;
580 		if (chain)
581 			hammer2_chain_ref(chain);
582 	}
583 	return (ncluster);
584 }
585 
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
			cluster);
	}
	/* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
	KKASSERT(cluster->refs > 0);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * If the caller holds the last cluster ref, clear
			 * the array entry so a stale chain pointer cannot
			 * be reused after the unlock.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;	/* safety */
		}
	}
	cluster->focus = NULL;

	/* last ref: free the cluster structure itself */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
619 
620 /*
621  * Resize the cluster's physical storage allocation in-place.  This may
622  * replace the cluster's chains.
623  */
624 void
625 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
626 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
627 		       int nradix, int flags)
628 {
629 	hammer2_chain_t *chain;
630 	int i;
631 
632 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
633 	KKASSERT(cparent->nchains == cluster->nchains);
634 
635 	for (i = 0; i < cluster->nchains; ++i) {
636 		chain = cluster->array[i].chain;
637 		if (chain) {
638 			KKASSERT(cparent->array[i].chain);
639 			hammer2_chain_resize(trans, ip,
640 					     cparent->array[i].chain, chain,
641 					     nradix, flags);
642 		}
643 	}
644 }
645 
646 /*
647  * Set an inode's cluster modified, marking the related chains RW and
648  * duplicating them if necessary.
649  *
650  * The passed-in chain is a localized copy of the chain previously acquired
651  * when the inode was locked (and possilby replaced in the mean time), and
652  * must also be updated.  In fact, we update it first and then synchronize
653  * the inode's cluster cache.
654  */
655 hammer2_inode_data_t *
656 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
657 			  hammer2_cluster_t *cluster, int flags)
658 {
659 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
660 	hammer2_cluster_modify(trans, cluster, flags);
661 
662 	hammer2_inode_repoint(ip, NULL, cluster);
663 	if (ip->vp)
664 		vsetisdirty(ip->vp);
665 	return (&hammer2_cluster_wdata(cluster)->ipdata);
666 }
667 
668 /*
669  * Adjust the cluster's chains to allow modification and adjust the
670  * focus.  Data will be accessible on return.
671  *
672  * If our focused master errors on modify, re-resolve the cluster to
673  * try to select a different master.
674  */
675 void
676 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
677 		       int flags)
678 {
679 	hammer2_chain_t *chain;
680 	int resolve_again;
681 	int i;
682 
683 	resolve_again = 0;
684 	for (i = 0; i < cluster->nchains; ++i) {
685 		chain = cluster->array[i].chain;
686 		if (chain) {
687 			hammer2_chain_modify(trans, chain, flags);
688 			if (cluster->focus == chain &&
689 			    chain->error) {
690 				cluster->error = chain->error;
691 				resolve_again = 1;
692 			}
693 		}
694 	}
695 	if (resolve_again)
696 		hammer2_cluster_resolve(cluster);
697 }
698 
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		/* targets must already be modified and shape-compatible */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Non-DIRECTDATA inode: copy only the
				 * meta-data portion, up to (but excluding)
				 * the embedded data union.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through to full copy */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
756 
757 /*
758  * Lookup initialization/completion API
759  */
760 hammer2_cluster_t *
761 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
762 {
763 	hammer2_cluster_t *cluster;
764 	int i;
765 
766 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
767 	cluster->pmp = cparent->pmp;			/* can be NULL */
768 	cluster->flags = 0;	/* cluster not locked (yet) */
769 	/* cluster->focus = NULL; already null */
770 
771 	for (i = 0; i < cparent->nchains; ++i)
772 		cluster->array[i].chain = cparent->array[i].chain;
773 	cluster->nchains = cparent->nchains;
774 
775 	/*
776 	 * Independently lock (this will also give cluster 1 ref)
777 	 */
778 	if (flags & HAMMER2_LOOKUP_SHARED) {
779 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
780 					      HAMMER2_RESOLVE_SHARED);
781 	} else {
782 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
783 	}
784 	return (cluster);
785 }
786 
787 void
788 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
789 {
790 	if (cparent)
791 		hammer2_cluster_unlock(cparent);
792 }
793 
/*
 * Locate first match or overlap under parent, return a new cluster.
 *
 * Performs a per-node hammer2_chain_lookup() across cparent's chains,
 * collects the results into a freshly allocated cluster, sets the focus,
 * and resolves the cluster.  Returns NULL if no node produced a result.
 * *key_nextp is updated to the minimum next-key across all nodes.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_pfs_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int null_count;
	int bref_keybits;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
		cluster->flags |= HAMMER2_CLUSTER_LOCKED;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i].chain == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i].chain,
					     &key_next,
					     key_beg, key_end,
					     &cparent->array[i].cache_index,
					     flags);
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			int ddflag = (chain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);

			/*
			 * Set default focus.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				cluster->ddflag = ddflag;
				cluster->focus = chain;
			}

			/*
			 * Override default focus to follow the parent.
			 */
			if (cparent->focus == cparent->array[i].chain)
				cluster->focus = chain;

			/*
			 * All nodes must have returned the same media
			 * element (type/key/keybits/size).
			 */
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(cluster->ddflag == ddflag);
		}
		/* the iteration point is the minimum across all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	/* every node came up empty: no result, release the cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
883 
/*
 * Locate next match or overlap under parent, replace cluster.
 *
 * Iterates each node's chain forward via hammer2_chain_next(), reusing
 * the passed-in cluster for the results.  Returns NULL (after dropping
 * the cluster) when every node's iteration is exhausted.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int null_count;
	int bref_keybits;
	int i;
	uint8_t bref_type;
	u_int bytes;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;
	cluster->ddflag = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/*
			 * Parent node went away; release our hold on the
			 * element and treat this node as empty.
			 *
			 * NOTE(review): cluster->array[i].chain is left
			 * pointing at the released chain here rather than
			 * being cleared — confirm intentional.
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			int ddflag = (chain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);
			/* set default focus from first non-NULL element */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				cluster->ddflag = ddflag;
				cluster->focus = chain;
			}

			/*
			 * Override default focus to follow the parent.
			 */
			if (cparent->focus == cparent->array[i].chain)
				cluster->focus = chain;

			/*
			 * All nodes must have returned the same media
			 * element (type/key/keybits/size).
			 */
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(cluster->ddflag == ddflag);
		}
		/*
		 * NOTE(review): key_accum is computed but, unlike
		 * hammer2_cluster_lookup(), never written back to
		 * *key_nextp — confirm whether this is intentional.
		 */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	/* every node exhausted: iteration is done, release the cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
971 
972 /*
973  * Create a new cluster using the specified key
974  */
975 int
976 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
977 		     hammer2_cluster_t **clusterp,
978 		     hammer2_key_t key, int keybits,
979 		     int type, size_t bytes, int flags)
980 {
981 	hammer2_cluster_t *cluster;
982 	hammer2_pfs_t *pmp;
983 	int error;
984 	int i;
985 
986 	pmp = trans->pmp;				/* can be NULL */
987 
988 	if ((cluster = *clusterp) == NULL) {
989 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
990 				  M_WAITOK | M_ZERO);
991 		cluster->pmp = pmp;			/* can be NULL */
992 		cluster->refs = 1;
993 		cluster->flags = HAMMER2_CLUSTER_LOCKED;
994 	}
995 	cluster->focus = NULL;
996 
997 	/*
998 	 * NOTE: cluster->array[] entries can initially be NULL.  If
999 	 *	 *clusterp is supplied, skip NULL entries, otherwise
1000 	 *	 create new chains.
1001 	 */
1002 	for (i = 0; i < cparent->nchains; ++i) {
1003 		if (*clusterp && cluster->array[i].chain == NULL) {
1004 			continue;
1005 		}
1006 		error = hammer2_chain_create(trans, &cparent->array[i].chain,
1007 					     &cluster->array[i].chain, pmp,
1008 					     key, keybits,
1009 					     type, bytes, flags);
1010 		KKASSERT(error == 0);
1011 		if (cluster->focus == NULL)
1012 			cluster->focus = cluster->array[i].chain;
1013 		if (cparent->focus == cparent->array[i].chain)
1014 			cluster->focus = cluster->array[i].chain;
1015 	}
1016 	cluster->nchains = i;
1017 	*clusterp = cluster;
1018 	hammer2_cluster_resolve(cluster);
1019 
1020 	return error;
1021 }
1022 
1023 /*
1024  * Rename a cluster to a new parent.
1025  *
1026  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
1027  *	    are used from a passed-in non-NULL bref pointer.  All other fields
1028  *	    are extracted from the original chain for each chain in the
1029  *	    iteration.
1030  */
1031 void
1032 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1033 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1034 		       int flags)
1035 {
1036 	hammer2_chain_t *chain;
1037 	hammer2_blockref_t xbref;
1038 	int i;
1039 
1040 	cluster->focus = NULL;
1041 	cparent->focus = NULL;
1042 
1043 	for (i = 0; i < cluster->nchains; ++i) {
1044 		chain = cluster->array[i].chain;
1045 		if (chain) {
1046 			if (bref) {
1047 				xbref = chain->bref;
1048 				xbref.key = bref->key;
1049 				xbref.keybits = bref->keybits;
1050 				hammer2_chain_rename(trans, &xbref,
1051 						     &cparent->array[i].chain,
1052 						     chain, flags);
1053 			} else {
1054 				hammer2_chain_rename(trans, NULL,
1055 						     &cparent->array[i].chain,
1056 						     chain, flags);
1057 			}
1058 			KKASSERT(cluster->array[i].chain == chain); /*remove*/
1059 		}
1060 	}
1061 }
1062 
1063 /*
1064  * Mark a cluster deleted
1065  */
1066 void
1067 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1068 		       hammer2_cluster_t *cluster, int flags)
1069 {
1070 	hammer2_chain_t *chain;
1071 	hammer2_chain_t *parent;
1072 	int i;
1073 
1074 	if (cparent == NULL) {
1075 		kprintf("cparent is NULL\n");
1076 		return;
1077 	}
1078 
1079 	for (i = 0; i < cluster->nchains; ++i) {
1080 		parent = (i < cparent->nchains) ?
1081 			 cparent->array[i].chain : NULL;
1082 		chain = cluster->array[i].chain;
1083 		if (chain == NULL)
1084 			continue;
1085 		if (chain->parent != parent) {
1086 			kprintf("hammer2_cluster_delete: parent "
1087 				"mismatch chain=%p parent=%p against=%p\n",
1088 				chain, chain->parent, parent);
1089 		} else {
1090 			hammer2_chain_delete(trans, parent, chain, flags);
1091 		}
1092 	}
1093 }
1094 
1095 /*
1096  * Create a snapshot of the specified {parent, ochain} with the specified
1097  * label.  The originating hammer2_inode must be exclusively locked for
1098  * safety.
1099  *
1100  * The ioctl code has already synced the filesystem.
1101  */
1102 int
1103 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1104 		       hammer2_ioc_pfs_t *pfs)
1105 {
1106 	hammer2_dev_t *hmp;
1107 	hammer2_cluster_t *ncluster;
1108 	const hammer2_inode_data_t *ripdata;
1109 	hammer2_inode_data_t *wipdata;
1110 	hammer2_chain_t *nchain;
1111 	hammer2_inode_t *nip;
1112 	size_t name_len;
1113 	hammer2_key_t lhc;
1114 	struct vattr vat;
1115 #if 0
1116 	uuid_t opfs_clid;
1117 #endif
1118 	int error;
1119 	int i;
1120 
1121 	kprintf("snapshot %s\n", pfs->name);
1122 
1123 	name_len = strlen(pfs->name);
1124 	lhc = hammer2_dirhash(pfs->name, name_len);
1125 
1126 	/*
1127 	 * Get the clid
1128 	 */
1129 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1130 #if 0
1131 	opfs_clid = ripdata->pfs_clid;
1132 #endif
1133 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1134 
1135 	/*
1136 	 * Create the snapshot directory under the super-root
1137 	 *
1138 	 * Set PFS type, generate a unique filesystem id, and generate
1139 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1140 	 * which theoretically allows the snapshot to be used as part of
1141 	 * the same cluster (perhaps as a cache).
1142 	 *
1143 	 * Copy the (flushed) blockref array.  Theoretically we could use
1144 	 * chain_duplicate() but it becomes difficult to disentangle
1145 	 * the shared core so for now just brute-force it.
1146 	 */
1147 	VATTR_NULL(&vat);
1148 	vat.va_type = VDIR;
1149 	vat.va_mode = 0755;
1150 	ncluster = NULL;
1151 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1152 				   proc0.p_ucred, pfs->name, name_len,
1153 				   &ncluster,
1154 				   HAMMER2_INSERT_PFSROOT, &error);
1155 
1156 	if (nip) {
1157 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1158 		wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1159 		wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1160 		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1161 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1162 
1163 		/*
1164 		 * Give the snapshot its own private cluster.  As a snapshot
1165 		 * no further synchronization with the original cluster will
1166 		 * be done.
1167 		 */
1168 #if 0
1169 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1170 			wipdata->pfs_clid = opfs_clid;
1171 		else
1172 			kern_uuidgen(&wipdata->pfs_clid, 1);
1173 #endif
1174 		kern_uuidgen(&wipdata->pfs_clid, 1);
1175 
1176 		for (i = 0; i < ncluster->nchains; ++i) {
1177 			nchain = ncluster->array[i].chain;
1178 			if (nchain)
1179 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1180 		}
1181 #if 0
1182 		/* XXX can't set this unless we do an explicit flush, which
1183 		   we also need a pmp assigned to do, else the flush code
1184 		   won't flush ncluster because it thinks it is crossing a
1185 		   flush boundary */
1186 		hammer2_cluster_set_chainflags(ncluster,
1187 					       HAMMER2_CHAIN_PFSBOUNDARY);
1188 #endif
1189 
1190 		/* XXX hack blockset copy */
1191 		/* XXX doesn't work with real cluster */
1192 		KKASSERT(ocluster->nchains == 1);
1193 		wipdata->u.blockset = ripdata->u.blockset;
1194 		hammer2_cluster_modsync(ncluster);
1195 		for (i = 0; i < ncluster->nchains; ++i) {
1196 			nchain = ncluster->array[i].chain;
1197 			if (nchain)
1198 				hammer2_flush(trans, nchain);
1199 		}
1200 		hammer2_inode_unlock_ex(nip, ncluster);
1201 	}
1202 	return (error);
1203 }
1204 
1205 /*
1206  * Return locked parent cluster given a locked child.  The child remains
1207  * locked on return.  The new parent's focus follows the child's focus
1208  * and the parent is always resolved.
1209  */
1210 hammer2_cluster_t *
1211 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1212 {
1213 	hammer2_cluster_t *cparent;
1214 	int i;
1215 
1216 	cparent = hammer2_cluster_copy(cluster);
1217 
1218 	for (i = 0; i < cparent->nchains; ++i) {
1219 		hammer2_chain_t *chain;
1220 		hammer2_chain_t *rchain;
1221 
1222 		/*
1223 		 * Calculate parent for each element.  Old chain has an extra
1224 		 * ref for cparent but the lock remains with cluster.
1225 		 */
1226 		chain = cparent->array[i].chain;
1227 		if (chain == NULL)
1228 			continue;
1229 		while ((rchain = chain->parent) != NULL) {
1230 			hammer2_chain_ref(rchain);
1231 			hammer2_chain_unlock(chain);
1232 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1233 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1234 			hammer2_chain_drop(rchain);
1235 			if (chain->parent == rchain)
1236 				break;
1237 			hammer2_chain_unlock(rchain);
1238 		}
1239 		if (cluster->focus == chain)
1240 			cparent->focus = rchain;
1241 		cparent->array[i].chain = rchain;
1242 		hammer2_chain_drop(chain);
1243 	}
1244 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1245 	hammer2_cluster_resolve(cparent);
1246 
1247 	return cparent;
1248 }
1249 
1250 /************************************************************************
1251  *			        CLUSTER I/O 				*
1252  ************************************************************************
1253  *
1254  *
1255  * WARNING! blockref[] array data is not universal.  These functions should
1256  *	    only be used to access universal data.
1257  *
1258  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1259  *	    complete if necessary.  The I/O's should have already been
1260  *	    initiated by the cluster_lock/chain_lock operation.
1261  *
1262  *	    The cluster must already be in a modified state before wdata
1263  *	    is called.  The data will already be available for this case.
1264  */
1265 const hammer2_media_data_t *
1266 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1267 {
1268 	return(cluster->focus->data);
1269 }
1270 
1271 hammer2_media_data_t *
1272 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1273 {
1274 	KKASSERT(hammer2_cluster_modified(cluster));
1275 	return(cluster->focus->data);
1276 }
1277 
1278 /*
1279  * Load cluster data asynchronously with callback.
1280  *
1281  * The callback is made for the first validated data found, or NULL
1282  * if no valid data is available.
1283  *
1284  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1285  *	 in the inode with an exclusive lock held), the chain structure may be
1286  *	 shared.
1287  */
1288 void
1289 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1290 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1291 {
1292 	hammer2_chain_t *chain;
1293 	hammer2_iocb_t *iocb;
1294 	hammer2_dev_t *hmp;
1295 	hammer2_blockref_t *bref;
1296 	int i;
1297 
1298 	/*
1299 	 * Try to find a chain whos data is already resolved.  If none can
1300 	 * be found, start with the first chain.
1301 	 */
1302 	chain = NULL;
1303 	for (i = 0; i < cluster->nchains; ++i) {
1304 		chain = cluster->array[i].chain;
1305 		if (chain && chain->data)
1306 			break;
1307 	}
1308 	if (i == cluster->nchains) {
1309 		chain = cluster->array[0].chain;
1310 		i = 0;
1311 	}
1312 
1313 	iocb = &cluster->iocb;
1314 	iocb->callback = callback;
1315 	iocb->dio = NULL;		/* for already-validated case */
1316 	iocb->cluster = cluster;
1317 	iocb->chain = chain;
1318 	iocb->ptr = ptr;
1319 	iocb->lbase = (off_t)i;
1320 	iocb->flags = 0;
1321 	iocb->error = 0;
1322 
1323 	/*
1324 	 * Data already validated
1325 	 */
1326 	if (chain->data) {
1327 		callback(iocb);
1328 		return;
1329 	}
1330 
1331 	/*
1332 	 * We must resolve to a device buffer, either by issuing I/O or
1333 	 * by creating a zero-fill element.  We do not mark the buffer
1334 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1335 	 * API must still be used to do that).
1336 	 *
1337 	 * The device buffer is variable-sized in powers of 2 down
1338 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1339 	 * chunk always contains buffers of the same size. (XXX)
1340 	 *
1341 	 * The minimum physical IO size may be larger than the variable
1342 	 * block size.
1343 	 *
1344 	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1345 	 *	      matches hammer2_devblksize()?  Or does the freemap's
1346 	 *	      pre-zeroing handle the case for us?
1347 	 */
1348 	bref = &chain->bref;
1349 	hmp = chain->hmp;
1350 
1351 #if 0
1352 	/* handled by callback? <- TODO XXX even needed for loads? */
1353 	/*
1354 	 * The getblk() optimization for a 100% overwrite can only be used
1355 	 * if the physical block size matches the request.
1356 	 */
1357 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1358 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1359 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1360 		KKASSERT(error == 0);
1361 		iocb->dio = dio;
1362 		callback(iocb);
1363 		return;
1364 	}
1365 #endif
1366 
1367 	/*
1368 	 * Otherwise issue a read
1369 	 */
1370 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1371 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1372 }
1373 
1374 /************************************************************************
1375  *			    NODE FAILURES 				*
1376  ************************************************************************
1377  *
1378  * A node failure can occur for numerous reasons.
1379  *
1380  *	- A read I/O may fail
1381  *	- A write I/O may fail
1382  *	- An unexpected chain might be found (or be missing)
1383  *	- A node might disconnect temporarily and reconnect later
1384  *	  (for example, a USB stick could get pulled, or a node might
1385  *	  be programmatically disconnected).
1386  *	- A node might run out of space during a modifying operation.
1387  *
1388  * When a read failure or an unexpected chain state is found, the chain and
1389  * parent chain at the failure point for the nodes involved (the nodes
1390  * which we determine to be in error) are flagged as failed and removed
1391  * from the cluster.  The node itself is allowed to remain active.  The
1392  * highest common point (usually a parent chain) is queued to the
1393  * resynchronization thread for action.
1394  *
1395  * When a write I/O fails or a node runs out of space, we first adjust
1396  * as if a read failure occurs but we further disable flushes on the
1397  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1398  * but any new modifying transactions will automatically remove the node
1399  * from consideration in all related cluster structures and not generate
1400  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1401  * to the resynchronization thread for action.
1402  *
1403  * A temporary disconnect is handled as if a write failure occurred.
1404  *
1405  * Any of these failures might or might not stall related high level VNOPS,
1406  * depending on what has failed, what nodes remain, the type of cluster,
1407  * and the operating state of the cluster.
1408  *
1409  *			    FLUSH ON WRITE-DISABLED NODES
1410  *
1411  * A flush on a write-disabled node is not allowed to write anything because
1412  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1413  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1414  * Dirty meta-data related to the failed node is thrown away.
1415  *
1416  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1417  * retired... that is, if the filesystem still has enough nodes to complete
1418  * the operation.
1419  */
1420 
1421 /************************************************************************
1422  *			SYNCHRONIZATION THREAD				*
1423  ************************************************************************
1424  *
1425  * This thread is responsible for [re]synchronizing the cluster representing
1426  * a PFS.  Any out-of-sync or failed node starts this thread on a
1427  * node-by-node basis when the failure is detected.
1428  *
1429  * Clusters needing resynchronization are queued at the highest point
1430  * where the parent on the failed node is still valid, or a special
1431  * incremental scan from the ROOT is queued if no parent exists.  This
1432  * thread is also responsible for waiting for reconnections of the failed
1433  * node if the cause was due to a disconnect, and waiting for space to be
1434  * freed up if the cause was due to running out of space.
1435  *
1436  * If the cause is due to a node running out of space, this thread will also
1437  * remove older (unlocked) snapshots to make new space, recover space, and
1438  * then start resynchronization.
1439  *
1440  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1441  * and synchronizes using that snapshot against the target node.  This
1442  * ensures a consistent chain topology and also avoids interference between
1443  * the resynchronization thread and frontend operations.
1444  *
1445  * Since these are per-node threads it is possible to resynchronize several
1446  * nodes at once.
1447  */
1448