xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision d63676ccce44debffd22823892f437449f6acdad)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns the bref type of the cluster's foucs.
120  *
121  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
122  * The cluster must be locked.
123  */
124 uint8_t
125 hammer2_cluster_type(hammer2_cluster_t *cluster)
126 {
127 	if (cluster->error == 0) {
128 		KKASSERT(cluster->focus != NULL);
129 		return(cluster->focus->bref.type);
130 	}
131 	return 0;
132 }
133 
134 /*
135  * Returns non-zero if the cluster's focus is flagged as being modified.
136  *
137  * If the cluster is errored, returns 0.
138  */
139 static
140 int
141 hammer2_cluster_modified(hammer2_cluster_t *cluster)
142 {
143 	if (cluster->error == 0) {
144 		KKASSERT(cluster->focus != NULL);
145 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
146 	}
147 	return 0;
148 }
149 
150 /*
151  * Returns the bref of the cluster's focus, sans any data-offset information
152  * (since offset information is per-node and wouldn't be useful).
153  *
154  * Callers use this function to access modify_tid, mirror_tid, type,
155  * key, and keybits.
156  *
157  * If the cluster is errored, returns an empty bref.
158  * The cluster must be locked.
159  */
160 void
161 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
162 {
163 	if (cluster->error == 0) {
164 		KKASSERT(cluster->focus != NULL);
165 		*bref = cluster->focus->bref;
166 		bref->data_off = 0;
167 	} else {
168 		bzero(bref, sizeof(*bref));
169 	}
170 }
171 
172 /*
173  * Create a degenerate cluster with one ref from a single locked chain.
174  * The returned cluster will be focused on the chain and inherit its
175  * error state.
176  *
177  * The chain's lock and reference are transfered to the new cluster, so
178  * the caller should not try to unlock the chain separately.
179  *
180  * We fake the flags.
181  */
182 hammer2_cluster_t *
183 hammer2_cluster_from_chain(hammer2_chain_t *chain)
184 {
185 	hammer2_cluster_t *cluster;
186 
187 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
188 	cluster->array[0].chain = chain;
189 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
190 	cluster->nchains = 1;
191 	cluster->focus = chain;
192 	cluster->focus_index = 0;
193 	cluster->pmp = chain->pmp;
194 	cluster->refs = 1;
195 	cluster->error = chain->error;
196 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
197 			 HAMMER2_CLUSTER_WRHARD |
198 			 HAMMER2_CLUSTER_RDHARD |
199 			 HAMMER2_CLUSTER_MSYNCED |
200 			 HAMMER2_CLUSTER_SSYNCED;
201 
202 	return cluster;
203 }
204 
205 /*
206  * Add a reference to a cluster and its underlying chains.
207  *
208  * We must also ref the underlying chains in order to allow ref/unlock
209  * sequences to later re-lock.
210  */
211 void
212 hammer2_cluster_ref(hammer2_cluster_t *cluster)
213 {
214 	atomic_add_int(&cluster->refs, 1);
215 }
216 
217 /*
218  * Drop the caller's reference to the cluster.  When the ref count drops to
219  * zero this function frees the cluster and drops all underlying chains.
220  *
221  * In-progress read I/Os are typically detached from the cluster once the
222  * first one returns (the remaining stay attached to the DIOs but are then
223  * ignored and drop naturally).
224  */
225 void
226 hammer2_cluster_drop(hammer2_cluster_t *cluster)
227 {
228 	hammer2_chain_t *chain;
229 	int i;
230 
231 	KKASSERT(cluster->refs > 0);
232 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
233 		cluster->focus = NULL;		/* safety XXX chg to assert */
234 		cluster->focus_index = 0;
235 
236 		for (i = 0; i < cluster->nchains; ++i) {
237 			chain = cluster->array[i].chain;
238 			if (chain) {
239 				hammer2_chain_drop(chain);
240 				cluster->array[i].chain = NULL; /* safety */
241 			}
242 		}
243 		cluster->nchains = 0;				/* safety */
244 
245 		kfree(cluster, M_HAMMER2);
246 		/* cluster is invalid */
247 	}
248 }
249 
250 /*
251  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
252  *
253  * WARNING! This function expects the caller to handle resolution of the
254  *	    cluster.  We never re-resolve the cluster in this function,
255  *	    because it might be used to temporarily unlock/relock a cparent
256  *	    in an iteration or recursrion, and the cparents elements do not
257  *	    necessarily match.
258  */
259 void
260 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
261 {
262 	hammer2_chain_t *chain;
263 	int i;
264 
265 	/* cannot be on inode-embedded cluster template, must be on copy */
266 	KKASSERT(cluster->refs > 0);
267 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
268 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
269 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
270 			cluster);
271 	}
272 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
273 
274 	/*
275 	 * Lock chains and resolve state.
276 	 */
277 	for (i = 0; i < cluster->nchains; ++i) {
278 		if (i == idx)
279 			continue;
280 		chain = cluster->array[i].chain;
281 		if (chain == NULL)
282 			continue;
283 		hammer2_chain_lock(chain, how);
284 	}
285 }
286 
287 void
288 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
289 {
290 	hammer2_cluster_lock_except(cluster, -1, how);
291 }
292 
293 /*
294  * Calculate the clustering state for the cluster and set its focus.
295  * This routine must be called with care.  For example, it should not
296  * normally be called after relocking a non-leaf cluster because parent
297  * clusters help iterations and each element might be at a slightly different
298  * indirect node (each node's topology is independently indexed).
299  *
300  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
301  * operations.  Typically this is only set on a quorum of MASTERs or
302  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
303  * is present, this bit is *not* set on a quorum of MASTERs.  The
304  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
305  * that create/modify/delete elements use it.
306  *
307  * The chains making up the cluster may be narrowed down based on quorum
308  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
309  * narrowed down to a single chain as long as the entire subtopology is known
310  * to be intact.  So, for example, we can narrow a read-only op to a single
311  * fast SLAVE but if we focus a CACHE chain we must still retain at least
312  * a SLAVE to ensure that the subtopology can be accessed.
313  *
314  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
315  * to be maintained once the topology is validated as-of the top level of
316  * the operation.
317  *
318  * If a failure occurs the operation must be aborted by higher-level code and
319  * retried. XXX
320  */
321 void
322 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
323 {
324 	hammer2_chain_t *chain;
325 	hammer2_chain_t *focus;
326 	hammer2_pfs_t *pmp;
327 	hammer2_tid_t quorum_tid;
328 	hammer2_tid_t last_best_quorum_tid;
329 	int focus_pfs_type;
330 	uint32_t nflags;
331 	int ttlmasters;
332 	int ttlslaves;
333 	int nmasters;
334 	int nslaves;
335 	int nquorum;
336 	int smpresent;
337 	int i;
338 
339 	cluster->error = 0;
340 	cluster->focus = NULL;
341 
342 	focus_pfs_type = 0;
343 	nflags = 0;
344 	ttlmasters = 0;
345 	ttlslaves = 0;
346 	nmasters = 0;
347 	nslaves = 0;
348 
349 	/*
350 	 * Calculate quorum
351 	 */
352 	pmp = cluster->pmp;
353 	KKASSERT(pmp != NULL || cluster->nchains == 0);
354 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
355 	smpresent = 0;
356 
357 	/*
358 	 * Pass 1
359 	 *
360 	 * NOTE: A NULL chain is not necessarily an error, it could be
361 	 *	 e.g. a lookup failure or the end of an iteration.
362 	 *	 Process normally.
363 	 */
364 	for (i = 0; i < cluster->nchains; ++i) {
365 		chain = cluster->array[i].chain;
366 		if (chain && chain->error) {
367 			if (cluster->focus == NULL || cluster->focus == chain) {
368 				/* error will be overridden by valid focus */
369 				cluster->error = chain->error;
370 			}
371 
372 			/*
373 			 * Must count total masters and slaves whether the
374 			 * chain is errored or not.
375 			 */
376 			switch (cluster->pmp->pfs_types[i]) {
377 			case HAMMER2_PFSTYPE_MASTER:
378 				++ttlmasters;
379 				break;
380 			case HAMMER2_PFSTYPE_SLAVE:
381 				++ttlslaves;
382 				break;
383 			}
384 			continue;
385 		}
386 		switch (cluster->pmp->pfs_types[i]) {
387 		case HAMMER2_PFSTYPE_MASTER:
388 			++ttlmasters;
389 			break;
390 		case HAMMER2_PFSTYPE_SLAVE:
391 			++ttlslaves;
392 			break;
393 		case HAMMER2_PFSTYPE_SOFT_MASTER:
394 			nflags |= HAMMER2_CLUSTER_WRSOFT;
395 			nflags |= HAMMER2_CLUSTER_RDSOFT;
396 			smpresent = 1;
397 			break;
398 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
399 			nflags |= HAMMER2_CLUSTER_RDSOFT;
400 			break;
401 		case HAMMER2_PFSTYPE_SUPROOT:
402 			/*
403 			 * Degenerate cluster representing the super-root
404 			 * topology on a single device.  Fake stuff so
405 			 * cluster ops work as expected.
406 			 */
407 			nflags |= HAMMER2_CLUSTER_WRHARD;
408 			nflags |= HAMMER2_CLUSTER_RDHARD;
409 			cluster->focus_index = i;
410 			cluster->focus = chain;
411 			cluster->error = chain ? chain->error : 0;
412 			break;
413 		default:
414 			break;
415 		}
416 	}
417 
418 	/*
419 	 * Pass 2
420 	 *
421 	 * Resolve masters.  Calculate nmasters for the highest matching
422 	 * TID, if a quorum cannot be attained try the next lower matching
423 	 * TID until we exhaust TIDs.
424 	 *
425 	 * NOTE: A NULL chain is not necessarily an error, it could be
426 	 *	 e.g. a lookup failure or the end of an iteration.
427 	 *	 Process normally.
428 	 */
429 	last_best_quorum_tid = HAMMER2_TID_MAX;
430 	quorum_tid = 0;		/* fix gcc warning */
431 
432 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
433 		nmasters = 0;
434 		quorum_tid = 0;
435 
436 		for (i = 0; i < cluster->nchains; ++i) {
437 			if (cluster->pmp->pfs_types[i] !=
438 			    HAMMER2_PFSTYPE_MASTER) {
439 				continue;
440 			}
441 			chain = cluster->array[i].chain;
442 
443 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
444 				/*
445 				 * Invalid as in unsynchronized, cannot be
446 				 * used to calculate the quorum.
447 				 */
448 			} else if (chain == NULL && quorum_tid == 0) {
449 				/*
450 				 * NULL chain on master matches NULL chains
451 				 * on other masters.
452 				 */
453 				++nmasters;
454 			} else if (quorum_tid < last_best_quorum_tid &&
455 				   chain != NULL &&
456 				   (quorum_tid < chain->bref.modify_tid ||
457 				    nmasters == 0)) {
458 				/*
459 				 * Better TID located, reset nmasters count.
460 				 */
461 				nmasters = 1;
462 				quorum_tid = chain->bref.modify_tid;
463 			} else if (chain &&
464 				   quorum_tid == chain->bref.modify_tid) {
465 				/*
466 				 * TID matches current collection.
467 				 */
468 				++nmasters;
469 			}
470 		}
471 		if (nmasters >= nquorum)
472 			break;
473 		last_best_quorum_tid = quorum_tid;
474 	}
475 
476 	/*
477 	 * Pass 3
478 	 *
479 	 * NOTE: A NULL chain is not necessarily an error, it could be
480 	 *	 e.g. a lookup failure or the end of an iteration.
481 	 *	 Process normally.
482 	 */
483 	for (i = 0; i < cluster->nchains; ++i) {
484 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
485 		chain = cluster->array[i].chain;
486 		if (chain && chain->error) {
487 			if (cluster->focus == NULL || cluster->focus == chain) {
488 				/* error will be overridden by valid focus */
489 				cluster->error = chain->error;
490 			}
491 			continue;
492 		}
493 
494 		switch (cluster->pmp->pfs_types[i]) {
495 		case HAMMER2_PFSTYPE_MASTER:
496 			/*
497 			 * We must have enough up-to-date masters to reach
498 			 * a quorum and the master modify_tid must match
499 			 * the quorum's modify_tid.
500 			 *
501 			 * Do not select an errored or out-of-sync master.
502 			 */
503 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
504 				nflags |= HAMMER2_CLUSTER_UNHARD;
505 			} else if (nmasters >= nquorum &&
506 				   (chain == NULL || chain->error == 0) &&
507 				   ((chain == NULL && quorum_tid == 0) ||
508 				    (chain != NULL && quorum_tid ==
509 						  chain->bref.modify_tid))) {
510 				nflags |= HAMMER2_CLUSTER_WRHARD;
511 				nflags |= HAMMER2_CLUSTER_RDHARD;
512 				if (!smpresent) {
513 					cluster->array[i].flags |=
514 							HAMMER2_CITEM_FEMOD;
515 				}
516 				if (cluster->focus == NULL ||
517 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
518 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
519 					cluster->focus_index = i;
520 					cluster->focus = chain; /* NULL ok */
521 					cluster->error = chain ? chain->error :
522 								 0;
523 				}
524 			} else if (chain == NULL || chain->error == 0) {
525 				nflags |= HAMMER2_CLUSTER_UNHARD;
526 			}
527 			break;
528 		case HAMMER2_PFSTYPE_SLAVE:
529 			/*
530 			 * We must have enough up-to-date masters to reach
531 			 * a quorum and the slave modify_tid must match the
532 			 * quorum's modify_tid.
533 			 *
534 			 * Do not select an errored slave.
535 			 */
536 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
537 				nflags |= HAMMER2_CLUSTER_UNHARD;
538 			} else if (nmasters >= nquorum &&
539 				   (chain == NULL || chain->error == 0) &&
540 				   ((chain == NULL && quorum_tid == 0) ||
541 				    (chain && quorum_tid ==
542 					      chain->bref.modify_tid))) {
543 				++nslaves;
544 				nflags |= HAMMER2_CLUSTER_RDHARD;
545 #if 0
546 				/* XXX optimize for RESOLVE_RDONLY */
547 				if (cluster->focus == NULL) {
548 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
549 					cluster->focus_index = i;
550 					cluster->focus = chain; /* NULL ok */
551 					cluster->error = chain ? chain->error :
552 								 0;
553 				}
554 #endif
555 			} else if (chain == NULL || chain->error == 0) {
556 				nflags |= HAMMER2_CLUSTER_UNSOFT;
557 			}
558 			break;
559 		case HAMMER2_PFSTYPE_SOFT_MASTER:
560 			/*
561 			 * Directly mounted soft master always wins.  There
562 			 * should be only one.
563 			 */
564 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
565 			cluster->focus_index = i;
566 			cluster->focus = chain;
567 			cluster->error = chain ? chain->error : 0;
568 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
569 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
570 			break;
571 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
572 			/*
573 			 * Directly mounted soft slave always wins.  There
574 			 * should be only one.
575 			 */
576 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
577 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
578 				cluster->focus_index = i;
579 				cluster->focus = chain;
580 				cluster->error = chain ? chain->error : 0;
581 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
582 			}
583 			break;
584 		case HAMMER2_PFSTYPE_SUPROOT:
585 			/*
586 			 * spmp (degenerate case)
587 			 */
588 			KKASSERT(i == 0);
589 			cluster->focus_index = i;
590 			cluster->focus = chain;
591 			cluster->error = chain ? chain->error : 0;
592 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
593 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
594 			break;
595 		default:
596 			break;
597 		}
598 	}
599 
600 	/*
601 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
602 	 * is bad or if we are at the PFS root (the bref won't match at
603 	 * the PFS root, obviously).
604 	 */
605 	focus = cluster->focus;
606 	if (focus) {
607 		cluster->ddflag =
608 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
609 	} else {
610 		cluster->ddflag = 0;
611 		goto skip4;
612 	}
613 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
614 		goto skip4;
615 
616 	/*
617 	 * Pass 4
618 	 *
619 	 * Validate the elements that were not marked invalid.  They should
620 	 * match.
621 	 */
622 	for (i = 0; i < cluster->nchains; ++i) {
623 		int ddflag;
624 
625 		chain = cluster->array[i].chain;
626 
627 		if (chain == NULL)
628 			continue;
629 		if (chain == focus)
630 			continue;
631 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
632 			continue;
633 
634 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
635 		if (chain->bref.type != focus->bref.type ||
636 		    chain->bref.key != focus->bref.key ||
637 		    chain->bref.keybits != focus->bref.keybits ||
638 		    chain->bref.modify_tid != focus->bref.modify_tid ||
639 		    chain->bytes != focus->bytes ||
640 		    ddflag != cluster->ddflag) {
641 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
642 			if (hammer2_debug & 1)
643 			kprintf("cluster_resolve: matching modify_tid failed "
644 				"bref test: idx=%d type=%02x/%02x "
645 				"key=%016jx/%d-%016jx/%d "
646 				"mod=%016jx/%016jx bytes=%u/%u\n",
647 				i,
648 				chain->bref.type, focus->bref.type,
649 				chain->bref.key, chain->bref.keybits,
650 				focus->bref.key, focus->bref.keybits,
651 				chain->bref.modify_tid, focus->bref.modify_tid,
652 				chain->bytes, focus->bytes);
653 			if (hammer2_debug & 0x4000)
654 				panic("cluster_resolve");
655 			/* flag issue and force resync? */
656 		}
657 	}
658 skip4:
659 
660 	if (ttlslaves == 0)
661 		nflags |= HAMMER2_CLUSTER_NOSOFT;
662 	if (ttlmasters == 0)
663 		nflags |= HAMMER2_CLUSTER_NOHARD;
664 
665 	/*
666 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
667 	 * all available nodes (even if 0 are available) are fully
668 	 * synchronized.  This is used by the synchronization thread to
669 	 * determine if there is work it could potentially accomplish.
670 	 */
671 	if (nslaves == ttlslaves)
672 		nflags |= HAMMER2_CLUSTER_SSYNCED;
673 	if (nmasters == ttlmasters)
674 		nflags |= HAMMER2_CLUSTER_MSYNCED;
675 
676 	/*
677 	 * Determine if the cluster was successfully locked for the
678 	 * requested operation and generate an error code.  The cluster
679 	 * will not be locked (or ref'd) if an error is returned.
680 	 *
681 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
682 	 * to determine if reading or writing is possible.  If writing, the
683 	 * cluster still requires a call to hammer2_cluster_modify() first.
684 	 */
685 	atomic_set_int(&cluster->flags, nflags);
686 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
687 }
688 
689 /*
690  * This is used by the XOPS subsystem to calculate the state of
691  * the collection and tell hammer2_xop_collect() what to do with it.
692  * The collection can be in various states of desynchronization, the
693  * caller specifically wants to resolve the passed-in key.
694  *
695  * Return values:
696  *	0		- Quorum agreement, key is valid
697  *
698  *	ENOENT		- Quorum agreement, end of scan
699  *
700  *	ESRCH		- Quorum agreement, key is INVALID (caller should
701  *			  skip key).
702  *
703  *	EIO		- Quorum agreement but all elements had errors.
704  *
705  *	EDEADLK		- No quorum agreement possible for key, a repair
706  *			  may be needed.  Caller has to decide what to do,
707  *			  possibly iterating the key or generating an EIO.
708  *
709  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
710  *			  possible if caller waits for more responses.  Caller
711  *			  should not iterate key.
712  *
713  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
714  */
715 int
716 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
717 {
718 	hammer2_chain_t *chain;
719 	hammer2_chain_t *focus;
720 	hammer2_pfs_t *pmp;
721 	hammer2_tid_t quorum_tid;
722 	hammer2_tid_t last_best_quorum_tid;
723 	uint32_t nflags;
724 	int ttlmasters;
725 	int ttlslaves;
726 	int nmasters;
727 	int nmasters_keymatch;
728 	int nslaves;
729 	int nquorum;
730 	int umasters;	/* unknown masters (still in progress) */
731 	int smpresent;
732 	int i;
733 
734 	cluster->error = 0;
735 	cluster->focus = NULL;
736 
737 	nflags = 0;
738 	ttlmasters = 0;
739 	ttlslaves = 0;
740 	nmasters = 0;
741 	nmasters_keymatch = 0;
742 	umasters = 0;
743 	nslaves = 0;
744 
745 	/*
746 	 * Calculate quorum
747 	 */
748 	pmp = cluster->pmp;
749 	KKASSERT(pmp != NULL || cluster->nchains == 0);
750 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
751 	smpresent = 0;
752 
753 	/*
754 	 * Pass 1
755 	 *
756 	 * NOTE: A NULL chain is not necessarily an error, it could be
757 	 *	 e.g. a lookup failure or the end of an iteration.
758 	 *	 Process normally.
759 	 */
760 	for (i = 0; i < cluster->nchains; ++i) {
761 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
762 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
763 
764 		chain = cluster->array[i].chain;
765 		if (chain && chain->error) {
766 			if (cluster->focus == NULL || cluster->focus == chain) {
767 				/* error will be overridden by valid focus */
768 				cluster->error = chain->error;
769 			}
770 
771 			/*
772 			 * Must count total masters and slaves whether the
773 			 * chain is errored or not.
774 			 */
775 			switch (cluster->pmp->pfs_types[i]) {
776 			case HAMMER2_PFSTYPE_MASTER:
777 				++ttlmasters;
778 				break;
779 			case HAMMER2_PFSTYPE_SLAVE:
780 				++ttlslaves;
781 				break;
782 			}
783 			continue;
784 		}
785 		switch (cluster->pmp->pfs_types[i]) {
786 		case HAMMER2_PFSTYPE_MASTER:
787 			++ttlmasters;
788 			break;
789 		case HAMMER2_PFSTYPE_SLAVE:
790 			++ttlslaves;
791 			break;
792 		case HAMMER2_PFSTYPE_SOFT_MASTER:
793 			nflags |= HAMMER2_CLUSTER_WRSOFT;
794 			nflags |= HAMMER2_CLUSTER_RDSOFT;
795 			smpresent = 1;
796 			break;
797 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
798 			nflags |= HAMMER2_CLUSTER_RDSOFT;
799 			break;
800 		case HAMMER2_PFSTYPE_SUPROOT:
801 			/*
802 			 * Degenerate cluster representing the super-root
803 			 * topology on a single device.  Fake stuff so
804 			 * cluster ops work as expected.
805 			 */
806 			nflags |= HAMMER2_CLUSTER_WRHARD;
807 			nflags |= HAMMER2_CLUSTER_RDHARD;
808 			cluster->focus_index = i;
809 			cluster->focus = chain;
810 			cluster->error = chain ? chain->error : 0;
811 			break;
812 		default:
813 			break;
814 		}
815 	}
816 
817 	/*
818 	 * Pass 2
819 	 *
820 	 * Resolve nmasters		- master nodes fully match
821 	 *
822 	 * Resolve umasters		- master nodes operation still
823 	 *				  in progress
824 	 *
825 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
826 	 *				  key and may or may not match
827 	 *				  the quorum-agreed tid.
828 	 *
829 	 * The quorum-agreed TID is the highest matching TID.
830 	 */
831 	last_best_quorum_tid = HAMMER2_TID_MAX;
832 	quorum_tid = 0;		/* fix gcc warning */
833 
834 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
835 		nmasters = 0;
836 		quorum_tid = 0;
837 
838 		for (i = 0; i < cluster->nchains; ++i) {
839 			/* XXX SOFT smpresent handling */
840 			if (cluster->pmp->pfs_types[i] !=
841 			    HAMMER2_PFSTYPE_MASTER) {
842 				continue;
843 			}
844 
845 			chain = cluster->array[i].chain;
846 
847 			/*
848 			 * Skip elements still in progress.  umasters keeps
849 			 * track of masters that might still be in-progress.
850 			 */
851 			if (chain == NULL && (cluster->array[i].flags &
852 					      HAMMER2_CITEM_NULL) == 0) {
853 				++umasters;
854 				continue;
855 			}
856 
857 			/*
858 			 * Key match?
859 			 */
860 			if (flags & HAMMER2_CHECK_NULL) {
861 				if (chain == NULL) {
862 					++nmasters;
863 					++nmasters_keymatch;
864 				}
865 			} else if (chain &&
866 				   (key == (hammer2_key_t)-1 ||
867 				    chain->bref.key == key)) {
868 				++nmasters_keymatch;
869 				if (quorum_tid < last_best_quorum_tid &&
870 				    (quorum_tid < chain->bref.modify_tid ||
871 				     nmasters == 0)) {
872 					/*
873 					 * Better TID located, reset
874 					 * nmasters count.
875 					 */
876 					nmasters = 0;
877 					quorum_tid = chain->bref.modify_tid;
878 				}
879 				if (quorum_tid == chain->bref.modify_tid) {
880 					/*
881 					 * TID matches current collection.
882 					 */
883 					++nmasters;
884 					if (chain->error == 0) {
885 						cluster->focus = chain;
886 						cluster->focus_index = i;
887 					}
888 				}
889 			}
890 		}
891 		if (nmasters >= nquorum)
892 			break;
893 		last_best_quorum_tid = quorum_tid;
894 	}
895 
896 	/*
897 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
898 		nmasters, nquorum, nmasters_keymatch, umasters);
899 	*/
900 
901 	/*
902 	 * Early return if we do not have enough masters.
903 	 */
904 	if (nmasters < nquorum) {
905 		if (nmasters + umasters >= nquorum)
906 			return EINPROGRESS;
907 		if (nmasters_keymatch < nquorum)
908 			return ESRCH;
909 		return EDEADLK;
910 	}
911 
912 	/*
913 	 * Validated end of scan.
914 	 */
915 	if (flags & HAMMER2_CHECK_NULL)
916 		return ENOENT;
917 
918 	/*
919 	 * If we have a NULL focus at this point the agreeing quorum all
920 	 * had chain errors.
921 	 */
922 	if (cluster->focus == NULL)
923 		return EIO;
924 
925 	/*
926 	 * Pass 3
927 	 *
928 	 * We have quorum agreement, validate elements, not end of scan.
929 	 */
930 	for (i = 0; i < cluster->nchains; ++i) {
931 		chain = cluster->array[i].chain;
932 		if (chain == NULL ||
933 		    chain->bref.key != key ||
934 		    chain->bref.modify_tid != quorum_tid) {
935 			continue;
936 		}
937 
938 		switch (cluster->pmp->pfs_types[i]) {
939 		case HAMMER2_PFSTYPE_MASTER:
940 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
941 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
942 			nflags |= HAMMER2_CLUSTER_WRHARD;
943 			nflags |= HAMMER2_CLUSTER_RDHARD;
944 			break;
945 		case HAMMER2_PFSTYPE_SLAVE:
946 			/*
947 			 * We must have enough up-to-date masters to reach
948 			 * a quorum and the slave modify_tid must match the
949 			 * quorum's modify_tid.
950 			 *
951 			 * Do not select an errored slave.
952 			 */
953 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
954 			nflags |= HAMMER2_CLUSTER_RDHARD;
955 			++nslaves;
956 			break;
957 		case HAMMER2_PFSTYPE_SOFT_MASTER:
958 			/*
959 			 * Directly mounted soft master always wins.  There
960 			 * should be only one.
961 			 */
962 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
963 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
964 			break;
965 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
966 			/*
967 			 * Directly mounted soft slave always wins.  There
968 			 * should be only one.
969 			 *
970 			 * XXX
971 			 */
972 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
973 			break;
974 		case HAMMER2_PFSTYPE_SUPROOT:
975 			/*
976 			 * spmp (degenerate case)
977 			 */
978 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
979 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
980 			break;
981 		default:
982 			break;
983 		}
984 	}
985 
986 	/*
987 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
988 	 * is bad or if we are at the PFS root (the bref won't match at
989 	 * the PFS root, obviously).
990 	 */
991 	focus = cluster->focus;
992 	if (focus) {
993 		cluster->ddflag =
994 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
995 	} else {
996 		cluster->ddflag = 0;
997 		goto skip4;
998 	}
999 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1000 		goto skip4;
1001 
1002 	/*
1003 	 * Pass 4
1004 	 *
1005 	 * Validate the elements that were not marked invalid.  They should
1006 	 * match.
1007 	 */
1008 	for (i = 0; i < cluster->nchains; ++i) {
1009 		int ddflag;
1010 
1011 		chain = cluster->array[i].chain;
1012 
1013 		if (chain == NULL)
1014 			continue;
1015 		if (chain == focus)
1016 			continue;
1017 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1018 			continue;
1019 
1020 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1021 		if (chain->bref.type != focus->bref.type ||
1022 		    chain->bref.key != focus->bref.key ||
1023 		    chain->bref.keybits != focus->bref.keybits ||
1024 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1025 		    chain->bytes != focus->bytes ||
1026 		    ddflag != cluster->ddflag) {
1027 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1028 			if (hammer2_debug & 1)
1029 			kprintf("cluster_resolve: matching modify_tid failed "
1030 				"bref test: idx=%d type=%02x/%02x "
1031 				"key=%016jx/%d-%016jx/%d "
1032 				"mod=%016jx/%016jx bytes=%u/%u\n",
1033 				i,
1034 				chain->bref.type, focus->bref.type,
1035 				chain->bref.key, chain->bref.keybits,
1036 				focus->bref.key, focus->bref.keybits,
1037 				chain->bref.modify_tid, focus->bref.modify_tid,
1038 				chain->bytes, focus->bytes);
1039 			if (hammer2_debug & 0x4000)
1040 				panic("cluster_resolve");
1041 			/* flag issue and force resync? */
1042 		}
1043 	}
1044 skip4:
1045 
1046 	if (ttlslaves == 0)
1047 		nflags |= HAMMER2_CLUSTER_NOSOFT;
1048 	if (ttlmasters == 0)
1049 		nflags |= HAMMER2_CLUSTER_NOHARD;
1050 
1051 	/*
1052 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1053 	 * all available nodes (even if 0 are available) are fully
1054 	 * synchronized.  This is used by the synchronization thread to
1055 	 * determine if there is work it could potentially accomplish.
1056 	 */
1057 	if (nslaves == ttlslaves)
1058 		nflags |= HAMMER2_CLUSTER_SSYNCED;
1059 	if (nmasters == ttlmasters)
1060 		nflags |= HAMMER2_CLUSTER_MSYNCED;
1061 
1062 	/*
1063 	 * Determine if the cluster was successfully locked for the
1064 	 * requested operation and generate an error code.  The cluster
1065 	 * will not be locked (or ref'd) if an error is returned.
1066 	 *
1067 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
1068 	 * to determine if reading or writing is possible.  If writing, the
1069 	 * cluster still requires a call to hammer2_cluster_modify() first.
1070 	 */
1071 	atomic_set_int(&cluster->flags, nflags);
1072 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1073 
1074 	return 0;
1075 }
1076 
1077 /*
1078  * This is used by the sync thread to force non-NULL elements of a copy
1079  * of the pmp->iroot cluster to be good which is required to prime the
1080  * sync.
1081  */
1082 void
1083 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1084 {
1085 	int i;
1086 
1087 	for (i = 0; i < cluster->nchains; ++i) {
1088 		if (cluster->array[i].chain)
1089 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1090 	}
1091 }
1092 
1093 /*
1094  * Copy a cluster, returned a ref'd cluster.  All underlying chains
1095  * are also ref'd, but not locked.  Focus state is also copied.
1096  *
1097  * Original cluster does not have to be locked but usually is.
1098  * New cluster will not be flagged as locked.
1099  *
1100  * Callers using this function to initialize a new cluster from an inode
1101  * generally lock and resolve the resulting cluster.
1102  *
1103  * Callers which use this function to save/restore a cluster structure
1104  * generally retain the focus state and do not re-resolve it.  Caller should
1105  * not try to re-resolve internal (cparent) node state during an iteration
1106  * as the individual tracking elements of cparent in an iteration may not
1107  * match even though they are correct.
1108  */
1109 hammer2_cluster_t *
1110 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1111 {
1112 	hammer2_pfs_t *pmp = ocluster->pmp;
1113 	hammer2_cluster_t *ncluster;
1114 	hammer2_chain_t *chain;
1115 	int i;
1116 
1117 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1118 	ncluster->pmp = pmp;
1119 	ncluster->nchains = ocluster->nchains;
1120 	ncluster->refs = 1;
1121 
1122 	for (i = 0; i < ocluster->nchains; ++i) {
1123 		chain = ocluster->array[i].chain;
1124 		ncluster->array[i].chain = chain;
1125 		ncluster->array[i].flags = ocluster->array[i].flags;
1126 		if (chain)
1127 			hammer2_chain_ref(chain);
1128 	}
1129 	ncluster->focus_index = ocluster->focus_index;
1130 	ncluster->focus = ocluster->focus;
1131 	ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1132 					      HAMMER2_CLUSTER_INODE);
1133 
1134 	return (ncluster);
1135 }
1136 
1137 /*
1138  * Unlock a cluster.  Refcount and focus is maintained.
1139  */
1140 void
1141 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
1142 {
1143 	hammer2_chain_t *chain;
1144 	int i;
1145 
1146 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1147 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1148 			cluster);
1149 	}
1150 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1151 	KKASSERT(cluster->refs > 0);
1152 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1153 
1154 	for (i = 0; i < cluster->nchains; ++i) {
1155 		if (i == idx)
1156 			continue;
1157 		chain = cluster->array[i].chain;
1158 		if (chain)
1159 			hammer2_chain_unlock(chain);
1160 	}
1161 }
1162 
1163 void
1164 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1165 {
1166 	hammer2_cluster_unlock_except(cluster, -1);
1167 }
1168 
1169 /*
1170  * Set an inode's cluster modified, marking the related chains RW and
1171  * duplicating them if necessary.
1172  *
1173  * The passed-in chain is a localized copy of the chain previously acquired
1174  * when the inode was locked (and possilby replaced in the mean time), and
1175  * must also be updated.  In fact, we update it first and then synchronize
1176  * the inode's cluster cache.
1177  */
1178 hammer2_inode_data_t *
1179 hammer2_cluster_modify_ip(hammer2_inode_t *ip,
1180 			  hammer2_cluster_t *cluster, int flags)
1181 {
1182 	hammer2_inode_modify(ip);
1183 	hammer2_cluster_modify(cluster, flags);
1184 	hammer2_inode_repoint(ip, NULL, cluster);
1185 	return (&hammer2_cluster_wdata(cluster)->ipdata);
1186 }
1187 
1188 /*
1189  * Adjust the cluster's chains to allow modification and adjust the
1190  * focus.  Data will be accessible on return.
1191  *
1192  * If our focused master errors on modify, re-resolve the cluster to
1193  * try to select a different master.
1194  */
1195 void
1196 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
1197 {
1198 	hammer2_chain_t *chain;
1199 	int resolve_again;
1200 	int i;
1201 
1202 	resolve_again = 0;
1203 	for (i = 0; i < cluster->nchains; ++i) {
1204 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1205 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1206 			continue;
1207 		}
1208 		chain = cluster->array[i].chain;
1209 		if (chain == NULL)
1210 			continue;
1211 		if (chain->error)
1212 			continue;
1213 		hammer2_chain_modify(chain, flags);
1214 		if (cluster->focus == chain && chain->error) {
1215 			cluster->error = chain->error;
1216 			resolve_again = 1;
1217 		}
1218 	}
1219 	if (resolve_again)
1220 		hammer2_cluster_resolve(cluster);
1221 }
1222 
1223 /*
1224  * Synchronize modifications from the focus to other chains in a cluster.
1225  * Convenient because nominal API users can just modify the contents of the
1226  * focus (at least for non-blockref data).
1227  *
1228  * Nominal front-end operations only edit non-block-table data in a single
1229  * chain.  This code copies such modifications to the other chains in the
1230  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1231  * by both the frontend and the backend and will explode in fireworks if
1232  * blindly copied.
1233  */
1234 void
1235 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1236 {
1237 	hammer2_chain_t *focus;
1238 	hammer2_chain_t *scan;
1239 	const hammer2_inode_data_t *ripdata;
1240 	hammer2_inode_data_t *wipdata;
1241 	int i;
1242 
1243 	focus = cluster->focus;
1244 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1245 
1246 	for (i = 0; i < cluster->nchains; ++i) {
1247 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1248 			continue;
1249 		scan = cluster->array[i].chain;
1250 		if (scan == NULL || scan == focus)
1251 			continue;
1252 		if (scan->error)
1253 			continue;
1254 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1255 		KKASSERT(focus->bytes == scan->bytes &&
1256 			 focus->bref.type == scan->bref.type);
1257 		switch(focus->bref.type) {
1258 		case HAMMER2_BREF_TYPE_INODE:
1259 			ripdata = &focus->data->ipdata;
1260 			wipdata = &scan->data->ipdata;
1261 			if ((ripdata->meta.op_flags &
1262 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1263 				bcopy(ripdata, wipdata,
1264 				      offsetof(hammer2_inode_data_t, u));
1265 				break;
1266 			}
1267 			/* fall through to full copy */
1268 		case HAMMER2_BREF_TYPE_DATA:
1269 			bcopy(focus->data, scan->data, focus->bytes);
1270 			break;
1271 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1272 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1273 		case HAMMER2_BREF_TYPE_FREEMAP:
1274 		case HAMMER2_BREF_TYPE_VOLUME:
1275 			panic("hammer2_cluster_modsync: illegal node type");
1276 			/* NOT REACHED */
1277 			break;
1278 		default:
1279 			panic("hammer2_cluster_modsync: unknown node type");
1280 			break;
1281 		}
1282 	}
1283 }
1284 
1285 /*
1286  * Lookup initialization/completion API.  Returns a locked, fully resolved
1287  * cluster with one ref.
1288  */
1289 hammer2_cluster_t *
1290 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1291 {
1292 	hammer2_cluster_t *cluster;
1293 
1294 	cluster = hammer2_cluster_copy(cparent);
1295 	if (flags & HAMMER2_LOOKUP_SHARED) {
1296 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1297 					      HAMMER2_RESOLVE_SHARED);
1298 	} else {
1299 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1300 	}
1301 	hammer2_cluster_resolve(cluster);
1302 
1303 	return (cluster);
1304 }
1305 
1306 void
1307 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1308 {
1309 	if (cparent) {
1310 		hammer2_cluster_unlock(cparent);
1311 		hammer2_cluster_drop(cparent);
1312 	}
1313 }
1314 
1315 /*
1316  * Locate first match or overlap under parent, return a new, locked, resolved
1317  * cluster with one ref.
1318  *
1319  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1320  */
1321 hammer2_cluster_t *
1322 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1323 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1324 {
1325 	hammer2_pfs_t *pmp;
1326 	hammer2_cluster_t *cluster;
1327 	hammer2_chain_t *chain;
1328 	hammer2_key_t key_accum;
1329 	hammer2_key_t key_next;
1330 	int null_count;
1331 	int rflags;
1332 	int i;
1333 
1334 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1335 
1336 	pmp = cparent->pmp;				/* can be NULL */
1337 	key_accum = *key_nextp;
1338 	null_count = 0;
1339 	if (flags & HAMMER2_LOOKUP_SHARED)
1340 		rflags = HAMMER2_RESOLVE_SHARED;
1341 	else
1342 		rflags = 0;
1343 
1344 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1345 	cluster->pmp = pmp;				/* can be NULL */
1346 	cluster->refs = 1;
1347 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1348 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1349 
1350 	/*
1351 	 * Iterating earlier cluster elements with later elements still
1352 	 * locked is a problem, so we have to unlock the parent and then
1353 	 * re-lock as we go.
1354 	 */
1355 	hammer2_cluster_unlock(cparent);
1356 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1357 
1358 	/*
1359 	 * Pass-1, issue lookups.
1360 	 */
1361 	for (i = 0; i < cparent->nchains; ++i) {
1362 		cluster->array[i].flags = cparent->array[i].flags;
1363 		key_next = *key_nextp;
1364 
1365 		/*
1366 		 * Always relock the parent as we go.
1367 		 */
1368 		if (cparent->array[i].chain) {
1369 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1370 		}
1371 
1372 		/*
1373 		 * Nothing to base the lookup, or parent was not synchronized.
1374 		 */
1375 		if (cparent->array[i].chain == NULL ||
1376 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1377 			++null_count;
1378 			continue;
1379 		}
1380 
1381 		chain = hammer2_chain_lookup(&cparent->array[i].chain,
1382 					     &key_next,
1383 					     key_beg, key_end,
1384 					     &cparent->array[i].cache_index,
1385 					     flags);
1386 		cluster->array[i].chain = chain;
1387 		if (chain == NULL) {
1388 			++null_count;
1389 		}
1390 		if (key_accum > key_next)
1391 			key_accum = key_next;
1392 	}
1393 
1394 	/*
1395 	 * Cleanup
1396 	 */
1397 	cluster->nchains = i;
1398 	*key_nextp = key_accum;
1399 
1400 	/*
1401 	 * The cluster must be resolved, out of sync elements may be present.
1402 	 *
1403 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1404 	 */
1405 	if (null_count != i)
1406 		hammer2_cluster_resolve(cluster);
1407 	if (null_count == i ||
1408 	    (cluster->focus == NULL &&
1409 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1410 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1411 			hammer2_cluster_unlock(cluster);
1412 		hammer2_cluster_drop(cluster);
1413 		cluster = NULL;
1414 	}
1415 
1416 	return (cluster);
1417 }
1418 
1419 /*
1420  * Locate next match or overlap under parent, replace the passed-in cluster.
1421  * The returned cluster is a new, locked, resolved cluster with one ref.
1422  *
1423  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1424  */
1425 hammer2_cluster_t *
1426 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1427 		     hammer2_key_t *key_nextp,
1428 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1429 {
1430 	hammer2_chain_t *ochain;
1431 	hammer2_chain_t *nchain;
1432 	hammer2_key_t key_accum;
1433 	hammer2_key_t key_next;
1434 	int parent_index;
1435 	int cluster_index;
1436 	int null_count;
1437 	int rflags;
1438 	int i;
1439 
1440 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1441 
1442 	key_accum = *key_nextp;
1443 	null_count = 0;
1444 	parent_index = cparent->focus_index;	/* save prior focus */
1445 	cluster_index = cluster->focus_index;
1446 	if (flags & HAMMER2_LOOKUP_SHARED)
1447 		rflags = HAMMER2_RESOLVE_SHARED;
1448 	else
1449 		rflags = 0;
1450 
1451 	cluster->focus = NULL;		/* XXX needed any more? */
1452 	/*cparent->focus = NULL;*/
1453 	cluster->focus_index = 0;	/* XXX needed any more? */
1454 	/*cparent->focus_index = 0;*/
1455 
1456 	cluster->ddflag = 0;
1457 
1458 	/*
1459 	 * The parent is always locked on entry, the iterator may be locked
1460 	 * depending on flags.
1461 	 *
1462 	 * We must temporarily unlock the passed-in clusters to avoid a
1463 	 * deadlock between elements of the cluster with other threads.
1464 	 * We will fixup the lock in the loop.
1465 	 *
1466 	 * Note that this will clear the focus.
1467 	 *
1468 	 * Reflag the clusters as locked, because we will relock them
1469 	 * as we go.
1470 	 */
1471 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1472 		hammer2_cluster_unlock(cluster);
1473 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1474 	}
1475 	hammer2_cluster_unlock(cparent);
1476 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1477 
1478 	for (i = 0; i < cparent->nchains; ++i) {
1479 		key_next = *key_nextp;
1480 		ochain = cluster->array[i].chain;
1481 
1482 		/*
1483 		 * Always relock the parent as we go.
1484 		 */
1485 		if (cparent->array[i].chain)
1486 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1487 
1488 		/*
1489 		 * Nothing to iterate from.  These cases can occur under
1490 		 * normal operations.  For example, during synchronization
1491 		 * a slave might reach the end of its scan while records
1492 		 * are still left on the master(s).
1493 		 */
1494 		if (ochain == NULL) {
1495 			++null_count;
1496 			continue;
1497 		}
1498 		if (cparent->array[i].chain == NULL ||
1499 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1500 		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1501 			/* ochain has not yet been relocked */
1502 			hammer2_chain_drop(ochain);
1503 			cluster->array[i].chain = NULL;
1504 			++null_count;
1505 			continue;
1506 		}
1507 
1508 		/*
1509 		 * Relock the child if necessary.  Parent and child will then
1510 		 * be locked as expected by hammer2_chain_next() and flags.
1511 		 */
1512 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1513 			hammer2_chain_lock(ochain, rflags);
1514 		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1515 					    &key_next, key_beg, key_end,
1516 					    &cparent->array[i].cache_index,
1517 					    flags);
1518 		/* ochain now invalid but can still be used for focus check */
1519 		if (parent_index == i) {
1520 			cparent->focus_index = i;
1521 			cparent->focus = cparent->array[i].chain;
1522 		}
1523 
1524 		cluster->array[i].chain = nchain;
1525 		if (nchain == NULL) {
1526 			++null_count;
1527 		}
1528 		if (key_accum > key_next)
1529 			key_accum = key_next;
1530 	}
1531 
1532 	/*
1533 	 * Cleanup
1534 	 */
1535 	cluster->nchains = i;
1536 	*key_nextp = key_accum;
1537 
1538 	/*
1539 	 * The cluster must be resolved, out of sync elements may be present.
1540 	 *
1541 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1542 	 */
1543 	if (null_count != i)
1544 		hammer2_cluster_resolve(cluster);
1545 	if (null_count == i ||
1546 	    (cluster->focus == NULL &&
1547 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1548 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1549 			hammer2_cluster_unlock(cluster);
1550 		hammer2_cluster_drop(cluster);
1551 		cluster = NULL;
1552 	}
1553 	return(cluster);
1554 }
1555 
1556 /*
1557  * Advance just one chain in the cluster and recalculate the invalid bit.
1558  * The cluster index is allowed to be flagged invalid on input and is
1559  * recalculated on return.
1560  *
1561  * (used during synchronization to advance past a chain being deleted).
1562  *
1563  * The chain being advanced must not be the focus and the clusters in
1564  * question must have already passed normal cluster_lookup/cluster_next
1565  * checks.
1566  *
1567  * The cluster always remains intact on return, so void function.
1568  */
1569 void
1570 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1571 				  hammer2_cluster_t *cluster,
1572 				  hammer2_key_t *key_nextp,
1573 				  hammer2_key_t key_beg,
1574 				  hammer2_key_t key_end,
1575 				  int i, int flags)
1576 {
1577 	hammer2_chain_t *ochain;
1578 	hammer2_chain_t *nchain;
1579 	hammer2_chain_t *focus;
1580 	hammer2_key_t key_accum;
1581 	hammer2_key_t key_next;
1582 	int ddflag;
1583 
1584 	key_accum = *key_nextp;
1585 	key_next = *key_nextp;
1586 	ochain = cluster->array[i].chain;
1587 	if (ochain == NULL)
1588 		goto done;
1589 	KKASSERT(ochain != cluster->focus);
1590 
1591 	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1592 				    &key_next, key_beg, key_end,
1593 				    &cparent->array[i].cache_index,
1594 				    flags);
1595 	/* ochain now invalid */
1596 	if (cparent->focus_index == i)
1597 		cparent->focus = cparent->array[i].chain;
1598 
1599 	/*
1600 	 * Install nchain.  Note that nchain can be NULL, and can also
1601 	 * be in an unlocked state depending on flags.
1602 	 */
1603 	cluster->array[i].chain = nchain;
1604 	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1605 
1606 	if (key_accum > key_next)
1607 		key_accum = key_next;
1608 
1609 	focus = cluster->focus;
1610 	if (focus == NULL)
1611 		goto done;
1612 	if (nchain == NULL)
1613 		goto done;
1614 #if 0
1615 	if (nchain == focus)	/* ASSERTED NOT TRUE */
1616 		...
1617 #endif
1618 	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1619 	if (nchain->bref.type != focus->bref.type ||
1620 	    nchain->bref.key != focus->bref.key ||
1621 	    nchain->bref.keybits != focus->bref.keybits ||
1622 	    nchain->bref.modify_tid != focus->bref.modify_tid ||
1623 	    nchain->bytes != focus->bytes ||
1624 	    ddflag != cluster->ddflag) {
1625 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1626 	}
1627 
1628 done:
1629 	*key_nextp = key_accum;
1630 #if 0
1631 	/*
1632 	 * For now don't re-resolve cluster->flags.
1633 	 */
1634 	hammer2_cluster_resolve(cluster);
1635 #endif
1636 }
1637 
1638 /*
1639  * Mark a cluster deleted
1640  */
1641 void
1642 hammer2_cluster_delete(hammer2_cluster_t *cparent,
1643 		       hammer2_cluster_t *cluster, int flags)
1644 {
1645 	hammer2_chain_t *chain;
1646 	hammer2_chain_t *parent;
1647 	int i;
1648 
1649 	if (cparent == NULL) {
1650 		kprintf("cparent is NULL\n");
1651 		return;
1652 	}
1653 
1654 	for (i = 0; i < cluster->nchains; ++i) {
1655 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1656 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1657 			continue;
1658 		}
1659 		parent = cparent->array[i].chain;
1660 		chain = cluster->array[i].chain;
1661 		if (chain == NULL)
1662 			continue;
1663 		if (chain->parent != parent) {
1664 			kprintf("hammer2_cluster_delete: parent "
1665 				"mismatch chain=%p parent=%p against=%p\n",
1666 				chain, chain->parent, parent);
1667 		} else {
1668 			hammer2_chain_delete(parent, chain, flags);
1669 		}
1670 	}
1671 }
1672 
1673 /*
1674  * Create a snapshot of the specified {parent, ochain} with the specified
1675  * label.  The originating hammer2_inode must be exclusively locked for
1676  * safety.
1677  *
1678  * The ioctl code has already synced the filesystem.
1679  */
1680 int
1681 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
1682 		       hammer2_ioc_pfs_t *pmp)
1683 {
1684 	hammer2_dev_t *hmp;
1685 	const hammer2_inode_data_t *ripdata;
1686 	hammer2_inode_data_t *wipdata;
1687 	hammer2_chain_t *nchain;
1688 	hammer2_inode_t *nip;
1689 	size_t name_len;
1690 	hammer2_key_t lhc;
1691 	struct vattr vat;
1692 #if 0
1693 	uuid_t opfs_clid;
1694 #endif
1695 	int error;
1696 
1697 	kprintf("snapshot %s\n", pmp->name);
1698 
1699 	name_len = strlen(pmp->name);
1700 	lhc = hammer2_dirhash(pmp->name, name_len);
1701 
1702 	/*
1703 	 * Get the clid
1704 	 */
1705 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1706 #if 0
1707 	opfs_clid = ripdata->meta.pfs_clid;
1708 #endif
1709 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1710 
1711 	/*
1712 	 * Create the snapshot directory under the super-root
1713 	 *
1714 	 * Set PFS type, generate a unique filesystem id, and generate
1715 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1716 	 * which theoretically allows the snapshot to be used as part of
1717 	 * the same cluster (perhaps as a cache).
1718 	 *
1719 	 * Copy the (flushed) blockref array.  Theoretically we could use
1720 	 * chain_duplicate() but it becomes difficult to disentangle
1721 	 * the shared core so for now just brute-force it.
1722 	 */
1723 	VATTR_NULL(&vat);
1724 	vat.va_type = VDIR;
1725 	vat.va_mode = 0755;
1726 	nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
1727 				   pmp->name, name_len, 0,
1728 				   1, 0, 0,
1729 				   HAMMER2_INSERT_PFSROOT, &error);
1730 
1731 	if (nip) {
1732 		hammer2_inode_modify(nip);
1733 		nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
1734 		hammer2_chain_modify(nchain, 0);
1735 		wipdata = &nchain->data->ipdata;
1736 
1737 		nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1738 		nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1739 		nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1740 		kern_uuidgen(&nip->meta.pfs_fsid, 1);
1741 
1742 		/*
1743 		 * Give the snapshot its own private cluster id.  As a
1744 		 * snapshot no further synchronization with the original
1745 		 * cluster will be done.
1746 		 */
1747 #if 0
1748 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1749 			nip->meta.pfs_clid = opfs_clid;
1750 		else
1751 			kern_uuidgen(&nip->meta.pfs_clid, 1);
1752 #endif
1753 		kern_uuidgen(&nip->meta.pfs_clid, 1);
1754 		nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1755 
1756 		/* XXX hack blockset copy */
1757 		/* XXX doesn't work with real cluster */
1758 		KKASSERT(ocluster->nchains == 1);
1759 		wipdata->meta = nip->meta;
1760 		wipdata->u.blockset = ripdata->u.blockset;
1761 		hammer2_flush(nchain, 1);
1762 		hammer2_chain_unlock(nchain);
1763 		hammer2_chain_drop(nchain);
1764 		hammer2_inode_unlock(nip);
1765 	}
1766 	return (error);
1767 }
1768 
1769 /************************************************************************
1770  *			        CLUSTER I/O 				*
1771  ************************************************************************
1772  *
1773  *
1774  * WARNING! blockref[] array data is not universal.  These functions should
1775  *	    only be used to access universal data.
1776  *
1777  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1778  *	    complete if necessary.  The I/O's should have already been
1779  *	    initiated by the cluster_lock/chain_lock operation.
1780  *
1781  *	    The cluster must already be in a modified state before wdata
1782  *	    is called.  The data will already be available for this case.
1783  */
1784 const hammer2_media_data_t *
1785 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1786 {
1787 	KKASSERT(cluster->focus != NULL);
1788 	return(cluster->focus->data);
1789 }
1790 
1791 const hammer2_media_data_t *
1792 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
1793 {
1794 	KKASSERT(cluster->focus != NULL);
1795 	*bytesp = cluster->focus->bytes;
1796 	return(cluster->focus->data);
1797 }
1798 
1799 hammer2_media_data_t *
1800 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1801 {
1802 	KKASSERT(cluster->focus != NULL);
1803 	KKASSERT(hammer2_cluster_modified(cluster));
1804 	return(cluster->focus->data);
1805 }
1806