xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision 02d0b1ce2a4f5e91f8a38d977e6239e4483d4d48)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125 	hammer2_chain_t *chain;
126 	int i;
127 
128 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 	for (i = 0; i < cluster->nchains; ++i) {
130 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 			continue;
132 		chain = cluster->array[i].chain;
133 		if (chain == NULL)
134 			continue;
135 		if (chain->error)
136 			continue;
137 		if (chain->bytes != bytes)
138 			return 1;
139 	}
140 	return 0;
141 }
142 
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 	if (cluster->error == 0)
154 		return(cluster->focus->bref.type);
155 	return 0;
156 }
157 
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 	if (cluster->error == 0)
168 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169 	return 0;
170 }
171 
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 	if (cluster->error == 0) {
187 		*bref = cluster->focus->bref;
188 		bref->data_off = 0;
189 	} else {
190 		bzero(bref, sizeof(*bref));
191 	}
192 }
193 
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int flags;
208 	int i;
209 
210 	flags = 0;
211 	for (i = 0; i < cluster->nchains; ++i) {
212 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213 			continue;
214 		chain = cluster->array[i].chain;
215 		if (chain)
216 			flags |= chain->flags;
217 	}
218 	return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220 
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230 	hammer2_chain_t *chain;
231 	int i;
232 
233 	for (i = 0; i < cluster->nchains; ++i) {
234 		chain = cluster->array[i].chain;
235 		if (chain)
236 			atomic_set_int(&chain->flags, flags);
237 	}
238 }
239 
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249 	hammer2_chain_t *chain;
250 	int i;
251 
252 	for (i = 0; i < cluster->nchains; ++i) {
253 		chain = cluster->array[i].chain;
254 		if (chain)
255 			atomic_clear_int(&chain->flags, flags);
256 	}
257 }
258 
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271 	hammer2_chain_t *chain;
272 	int i;
273 
274 	for (i = 0; i < cluster->nchains; ++i) {
275 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276 			continue;
277 		chain = cluster->array[i].chain;
278 		if (chain == NULL)
279 			continue;
280 		if (chain->error)
281 			continue;
282 		hammer2_chain_setflush(trans, chain);
283 	}
284 }
285 
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294 				hammer2_cluster_t *cluster,
295 				int check_algo)
296 {
297 	hammer2_chain_t *chain;
298 	int i;
299 
300 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301 	for (i = 0; i < cluster->nchains; ++i) {
302 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304 			continue;
305 		}
306 		chain = cluster->array[i].chain;
307 		if (chain == NULL)
308 			continue;
309 		if (chain->error)
310 			continue;
311 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312 		chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313 		chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314 	}
315 }
316 
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330 	hammer2_cluster_t *cluster;
331 
332 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333 	cluster->array[0].chain = chain;
334 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335 	cluster->nchains = 1;
336 	cluster->focus = chain;
337 	cluster->focus_index = 0;
338 	cluster->pmp = chain->pmp;
339 	cluster->refs = 1;
340 	cluster->error = chain->error;
341 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
342 			 HAMMER2_CLUSTER_WRHARD |
343 			 HAMMER2_CLUSTER_RDHARD |
344 			 HAMMER2_CLUSTER_MSYNCED |
345 			 HAMMER2_CLUSTER_SSYNCED;
346 
347 	return cluster;
348 }
349 
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359 	atomic_add_int(&cluster->refs, 1);
360 }
361 
362 /*
363  * Drop the caller's reference to the cluster.  When the ref count drops to
364  * zero this function frees the cluster and drops all underlying chains.
365  *
366  * In-progress read I/Os are typically detached from the cluster once the
367  * first one returns (the remaining stay attached to the DIOs but are then
368  * ignored and drop naturally).
369  */
370 void
371 hammer2_cluster_drop(hammer2_cluster_t *cluster)
372 {
373 	hammer2_chain_t *chain;
374 	int i;
375 
376 	KKASSERT(cluster->refs > 0);
377 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
378 		cluster->focus = NULL;		/* safety XXX chg to assert */
379 		cluster->focus_index = 0;
380 
381 		for (i = 0; i < cluster->nchains; ++i) {
382 			chain = cluster->array[i].chain;
383 			if (chain) {
384 				hammer2_chain_drop(chain);
385 				cluster->array[i].chain = NULL; /* safety */
386 			}
387 		}
388 		cluster->nchains = 0;				/* safety */
389 
390 		kfree(cluster, M_HAMMER2);
391 		/* cluster is invalid */
392 	}
393 }
394 
395 void
396 hammer2_cluster_wait(hammer2_cluster_t *cluster)
397 {
398 	tsleep(cluster->focus, 0, "h2clcw", 1);
399 }
400 
401 /*
402  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
403  *
404  * WARNING! This function expects the caller to handle resolution of the
405  *	    cluster.  We never re-resolve the cluster in this function,
406  *	    because it might be used to temporarily unlock/relock a cparent
407  *	    in an iteration or recursrion, and the cparents elements do not
408  *	    necessarily match.
409  */
410 void
411 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
412 {
413 	hammer2_chain_t *chain;
414 	int i;
415 
416 	/* cannot be on inode-embedded cluster template, must be on copy */
417 	KKASSERT(cluster->refs > 0);
418 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
419 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
420 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
421 			cluster);
422 	}
423 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
424 
425 	/*
426 	 * Lock chains and resolve state.
427 	 */
428 	for (i = 0; i < cluster->nchains; ++i) {
429 		if (i == idx)
430 			continue;
431 		chain = cluster->array[i].chain;
432 		if (chain == NULL)
433 			continue;
434 		hammer2_chain_lock(chain, how);
435 	}
436 }
437 
438 void
439 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
440 {
441 	hammer2_cluster_lock_except(cluster, -1, how);
442 }
443 
444 /*
445  * Calculate the clustering state for the cluster and set its focus.
446  * This routine must be called with care.  For example, it should not
447  * normally be called after relocking a non-leaf cluster because parent
448  * clusters help iterations and each element might be at a slightly different
449  * indirect node (each node's topology is independently indexed).
450  *
451  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
452  * operations.  Typically this is only set on a quorum of MASTERs or
453  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
454  * is present, this bit is *not* set on a quorum of MASTERs.  The
455  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
456  * that create/modify/delete elements use it.
457  *
458  * The chains making up the cluster may be narrowed down based on quorum
459  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
460  * narrowed down to a single chain as long as the entire subtopology is known
461  * to be intact.  So, for example, we can narrow a read-only op to a single
462  * fast SLAVE but if we focus a CACHE chain we must still retain at least
463  * a SLAVE to ensure that the subtopology can be accessed.
464  *
465  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
466  * to be maintained once the topology is validated as-of the top level of
467  * the operation.
468  *
469  * If a failure occurs the operation must be aborted by higher-level code and
470  * retried. XXX
471  */
472 void
473 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
474 {
475 	hammer2_chain_t *chain;
476 	hammer2_chain_t *focus;
477 	hammer2_pfs_t *pmp;
478 	hammer2_tid_t quorum_tid;
479 	hammer2_tid_t last_best_quorum_tid;
480 	int focus_pfs_type;
481 	uint32_t nflags;
482 	int ttlmasters;
483 	int ttlslaves;
484 	int nmasters;
485 	int nslaves;
486 	int nquorum;
487 	int smpresent;
488 	int i;
489 
490 	cluster->error = 0;
491 	cluster->focus = NULL;
492 
493 	focus_pfs_type = 0;
494 	nflags = 0;
495 	ttlmasters = 0;
496 	ttlslaves = 0;
497 	nmasters = 0;
498 	nslaves = 0;
499 
500 	/*
501 	 * Calculate quorum
502 	 */
503 	pmp = cluster->pmp;
504 	KKASSERT(pmp != NULL || cluster->nchains == 0);
505 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
506 	smpresent = 0;
507 
508 	/*
509 	 * Pass 1
510 	 *
511 	 * NOTE: A NULL chain is not necessarily an error, it could be
512 	 *	 e.g. a lookup failure or the end of an iteration.
513 	 *	 Process normally.
514 	 */
515 	for (i = 0; i < cluster->nchains; ++i) {
516 		chain = cluster->array[i].chain;
517 		if (chain && chain->error) {
518 			if (cluster->focus == NULL || cluster->focus == chain) {
519 				/* error will be overridden by valid focus */
520 				cluster->error = chain->error;
521 			}
522 
523 			/*
524 			 * Must count total masters and slaves whether the
525 			 * chain is errored or not.
526 			 */
527 			switch (cluster->pmp->pfs_types[i]) {
528 			case HAMMER2_PFSTYPE_MASTER:
529 				++ttlmasters;
530 				break;
531 			case HAMMER2_PFSTYPE_SLAVE:
532 				++ttlslaves;
533 				break;
534 			}
535 			continue;
536 		}
537 		switch (cluster->pmp->pfs_types[i]) {
538 		case HAMMER2_PFSTYPE_MASTER:
539 			++ttlmasters;
540 			break;
541 		case HAMMER2_PFSTYPE_SLAVE:
542 			++ttlslaves;
543 			break;
544 		case HAMMER2_PFSTYPE_SOFT_MASTER:
545 			nflags |= HAMMER2_CLUSTER_WRSOFT;
546 			nflags |= HAMMER2_CLUSTER_RDSOFT;
547 			smpresent = 1;
548 			break;
549 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
550 			nflags |= HAMMER2_CLUSTER_RDSOFT;
551 			break;
552 		case HAMMER2_PFSTYPE_SUPROOT:
553 			/*
554 			 * Degenerate cluster representing the super-root
555 			 * topology on a single device.  Fake stuff so
556 			 * cluster ops work as expected.
557 			 */
558 			nflags |= HAMMER2_CLUSTER_WRHARD;
559 			nflags |= HAMMER2_CLUSTER_RDHARD;
560 			cluster->focus_index = i;
561 			cluster->focus = chain;
562 			cluster->error = chain ? chain->error : 0;
563 			break;
564 		default:
565 			break;
566 		}
567 	}
568 
569 	/*
570 	 * Pass 2
571 	 *
572 	 * Resolve masters.  Calculate nmasters for the highest matching
573 	 * TID, if a quorum cannot be attained try the next lower matching
574 	 * TID until we exhaust TIDs.
575 	 *
576 	 * NOTE: A NULL chain is not necessarily an error, it could be
577 	 *	 e.g. a lookup failure or the end of an iteration.
578 	 *	 Process normally.
579 	 */
580 	last_best_quorum_tid = HAMMER2_TID_MAX;
581 	quorum_tid = 0;		/* fix gcc warning */
582 
583 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
584 		nmasters = 0;
585 		quorum_tid = 0;
586 
587 		for (i = 0; i < cluster->nchains; ++i) {
588 			if (cluster->pmp->pfs_types[i] !=
589 			    HAMMER2_PFSTYPE_MASTER) {
590 				continue;
591 			}
592 			chain = cluster->array[i].chain;
593 
594 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
595 				/*
596 				 * Invalid as in unsynchronized, cannot be
597 				 * used to calculate the quorum.
598 				 */
599 			} else if (chain == NULL && quorum_tid == 0) {
600 				/*
601 				 * NULL chain on master matches NULL chains
602 				 * on other masters.
603 				 */
604 				++nmasters;
605 			} else if (quorum_tid < last_best_quorum_tid &&
606 				   chain != NULL &&
607 				   (quorum_tid < chain->bref.modify_tid ||
608 				    nmasters == 0)) {
609 				/*
610 				 * Better TID located, reset nmasters count.
611 				 */
612 				nmasters = 1;
613 				quorum_tid = chain->bref.modify_tid;
614 			} else if (chain &&
615 				   quorum_tid == chain->bref.modify_tid) {
616 				/*
617 				 * TID matches current collection.
618 				 */
619 				++nmasters;
620 			}
621 		}
622 		if (nmasters >= nquorum)
623 			break;
624 		last_best_quorum_tid = quorum_tid;
625 	}
626 
627 	/*
628 	 * Pass 3
629 	 *
630 	 * NOTE: A NULL chain is not necessarily an error, it could be
631 	 *	 e.g. a lookup failure or the end of an iteration.
632 	 *	 Process normally.
633 	 */
634 	for (i = 0; i < cluster->nchains; ++i) {
635 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
636 		chain = cluster->array[i].chain;
637 		if (chain && chain->error) {
638 			if (cluster->focus == NULL || cluster->focus == chain) {
639 				/* error will be overridden by valid focus */
640 				cluster->error = chain->error;
641 			}
642 			continue;
643 		}
644 
645 		switch (cluster->pmp->pfs_types[i]) {
646 		case HAMMER2_PFSTYPE_MASTER:
647 			/*
648 			 * We must have enough up-to-date masters to reach
649 			 * a quorum and the master modify_tid must match
650 			 * the quorum's modify_tid.
651 			 *
652 			 * Do not select an errored or out-of-sync master.
653 			 */
654 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
655 				nflags |= HAMMER2_CLUSTER_UNHARD;
656 			} else if (nmasters >= nquorum &&
657 				   (chain == NULL || chain->error == 0) &&
658 				   ((chain == NULL && quorum_tid == 0) ||
659 				    (chain != NULL && quorum_tid ==
660 						  chain->bref.modify_tid))) {
661 				nflags |= HAMMER2_CLUSTER_WRHARD;
662 				nflags |= HAMMER2_CLUSTER_RDHARD;
663 				if (!smpresent) {
664 					cluster->array[i].flags |=
665 							HAMMER2_CITEM_FEMOD;
666 				}
667 				if (cluster->focus == NULL ||
668 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
669 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
670 					cluster->focus_index = i;
671 					cluster->focus = chain; /* NULL ok */
672 					cluster->error = chain ? chain->error :
673 								 0;
674 				}
675 			} else if (chain == NULL || chain->error == 0) {
676 				nflags |= HAMMER2_CLUSTER_UNHARD;
677 			}
678 			break;
679 		case HAMMER2_PFSTYPE_SLAVE:
680 			/*
681 			 * We must have enough up-to-date masters to reach
682 			 * a quorum and the slave modify_tid must match the
683 			 * quorum's modify_tid.
684 			 *
685 			 * Do not select an errored slave.
686 			 */
687 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
688 				nflags |= HAMMER2_CLUSTER_UNHARD;
689 			} else if (nmasters >= nquorum &&
690 				   (chain == NULL || chain->error == 0) &&
691 				   ((chain == NULL && quorum_tid == 0) ||
692 				    (chain && quorum_tid ==
693 					      chain->bref.modify_tid))) {
694 				++nslaves;
695 				nflags |= HAMMER2_CLUSTER_RDHARD;
696 #if 0
697 				/* XXX optimize for RESOLVE_RDONLY */
698 				if (cluster->focus == NULL) {
699 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
700 					cluster->focus_index = i;
701 					cluster->focus = chain; /* NULL ok */
702 					cluster->error = chain ? chain->error :
703 								 0;
704 				}
705 #endif
706 			} else if (chain == NULL || chain->error == 0) {
707 				nflags |= HAMMER2_CLUSTER_UNSOFT;
708 			}
709 			break;
710 		case HAMMER2_PFSTYPE_SOFT_MASTER:
711 			/*
712 			 * Directly mounted soft master always wins.  There
713 			 * should be only one.
714 			 */
715 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
716 			cluster->focus_index = i;
717 			cluster->focus = chain;
718 			cluster->error = chain ? chain->error : 0;
719 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
720 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
721 			break;
722 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
723 			/*
724 			 * Directly mounted soft slave always wins.  There
725 			 * should be only one.
726 			 */
727 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
728 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
729 				cluster->focus_index = i;
730 				cluster->focus = chain;
731 				cluster->error = chain ? chain->error : 0;
732 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
733 			}
734 			break;
735 		case HAMMER2_PFSTYPE_SUPROOT:
736 			/*
737 			 * spmp (degenerate case)
738 			 */
739 			KKASSERT(i == 0);
740 			cluster->focus_index = i;
741 			cluster->focus = chain;
742 			cluster->error = chain ? chain->error : 0;
743 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
744 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
745 			break;
746 		default:
747 			break;
748 		}
749 	}
750 
751 	/*
752 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
753 	 * is bad or if we are at the PFS root (the bref won't match at
754 	 * the PFS root, obviously).
755 	 */
756 	focus = cluster->focus;
757 	if (focus) {
758 		cluster->ddflag =
759 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
760 	} else {
761 		cluster->ddflag = 0;
762 		goto skip4;
763 	}
764 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
765 		goto skip4;
766 
767 	/*
768 	 * Pass 4
769 	 *
770 	 * Validate the elements that were not marked invalid.  They should
771 	 * match.
772 	 */
773 	for (i = 0; i < cluster->nchains; ++i) {
774 		int ddflag;
775 
776 		chain = cluster->array[i].chain;
777 
778 		if (chain == NULL)
779 			continue;
780 		if (chain == focus)
781 			continue;
782 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
783 			continue;
784 
785 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
786 		if (chain->bref.type != focus->bref.type ||
787 		    chain->bref.key != focus->bref.key ||
788 		    chain->bref.keybits != focus->bref.keybits ||
789 		    chain->bref.modify_tid != focus->bref.modify_tid ||
790 		    chain->bytes != focus->bytes ||
791 		    ddflag != cluster->ddflag) {
792 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793 			if (hammer2_debug & 1)
794 			kprintf("cluster_resolve: matching modify_tid failed "
795 				"bref test: idx=%d type=%02x/%02x "
796 				"key=%016jx/%d-%016jx/%d "
797 				"mod=%016jx/%016jx bytes=%u/%u\n",
798 				i,
799 				chain->bref.type, focus->bref.type,
800 				chain->bref.key, chain->bref.keybits,
801 				focus->bref.key, focus->bref.keybits,
802 				chain->bref.modify_tid, focus->bref.modify_tid,
803 				chain->bytes, focus->bytes);
804 			if (hammer2_debug & 0x4000)
805 				panic("cluster_resolve");
806 			/* flag issue and force resync? */
807 		}
808 	}
809 skip4:
810 
811 	if (ttlslaves == 0)
812 		nflags |= HAMMER2_CLUSTER_NOSOFT;
813 	if (ttlmasters == 0)
814 		nflags |= HAMMER2_CLUSTER_NOHARD;
815 
816 	/*
817 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
818 	 * all available nodes (even if 0 are available) are fully
819 	 * synchronized.  This is used by the synchronization thread to
820 	 * determine if there is work it could potentially accomplish.
821 	 */
822 	if (nslaves == ttlslaves)
823 		nflags |= HAMMER2_CLUSTER_SSYNCED;
824 	if (nmasters == ttlmasters)
825 		nflags |= HAMMER2_CLUSTER_MSYNCED;
826 
827 	/*
828 	 * Determine if the cluster was successfully locked for the
829 	 * requested operation and generate an error code.  The cluster
830 	 * will not be locked (or ref'd) if an error is returned.
831 	 *
832 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
833 	 * to determine if reading or writing is possible.  If writing, the
834 	 * cluster still requires a call to hammer2_cluster_modify() first.
835 	 */
836 	atomic_set_int(&cluster->flags, nflags);
837 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
838 }
839 
840 /*
841  * This is used by the sync thread to force non-NULL elements of a copy
842  * of the pmp->iroot cluster to be good which is required to prime the
843  * sync.
844  */
845 void
846 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
847 {
848 	int i;
849 
850 	for (i = 0; i < cluster->nchains; ++i) {
851 		if (cluster->array[i].chain)
852 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
853 	}
854 }
855 
856 /*
857  * Copy a cluster, returned a ref'd cluster.  All underlying chains
858  * are also ref'd, but not locked.  Focus state is also copied.
859  *
860  * Original cluster does not have to be locked but usually is.
861  * New cluster will not be flagged as locked.
862  *
863  * Callers using this function to initialize a new cluster from an inode
864  * generally lock and resolve the resulting cluster.
865  *
866  * Callers which use this function to save/restore a cluster structure
867  * generally retain the focus state and do not re-resolve it.  Caller should
868  * not try to re-resolve internal (cparent) node state during an iteration
869  * as the individual tracking elements of cparent in an iteration may not
870  * match even though they are correct.
871  */
872 hammer2_cluster_t *
873 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
874 {
875 	hammer2_pfs_t *pmp = ocluster->pmp;
876 	hammer2_cluster_t *ncluster;
877 	hammer2_chain_t *chain;
878 	int i;
879 
880 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
881 	ncluster->pmp = pmp;
882 	ncluster->nchains = ocluster->nchains;
883 	ncluster->refs = 1;
884 
885 	for (i = 0; i < ocluster->nchains; ++i) {
886 		chain = ocluster->array[i].chain;
887 		ncluster->array[i].chain = chain;
888 		ncluster->array[i].flags = ocluster->array[i].flags;
889 		if (chain)
890 			hammer2_chain_ref(chain);
891 	}
892 	ncluster->focus_index = ocluster->focus_index;
893 	ncluster->focus = ocluster->focus;
894 	ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
895 					      HAMMER2_CLUSTER_INODE);
896 
897 	return (ncluster);
898 }
899 
900 /*
901  * Unlock a cluster.  Refcount and focus is maintained.
902  */
903 void
904 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
905 {
906 	hammer2_chain_t *chain;
907 	int i;
908 
909 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
910 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
911 			cluster);
912 	}
913 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
914 	KKASSERT(cluster->refs > 0);
915 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
916 
917 	for (i = 0; i < cluster->nchains; ++i) {
918 		if (i == idx)
919 			continue;
920 		chain = cluster->array[i].chain;
921 		if (chain)
922 			hammer2_chain_unlock(chain);
923 	}
924 }
925 
926 void
927 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
928 {
929 	hammer2_cluster_unlock_except(cluster, -1);
930 }
931 
932 /*
933  * Resize the cluster's physical storage allocation in-place.  This may
934  * replace the cluster's chains.
935  */
936 void
937 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
938 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
939 		       int nradix, int flags)
940 {
941 	hammer2_chain_t *chain;
942 	int i;
943 
944 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
945 	KKASSERT(cparent->nchains == cluster->nchains);
946 
947 	for (i = 0; i < cluster->nchains; ++i) {
948 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
949 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
950 			continue;
951 		}
952 		chain = cluster->array[i].chain;
953 		if (chain) {
954 			KKASSERT(cparent->array[i].chain);
955 			hammer2_chain_resize(trans, ip,
956 					     cparent->array[i].chain, chain,
957 					     nradix, flags);
958 		}
959 	}
960 }
961 
962 /*
963  * Set an inode's cluster modified, marking the related chains RW and
964  * duplicating them if necessary.
965  *
966  * The passed-in chain is a localized copy of the chain previously acquired
967  * when the inode was locked (and possilby replaced in the mean time), and
968  * must also be updated.  In fact, we update it first and then synchronize
969  * the inode's cluster cache.
970  */
971 hammer2_inode_data_t *
972 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
973 			  hammer2_cluster_t *cluster, int flags)
974 {
975 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
976 	hammer2_cluster_modify(trans, cluster, flags);
977 
978 	hammer2_inode_repoint(ip, NULL, cluster);
979 	if (ip->vp)
980 		vsetisdirty(ip->vp);
981 	return (&hammer2_cluster_wdata(cluster)->ipdata);
982 }
983 
984 /*
985  * Adjust the cluster's chains to allow modification and adjust the
986  * focus.  Data will be accessible on return.
987  *
988  * If our focused master errors on modify, re-resolve the cluster to
989  * try to select a different master.
990  */
991 void
992 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
993 		       int flags)
994 {
995 	hammer2_chain_t *chain;
996 	int resolve_again;
997 	int i;
998 
999 	resolve_again = 0;
1000 	for (i = 0; i < cluster->nchains; ++i) {
1001 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1002 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1003 			continue;
1004 		}
1005 		chain = cluster->array[i].chain;
1006 		if (chain == NULL)
1007 			continue;
1008 		if (chain->error)
1009 			continue;
1010 		hammer2_chain_modify(trans, chain, flags);
1011 		if (cluster->focus == chain && chain->error) {
1012 			cluster->error = chain->error;
1013 			resolve_again = 1;
1014 		}
1015 	}
1016 	if (resolve_again)
1017 		hammer2_cluster_resolve(cluster);
1018 }
1019 
1020 /*
1021  * Synchronize modifications from the focus to other chains in a cluster.
1022  * Convenient because nominal API users can just modify the contents of the
1023  * focus (at least for non-blockref data).
1024  *
1025  * Nominal front-end operations only edit non-block-table data in a single
1026  * chain.  This code copies such modifications to the other chains in the
1027  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
1028  * by both the frontend and the backend and will explode in fireworks if
1029  * blindly copied.
1030  */
1031 void
1032 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1033 {
1034 	hammer2_chain_t *focus;
1035 	hammer2_chain_t *scan;
1036 	const hammer2_inode_data_t *ripdata;
1037 	hammer2_inode_data_t *wipdata;
1038 	int i;
1039 
1040 	focus = cluster->focus;
1041 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1042 
1043 	for (i = 0; i < cluster->nchains; ++i) {
1044 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1045 			continue;
1046 		scan = cluster->array[i].chain;
1047 		if (scan == NULL || scan == focus)
1048 			continue;
1049 		if (scan->error)
1050 			continue;
1051 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1052 		KKASSERT(focus->bytes == scan->bytes &&
1053 			 focus->bref.type == scan->bref.type);
1054 		switch(focus->bref.type) {
1055 		case HAMMER2_BREF_TYPE_INODE:
1056 			ripdata = &focus->data->ipdata;
1057 			wipdata = &scan->data->ipdata;
1058 			if ((ripdata->op_flags &
1059 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1060 				bcopy(ripdata, wipdata,
1061 				      offsetof(hammer2_inode_data_t, u));
1062 				break;
1063 			}
1064 			/* fall through to full copy */
1065 		case HAMMER2_BREF_TYPE_DATA:
1066 			bcopy(focus->data, scan->data, focus->bytes);
1067 			break;
1068 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1069 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1070 		case HAMMER2_BREF_TYPE_FREEMAP:
1071 		case HAMMER2_BREF_TYPE_VOLUME:
1072 			panic("hammer2_cluster_modsync: illegal node type");
1073 			/* NOT REACHED */
1074 			break;
1075 		default:
1076 			panic("hammer2_cluster_modsync: unknown node type");
1077 			break;
1078 		}
1079 	}
1080 }
1081 
1082 /*
1083  * Lookup initialization/completion API.  Returns a locked, fully resolved
1084  * cluster with one ref.
1085  */
1086 hammer2_cluster_t *
1087 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1088 {
1089 	hammer2_cluster_t *cluster;
1090 
1091 	cluster = hammer2_cluster_copy(cparent);
1092 	if (flags & HAMMER2_LOOKUP_SHARED) {
1093 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1094 					      HAMMER2_RESOLVE_SHARED);
1095 	} else {
1096 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1097 	}
1098 	hammer2_cluster_resolve(cluster);
1099 
1100 	return (cluster);
1101 }
1102 
1103 void
1104 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1105 {
1106 	if (cparent) {
1107 		hammer2_cluster_unlock(cparent);
1108 		hammer2_cluster_drop(cparent);
1109 	}
1110 }
1111 
1112 /*
1113  * Locate first match or overlap under parent, return a new, locked, resolved
1114  * cluster with one ref.
1115  *
1116  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1117  */
1118 hammer2_cluster_t *
1119 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1120 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1121 {
1122 	hammer2_pfs_t *pmp;
1123 	hammer2_cluster_t *cluster;
1124 	hammer2_chain_t *chain;
1125 	hammer2_key_t key_accum;
1126 	hammer2_key_t key_next;
1127 	int null_count;
1128 	int rflags;
1129 	int i;
1130 
1131 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1132 
1133 	pmp = cparent->pmp;				/* can be NULL */
1134 	key_accum = *key_nextp;
1135 	null_count = 0;
1136 	if (flags & HAMMER2_LOOKUP_SHARED)
1137 		rflags = HAMMER2_RESOLVE_SHARED;
1138 	else
1139 		rflags = 0;
1140 
1141 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1142 	cluster->pmp = pmp;				/* can be NULL */
1143 	cluster->refs = 1;
1144 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1145 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1146 
1147 	/*
1148 	 * Iterating earlier cluster elements with later elements still
1149 	 * locked is a problem, so we have to unlock the parent and then
1150 	 * re-lock as we go.
1151 	 */
1152 	hammer2_cluster_unlock(cparent);
1153 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1154 
1155 	/*
1156 	 * Pass-1, issue lookups.
1157 	 */
1158 	for (i = 0; i < cparent->nchains; ++i) {
1159 		cluster->array[i].flags = cparent->array[i].flags;
1160 		key_next = *key_nextp;
1161 
1162 		/*
1163 		 * Always relock the parent as we go.
1164 		 */
1165 		if (cparent->array[i].chain) {
1166 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1167 		}
1168 
1169 		/*
1170 		 * Nothing to base the lookup, or parent was not synchronized.
1171 		 */
1172 		if (cparent->array[i].chain == NULL ||
1173 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1174 			++null_count;
1175 			continue;
1176 		}
1177 
1178 		chain = hammer2_chain_lookup(&cparent->array[i].chain,
1179 					     &key_next,
1180 					     key_beg, key_end,
1181 					     &cparent->array[i].cache_index,
1182 					     flags);
1183 		cluster->array[i].chain = chain;
1184 		if (chain == NULL) {
1185 			++null_count;
1186 		}
1187 		if (key_accum > key_next)
1188 			key_accum = key_next;
1189 	}
1190 
1191 	/*
1192 	 * Cleanup
1193 	 */
1194 	cluster->nchains = i;
1195 	*key_nextp = key_accum;
1196 
1197 	/*
1198 	 * The cluster must be resolved, out of sync elements may be present.
1199 	 *
1200 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1201 	 */
1202 	if (null_count != i)
1203 		hammer2_cluster_resolve(cluster);
1204 	if (null_count == i ||
1205 	    (cluster->focus == NULL &&
1206 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1207 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1208 			hammer2_cluster_unlock(cluster);
1209 		hammer2_cluster_drop(cluster);
1210 		cluster = NULL;
1211 	}
1212 
1213 	return (cluster);
1214 }
1215 
1216 /*
1217  * Locate next match or overlap under parent, replace the passed-in cluster.
1218  * The returned cluster is a new, locked, resolved cluster with one ref.
1219  *
1220  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1221  */
1222 hammer2_cluster_t *
1223 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1224 		     hammer2_key_t *key_nextp,
1225 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1226 {
1227 	hammer2_chain_t *ochain;
1228 	hammer2_chain_t *nchain;
1229 	hammer2_key_t key_accum;
1230 	hammer2_key_t key_next;
1231 	int parent_index;
1232 	int cluster_index;
1233 	int null_count;
1234 	int rflags;
1235 	int i;
1236 
1237 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1238 
1239 	key_accum = *key_nextp;
1240 	null_count = 0;
1241 	parent_index = cparent->focus_index;	/* save prior focus */
1242 	cluster_index = cluster->focus_index;
1243 	if (flags & HAMMER2_LOOKUP_SHARED)
1244 		rflags = HAMMER2_RESOLVE_SHARED;
1245 	else
1246 		rflags = 0;
1247 
1248 	cluster->focus = NULL;		/* XXX needed any more? */
1249 	/*cparent->focus = NULL;*/
1250 	cluster->focus_index = 0;	/* XXX needed any more? */
1251 	/*cparent->focus_index = 0;*/
1252 
1253 	cluster->ddflag = 0;
1254 
1255 	/*
1256 	 * The parent is always locked on entry, the iterator may be locked
1257 	 * depending on flags.
1258 	 *
1259 	 * We must temporarily unlock the passed-in clusters to avoid a
1260 	 * deadlock between elements of the cluster with other threads.
1261 	 * We will fixup the lock in the loop.
1262 	 *
1263 	 * Note that this will clear the focus.
1264 	 *
1265 	 * Reflag the clusters as locked, because we will relock them
1266 	 * as we go.
1267 	 */
1268 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1269 		hammer2_cluster_unlock(cluster);
1270 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1271 	}
1272 	hammer2_cluster_unlock(cparent);
1273 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1274 
1275 	for (i = 0; i < cparent->nchains; ++i) {
1276 		key_next = *key_nextp;
1277 		ochain = cluster->array[i].chain;
1278 
1279 		/*
1280 		 * Always relock the parent as we go.
1281 		 */
1282 		if (cparent->array[i].chain)
1283 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1284 
1285 		/*
1286 		 * Nothing to iterate from.  These cases can occur under
1287 		 * normal operations.  For example, during synchronization
1288 		 * a slave might reach the end of its scan while records
1289 		 * are still left on the master(s).
1290 		 */
1291 		if (ochain == NULL) {
1292 			++null_count;
1293 			continue;
1294 		}
1295 		if (cparent->array[i].chain == NULL ||
1296 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1297 		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1298 			/* ochain has not yet been relocked */
1299 			hammer2_chain_drop(ochain);
1300 			cluster->array[i].chain = NULL;
1301 			++null_count;
1302 			continue;
1303 		}
1304 
1305 		/*
1306 		 * Relock the child if necessary.  Parent and child will then
1307 		 * be locked as expected by hammer2_chain_next() and flags.
1308 		 */
1309 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1310 			hammer2_chain_lock(ochain, rflags);
1311 		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1312 					    &key_next, key_beg, key_end,
1313 					    &cparent->array[i].cache_index,
1314 					    flags);
1315 		/* ochain now invalid but can still be used for focus check */
1316 		if (parent_index == i) {
1317 			cparent->focus_index = i;
1318 			cparent->focus = cparent->array[i].chain;
1319 		}
1320 
1321 		cluster->array[i].chain = nchain;
1322 		if (nchain == NULL) {
1323 			++null_count;
1324 		}
1325 		if (key_accum > key_next)
1326 			key_accum = key_next;
1327 	}
1328 
1329 	/*
1330 	 * Cleanup
1331 	 */
1332 	cluster->nchains = i;
1333 	*key_nextp = key_accum;
1334 
1335 	/*
1336 	 * The cluster must be resolved, out of sync elements may be present.
1337 	 *
1338 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1339 	 */
1340 	if (null_count != i)
1341 		hammer2_cluster_resolve(cluster);
1342 	if (null_count == i ||
1343 	    (cluster->focus == NULL &&
1344 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1345 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1346 			hammer2_cluster_unlock(cluster);
1347 		hammer2_cluster_drop(cluster);
1348 		cluster = NULL;
1349 	}
1350 	return(cluster);
1351 }
1352 
1353 /*
1354  * Advance just one chain in the cluster and recalculate the invalid bit.
1355  * The cluster index is allowed to be flagged invalid on input and is
1356  * recalculated on return.
1357  *
1358  * (used during synchronization to advance past a chain being deleted).
1359  *
1360  * The chain being advanced must not be the focus and the clusters in
1361  * question must have already passed normal cluster_lookup/cluster_next
1362  * checks.
1363  *
1364  * The cluster always remains intact on return, so void function.
1365  */
1366 void
1367 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1368 				  hammer2_cluster_t *cluster,
1369 				  hammer2_key_t *key_nextp,
1370 				  hammer2_key_t key_beg,
1371 				  hammer2_key_t key_end,
1372 				  int i, int flags)
1373 {
1374 	hammer2_chain_t *ochain;
1375 	hammer2_chain_t *nchain;
1376 	hammer2_chain_t *focus;
1377 	hammer2_key_t key_accum;
1378 	hammer2_key_t key_next;
1379 	int ddflag;
1380 
1381 	key_accum = *key_nextp;
1382 	key_next = *key_nextp;
1383 	ochain = cluster->array[i].chain;
1384 	if (ochain == NULL)
1385 		goto done;
1386 	KKASSERT(ochain != cluster->focus);
1387 
1388 	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1389 				    &key_next, key_beg, key_end,
1390 				    &cparent->array[i].cache_index,
1391 				    flags);
1392 	/* ochain now invalid */
1393 	if (cparent->focus_index == i)
1394 		cparent->focus = cparent->array[i].chain;
1395 
1396 	/*
1397 	 * Install nchain.  Note that nchain can be NULL, and can also
1398 	 * be in an unlocked state depending on flags.
1399 	 */
1400 	cluster->array[i].chain = nchain;
1401 	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1402 
1403 	if (key_accum > key_next)
1404 		key_accum = key_next;
1405 
1406 	focus = cluster->focus;
1407 	if (focus == NULL)
1408 		goto done;
1409 	if (nchain == NULL)
1410 		goto done;
1411 #if 0
1412 	if (nchain == focus)	/* ASSERTED NOT TRUE */
1413 		...
1414 #endif
1415 	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1416 	if (nchain->bref.type != focus->bref.type ||
1417 	    nchain->bref.key != focus->bref.key ||
1418 	    nchain->bref.keybits != focus->bref.keybits ||
1419 	    nchain->bref.modify_tid != focus->bref.modify_tid ||
1420 	    nchain->bytes != focus->bytes ||
1421 	    ddflag != cluster->ddflag) {
1422 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1423 	}
1424 
1425 done:
1426 	*key_nextp = key_accum;
1427 #if 0
1428 	/*
1429 	 * For now don't re-resolve cluster->flags.
1430 	 */
1431 	hammer2_cluster_resolve(cluster);
1432 #endif
1433 }
1434 
1435 /*
1436  * Create a new cluster using the specified key
1437  */
1438 int
1439 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1440 		     hammer2_cluster_t **clusterp,
1441 		     hammer2_key_t key, int keybits,
1442 		     int type, size_t bytes, int flags)
1443 {
1444 	hammer2_cluster_t *cluster;
1445 	hammer2_pfs_t *pmp;
1446 	int error;
1447 	int i;
1448 
1449 	pmp = trans->pmp;				/* can be NULL */
1450 
1451 	if ((cluster = *clusterp) == NULL) {
1452 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1453 				  M_WAITOK | M_ZERO);
1454 		cluster->pmp = pmp;			/* can be NULL */
1455 		cluster->refs = 1;
1456 		cluster->flags = HAMMER2_CLUSTER_LOCKED;
1457 	}
1458 	cluster->focus_index = 0;
1459 	cluster->focus = NULL;
1460 
1461 	/*
1462 	 * NOTE: cluster->array[] entries can initially be NULL.  If
1463 	 *	 *clusterp is supplied, skip NULL entries, otherwise
1464 	 *	 create new chains.
1465 	 */
1466 	for (i = 0; i < cparent->nchains; ++i) {
1467 		if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1468 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1469 			continue;
1470 		}
1471 		if (*clusterp) {
1472 			if ((cluster->array[i].flags &
1473 			     HAMMER2_CITEM_FEMOD) == 0) {
1474 				cluster->array[i].flags |=
1475 						HAMMER2_CITEM_INVALID;
1476 				continue;
1477 			}
1478 			if (cluster->array[i].chain == NULL)
1479 				continue;
1480 		}
1481 		error = hammer2_chain_create(trans, &cparent->array[i].chain,
1482 					     &cluster->array[i].chain, pmp,
1483 					     key, keybits,
1484 					     type, bytes, flags);
1485 		if (cparent->focus_index == i)
1486 			cparent->focus = cparent->array[i].chain;
1487 		KKASSERT(error == 0);
1488 		if (cluster->focus == NULL) {
1489 			cluster->focus_index = i;
1490 			cluster->focus = cluster->array[i].chain;
1491 		}
1492 		if (cparent->focus == cparent->array[i].chain) {
1493 			cluster->focus_index = i;
1494 			cluster->focus = cluster->array[i].chain;
1495 		}
1496 	}
1497 	cluster->nchains = i;
1498 	*clusterp = cluster;
1499 	hammer2_cluster_resolve(cluster);
1500 
1501 	return error;
1502 }
1503 
1504 /*
1505  * Rename a cluster to a new parent.
1506  *
1507  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1508  *	    So the data_off field is not relevant.  Only the key and
1509  *	    keybits are used.
1510  */
1511 void
1512 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1513 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1514 		       int flags)
1515 {
1516 	hammer2_chain_t *chain;
1517 	hammer2_blockref_t xbref;
1518 	int i;
1519 
1520 #if 0
1521 	cluster->focus = NULL;
1522 	cparent->focus = NULL;
1523 	cluster->focus_index = 0;
1524 	cparent->focus_index = 0;
1525 #endif
1526 
1527 	for (i = 0; i < cluster->nchains; ++i) {
1528 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1529 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1530 			continue;
1531 		}
1532 		chain = cluster->array[i].chain;
1533 		if (chain) {
1534 			if (bref) {
1535 				xbref = chain->bref;
1536 				xbref.key = bref->key;
1537 				xbref.keybits = bref->keybits;
1538 				hammer2_chain_rename(trans, &xbref,
1539 						     &cparent->array[i].chain,
1540 						     chain, flags);
1541 			} else {
1542 				hammer2_chain_rename(trans, NULL,
1543 						     &cparent->array[i].chain,
1544 						     chain, flags);
1545 			}
1546 			if (cparent->focus_index == i)
1547 				cparent->focus = cparent->array[i].chain;
1548 			KKASSERT(cluster->array[i].chain == chain); /*remove*/
1549 		}
1550 	}
1551 }
1552 
1553 /*
1554  * Mark a cluster deleted
1555  */
1556 void
1557 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1558 		       hammer2_cluster_t *cluster, int flags)
1559 {
1560 	hammer2_chain_t *chain;
1561 	hammer2_chain_t *parent;
1562 	int i;
1563 
1564 	if (cparent == NULL) {
1565 		kprintf("cparent is NULL\n");
1566 		return;
1567 	}
1568 
1569 	for (i = 0; i < cluster->nchains; ++i) {
1570 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1571 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1572 			continue;
1573 		}
1574 		parent = cparent->array[i].chain;
1575 		chain = cluster->array[i].chain;
1576 		if (chain == NULL)
1577 			continue;
1578 		if (chain->parent != parent) {
1579 			kprintf("hammer2_cluster_delete: parent "
1580 				"mismatch chain=%p parent=%p against=%p\n",
1581 				chain, chain->parent, parent);
1582 		} else {
1583 			hammer2_chain_delete(trans, parent, chain, flags);
1584 		}
1585 	}
1586 }
1587 
1588 /*
1589  * Create a snapshot of the specified {parent, ochain} with the specified
1590  * label.  The originating hammer2_inode must be exclusively locked for
1591  * safety.
1592  *
1593  * The ioctl code has already synced the filesystem.
1594  */
1595 int
1596 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1597 		       hammer2_ioc_pfs_t *pfs)
1598 {
1599 	hammer2_dev_t *hmp;
1600 	hammer2_cluster_t *ncluster;
1601 	const hammer2_inode_data_t *ripdata;
1602 	hammer2_inode_data_t *wipdata;
1603 	hammer2_chain_t *nchain;
1604 	hammer2_inode_t *nip;
1605 	size_t name_len;
1606 	hammer2_key_t lhc;
1607 	struct vattr vat;
1608 #if 0
1609 	uuid_t opfs_clid;
1610 #endif
1611 	int error;
1612 	int i;
1613 
1614 	kprintf("snapshot %s\n", pfs->name);
1615 
1616 	name_len = strlen(pfs->name);
1617 	lhc = hammer2_dirhash(pfs->name, name_len);
1618 
1619 	/*
1620 	 * Get the clid
1621 	 */
1622 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1623 #if 0
1624 	opfs_clid = ripdata->pfs_clid;
1625 #endif
1626 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1627 
1628 	/*
1629 	 * Create the snapshot directory under the super-root
1630 	 *
1631 	 * Set PFS type, generate a unique filesystem id, and generate
1632 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1633 	 * which theoretically allows the snapshot to be used as part of
1634 	 * the same cluster (perhaps as a cache).
1635 	 *
1636 	 * Copy the (flushed) blockref array.  Theoretically we could use
1637 	 * chain_duplicate() but it becomes difficult to disentangle
1638 	 * the shared core so for now just brute-force it.
1639 	 */
1640 	VATTR_NULL(&vat);
1641 	vat.va_type = VDIR;
1642 	vat.va_mode = 0755;
1643 	ncluster = NULL;
1644 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1645 				   proc0.p_ucred, pfs->name, name_len,
1646 				   &ncluster,
1647 				   HAMMER2_INSERT_PFSROOT, &error);
1648 
1649 	if (nip) {
1650 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1651 		wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1652 		wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1653 		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1654 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1655 
1656 		/*
1657 		 * Give the snapshot its own private cluster.  As a snapshot
1658 		 * no further synchronization with the original cluster will
1659 		 * be done.
1660 		 */
1661 #if 0
1662 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1663 			wipdata->pfs_clid = opfs_clid;
1664 		else
1665 			kern_uuidgen(&wipdata->pfs_clid, 1);
1666 #endif
1667 		kern_uuidgen(&wipdata->pfs_clid, 1);
1668 
1669 		for (i = 0; i < ncluster->nchains; ++i) {
1670 			if ((ncluster->array[i].flags &
1671 			     HAMMER2_CITEM_FEMOD) == 0) {
1672 				ncluster->array[i].flags |=
1673 					HAMMER2_CITEM_INVALID;
1674 				continue;
1675 			}
1676 			nchain = ncluster->array[i].chain;
1677 			if (nchain)
1678 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1679 		}
1680 #if 0
1681 		/* XXX can't set this unless we do an explicit flush, which
1682 		   we also need a pmp assigned to do, else the flush code
1683 		   won't flush ncluster because it thinks it is crossing a
1684 		   flush boundary */
1685 		hammer2_cluster_set_chainflags(ncluster,
1686 					       HAMMER2_CHAIN_PFSBOUNDARY);
1687 #endif
1688 
1689 		/* XXX hack blockset copy */
1690 		/* XXX doesn't work with real cluster */
1691 		KKASSERT(ocluster->nchains == 1);
1692 		wipdata->u.blockset = ripdata->u.blockset;
1693 		hammer2_cluster_modsync(ncluster);
1694 		for (i = 0; i < ncluster->nchains; ++i) {
1695 			nchain = ncluster->array[i].chain;
1696 			if (nchain)
1697 				hammer2_flush(trans, nchain, 1);
1698 		}
1699 		hammer2_inode_unlock(nip, ncluster);
1700 	}
1701 	return (error);
1702 }
1703 
1704 /*
1705  * Return locked parent cluster given a locked child.  The child remains
1706  * locked on return.  The new parent's focus follows the child's focus
1707  * and the parent is always resolved.
1708  *
1709  * We must temporarily unlock the passed-in cluster to avoid a deadlock
1710  * between elements of the cluster.
1711  *
1712  * We must not try to hammer2_cluster_resolve() cparent.  The individual
1713  * parent chains for the nodes are the correct parents for the cluster but
1714  * do not necessarily match, so resolve would likely implode.
1715  */
1716 hammer2_cluster_t *
1717 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1718 {
1719 	hammer2_cluster_t *cparent;
1720 	int i;
1721 
1722 	cparent = hammer2_cluster_copy(cluster);
1723 	hammer2_cluster_unlock(cluster);
1724 
1725 	for (i = 0; i < cparent->nchains; ++i) {
1726 		hammer2_chain_t *chain;
1727 		hammer2_chain_t *rchain;
1728 
1729 		/*
1730 		 * Calculate parent for each element.  Old chain has an extra
1731 		 * ref for cparent but the lock remains with cluster.
1732 		 */
1733 		chain = cparent->array[i].chain;
1734 		if (chain == NULL)
1735 			continue;
1736 		while ((rchain = chain->parent) != NULL) {
1737 			hammer2_chain_ref(rchain);
1738 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1739 			if (chain->parent == rchain)
1740 				break;
1741 			hammer2_chain_unlock(rchain);
1742 			hammer2_chain_drop(rchain);
1743 		}
1744 		cparent->array[i].chain = rchain;
1745 		hammer2_chain_drop(chain);
1746 	}
1747 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1748 	/* hammer2_cluster_resolve(cparent); */
1749 	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1750 
1751 	return cparent;
1752 }
1753 
1754 /************************************************************************
1755  *			        CLUSTER I/O 				*
1756  ************************************************************************
1757  *
1758  *
1759  * WARNING! blockref[] array data is not universal.  These functions should
1760  *	    only be used to access universal data.
1761  *
1762  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1763  *	    complete if necessary.  The I/O's should have already been
1764  *	    initiated by the cluster_lock/chain_lock operation.
1765  *
1766  *	    The cluster must already be in a modified state before wdata
1767  *	    is called.  The data will already be available for this case.
1768  */
1769 const hammer2_media_data_t *
1770 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1771 {
1772 	return(cluster->focus->data);
1773 }
1774 
1775 hammer2_media_data_t *
1776 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1777 {
1778 	KKASSERT(hammer2_cluster_modified(cluster));
1779 	return(cluster->focus->data);
1780 }
1781 
1782 /*
1783  * Load cluster data asynchronously with callback.
1784  *
1785  * The callback is made for the first validated data found, or NULL
1786  * if no valid data is available.
1787  *
1788  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1789  *	 in the inode with an exclusive lock held), the chain structure may be
1790  *	 shared.
1791  */
1792 void
1793 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1794 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1795 {
1796 	hammer2_chain_t *chain;
1797 	hammer2_iocb_t *iocb;
1798 	hammer2_dev_t *hmp;
1799 	hammer2_blockref_t *bref;
1800 	int i;
1801 
1802 	i = cluster->focus_index;
1803 	chain = cluster->focus;
1804 
1805 	iocb = &cluster->iocb;
1806 	iocb->callback = callback;
1807 	iocb->dio = NULL;		/* for already-validated case */
1808 	iocb->cluster = cluster;
1809 	iocb->chain = chain;
1810 	iocb->ptr = ptr;
1811 	iocb->lbase = (off_t)i;
1812 	iocb->flags = 0;
1813 	iocb->error = 0;
1814 
1815 	/*
1816 	 * Data already validated
1817 	 */
1818 	if (chain->data) {
1819 		callback(iocb);
1820 		return;
1821 	}
1822 
1823 	/*
1824 	 * We must resolve to a device buffer, either by issuing I/O or
1825 	 * by creating a zero-fill element.  We do not mark the buffer
1826 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1827 	 * API must still be used to do that).
1828 	 *
1829 	 * The device buffer is variable-sized in powers of 2 down
1830 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1831 	 * chunk always contains buffers of the same size. (XXX)
1832 	 *
1833 	 * The minimum physical IO size may be larger than the variable
1834 	 * block size.
1835 	 *
1836 	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1837 	 *	      matches hammer2_devblksize()?  Or does the freemap's
1838 	 *	      pre-zeroing handle the case for us?
1839 	 */
1840 	bref = &chain->bref;
1841 	hmp = chain->hmp;
1842 
1843 #if 0
1844 	/* handled by callback? <- TODO XXX even needed for loads? */
1845 	/*
1846 	 * The getblk() optimization for a 100% overwrite can only be used
1847 	 * if the physical block size matches the request.
1848 	 */
1849 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1850 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1851 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1852 		KKASSERT(error == 0);
1853 		iocb->dio = dio;
1854 		callback(iocb);
1855 		return;
1856 	}
1857 #endif
1858 
1859 	/*
1860 	 * Otherwise issue a read
1861 	 */
1862 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1863 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1864 }
1865