xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision 25ca8c794df06033ca37cb3fdf9b2992a7b5a9e6)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125 	hammer2_chain_t *chain;
126 	int i;
127 
128 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 	for (i = 0; i < cluster->nchains; ++i) {
130 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 			continue;
132 		chain = cluster->array[i].chain;
133 		if (chain == NULL)
134 			continue;
135 		if (chain->error)
136 			continue;
137 		if (chain->bytes != bytes)
138 			return 1;
139 	}
140 	return 0;
141 }
142 
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 	if (cluster->error == 0)
154 		return(cluster->focus->bref.type);
155 	return 0;
156 }
157 
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 	if (cluster->error == 0)
168 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169 	return 0;
170 }
171 
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 	if (cluster->error == 0) {
187 		*bref = cluster->focus->bref;
188 		bref->data_off = 0;
189 	} else {
190 		bzero(bref, sizeof(*bref));
191 	}
192 }
193 
194 /*
195  * Flag the cluster for flushing recursively up to the root.  Despite the
196  * work it does, this is relatively benign.  It just makes sure that the
197  * flusher has top-down visibility to this cluster.
198  *
199  * Errored chains are not flagged for flushing.
200  *
201  * The cluster should probably be locked.
202  */
203 void
204 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int i;
208 
209 	for (i = 0; i < cluster->nchains; ++i) {
210 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
211 			continue;
212 		chain = cluster->array[i].chain;
213 		if (chain == NULL)
214 			continue;
215 		if (chain->error)
216 			continue;
217 		hammer2_chain_setflush(trans, chain);
218 	}
219 }
220 
221 /*
222  * Set the check mode for the cluster.
223  * Errored elements of the cluster are ignored.
224  *
225  * The cluster must be locked and modified.
226  */
227 void
228 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
229 				hammer2_cluster_t *cluster,
230 				int check_algo)
231 {
232 	hammer2_chain_t *chain;
233 	int i;
234 
235 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
236 	for (i = 0; i < cluster->nchains; ++i) {
237 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
238 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
239 			continue;
240 		}
241 		chain = cluster->array[i].chain;
242 		if (chain == NULL)
243 			continue;
244 		if (chain->error)
245 			continue;
246 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
247 		chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
248 		chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
249 	}
250 }
251 
252 /*
253  * Create a degenerate cluster with one ref from a single locked chain.
254  * The returned cluster will be focused on the chain and inherit its
255  * error state.
256  *
257  * The chain's lock and reference are transfered to the new cluster, so
258  * the caller should not try to unlock the chain separately.
259  *
260  * We fake the flags.
261  */
262 hammer2_cluster_t *
263 hammer2_cluster_from_chain(hammer2_chain_t *chain)
264 {
265 	hammer2_cluster_t *cluster;
266 
267 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
268 	cluster->array[0].chain = chain;
269 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
270 	cluster->nchains = 1;
271 	cluster->focus = chain;
272 	cluster->focus_index = 0;
273 	cluster->pmp = chain->pmp;
274 	cluster->refs = 1;
275 	cluster->error = chain->error;
276 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
277 			 HAMMER2_CLUSTER_WRHARD |
278 			 HAMMER2_CLUSTER_RDHARD |
279 			 HAMMER2_CLUSTER_MSYNCED |
280 			 HAMMER2_CLUSTER_SSYNCED;
281 
282 	return cluster;
283 }
284 
285 /*
286  * Add a reference to a cluster and its underlying chains.
287  *
288  * We must also ref the underlying chains in order to allow ref/unlock
289  * sequences to later re-lock.
290  */
291 void
292 hammer2_cluster_ref(hammer2_cluster_t *cluster)
293 {
294 	atomic_add_int(&cluster->refs, 1);
295 }
296 
297 /*
298  * Drop the caller's reference to the cluster.  When the ref count drops to
299  * zero this function frees the cluster and drops all underlying chains.
300  *
301  * In-progress read I/Os are typically detached from the cluster once the
302  * first one returns (the remaining stay attached to the DIOs but are then
303  * ignored and drop naturally).
304  */
305 void
306 hammer2_cluster_drop(hammer2_cluster_t *cluster)
307 {
308 	hammer2_chain_t *chain;
309 	int i;
310 
311 	KKASSERT(cluster->refs > 0);
312 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
313 		cluster->focus = NULL;		/* safety XXX chg to assert */
314 		cluster->focus_index = 0;
315 
316 		for (i = 0; i < cluster->nchains; ++i) {
317 			chain = cluster->array[i].chain;
318 			if (chain) {
319 				hammer2_chain_drop(chain);
320 				cluster->array[i].chain = NULL; /* safety */
321 			}
322 		}
323 		cluster->nchains = 0;				/* safety */
324 
325 		kfree(cluster, M_HAMMER2);
326 		/* cluster is invalid */
327 	}
328 }
329 
330 void
331 hammer2_cluster_wait(hammer2_cluster_t *cluster)
332 {
333 	tsleep(cluster->focus, 0, "h2clcw", 1);
334 }
335 
336 /*
337  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
338  *
339  * WARNING! This function expects the caller to handle resolution of the
340  *	    cluster.  We never re-resolve the cluster in this function,
341  *	    because it might be used to temporarily unlock/relock a cparent
342  *	    in an iteration or recursrion, and the cparents elements do not
343  *	    necessarily match.
344  */
345 void
346 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
347 {
348 	hammer2_chain_t *chain;
349 	int i;
350 
351 	/* cannot be on inode-embedded cluster template, must be on copy */
352 	KKASSERT(cluster->refs > 0);
353 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
354 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
355 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
356 			cluster);
357 	}
358 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
359 
360 	/*
361 	 * Lock chains and resolve state.
362 	 */
363 	for (i = 0; i < cluster->nchains; ++i) {
364 		if (i == idx)
365 			continue;
366 		chain = cluster->array[i].chain;
367 		if (chain == NULL)
368 			continue;
369 		hammer2_chain_lock(chain, how);
370 	}
371 }
372 
373 void
374 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
375 {
376 	hammer2_cluster_lock_except(cluster, -1, how);
377 }
378 
379 /*
380  * Calculate the clustering state for the cluster and set its focus.
381  * This routine must be called with care.  For example, it should not
382  * normally be called after relocking a non-leaf cluster because parent
383  * clusters help iterations and each element might be at a slightly different
384  * indirect node (each node's topology is independently indexed).
385  *
386  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
387  * operations.  Typically this is only set on a quorum of MASTERs or
388  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
389  * is present, this bit is *not* set on a quorum of MASTERs.  The
390  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
391  * that create/modify/delete elements use it.
392  *
393  * The chains making up the cluster may be narrowed down based on quorum
394  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
395  * narrowed down to a single chain as long as the entire subtopology is known
396  * to be intact.  So, for example, we can narrow a read-only op to a single
397  * fast SLAVE but if we focus a CACHE chain we must still retain at least
398  * a SLAVE to ensure that the subtopology can be accessed.
399  *
400  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
401  * to be maintained once the topology is validated as-of the top level of
402  * the operation.
403  *
404  * If a failure occurs the operation must be aborted by higher-level code and
405  * retried. XXX
406  */
407 void
408 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
409 {
410 	hammer2_chain_t *chain;
411 	hammer2_chain_t *focus;
412 	hammer2_pfs_t *pmp;
413 	hammer2_tid_t quorum_tid;
414 	hammer2_tid_t last_best_quorum_tid;
415 	int focus_pfs_type;
416 	uint32_t nflags;
417 	int ttlmasters;
418 	int ttlslaves;
419 	int nmasters;
420 	int nslaves;
421 	int nquorum;
422 	int smpresent;
423 	int i;
424 
425 	cluster->error = 0;
426 	cluster->focus = NULL;
427 
428 	focus_pfs_type = 0;
429 	nflags = 0;
430 	ttlmasters = 0;
431 	ttlslaves = 0;
432 	nmasters = 0;
433 	nslaves = 0;
434 
435 	/*
436 	 * Calculate quorum
437 	 */
438 	pmp = cluster->pmp;
439 	KKASSERT(pmp != NULL || cluster->nchains == 0);
440 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
441 	smpresent = 0;
442 
443 	/*
444 	 * Pass 1
445 	 *
446 	 * NOTE: A NULL chain is not necessarily an error, it could be
447 	 *	 e.g. a lookup failure or the end of an iteration.
448 	 *	 Process normally.
449 	 */
450 	for (i = 0; i < cluster->nchains; ++i) {
451 		chain = cluster->array[i].chain;
452 		if (chain && chain->error) {
453 			if (cluster->focus == NULL || cluster->focus == chain) {
454 				/* error will be overridden by valid focus */
455 				cluster->error = chain->error;
456 			}
457 
458 			/*
459 			 * Must count total masters and slaves whether the
460 			 * chain is errored or not.
461 			 */
462 			switch (cluster->pmp->pfs_types[i]) {
463 			case HAMMER2_PFSTYPE_MASTER:
464 				++ttlmasters;
465 				break;
466 			case HAMMER2_PFSTYPE_SLAVE:
467 				++ttlslaves;
468 				break;
469 			}
470 			continue;
471 		}
472 		switch (cluster->pmp->pfs_types[i]) {
473 		case HAMMER2_PFSTYPE_MASTER:
474 			++ttlmasters;
475 			break;
476 		case HAMMER2_PFSTYPE_SLAVE:
477 			++ttlslaves;
478 			break;
479 		case HAMMER2_PFSTYPE_SOFT_MASTER:
480 			nflags |= HAMMER2_CLUSTER_WRSOFT;
481 			nflags |= HAMMER2_CLUSTER_RDSOFT;
482 			smpresent = 1;
483 			break;
484 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
485 			nflags |= HAMMER2_CLUSTER_RDSOFT;
486 			break;
487 		case HAMMER2_PFSTYPE_SUPROOT:
488 			/*
489 			 * Degenerate cluster representing the super-root
490 			 * topology on a single device.  Fake stuff so
491 			 * cluster ops work as expected.
492 			 */
493 			nflags |= HAMMER2_CLUSTER_WRHARD;
494 			nflags |= HAMMER2_CLUSTER_RDHARD;
495 			cluster->focus_index = i;
496 			cluster->focus = chain;
497 			cluster->error = chain ? chain->error : 0;
498 			break;
499 		default:
500 			break;
501 		}
502 	}
503 
504 	/*
505 	 * Pass 2
506 	 *
507 	 * Resolve masters.  Calculate nmasters for the highest matching
508 	 * TID, if a quorum cannot be attained try the next lower matching
509 	 * TID until we exhaust TIDs.
510 	 *
511 	 * NOTE: A NULL chain is not necessarily an error, it could be
512 	 *	 e.g. a lookup failure or the end of an iteration.
513 	 *	 Process normally.
514 	 */
515 	last_best_quorum_tid = HAMMER2_TID_MAX;
516 	quorum_tid = 0;		/* fix gcc warning */
517 
518 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
519 		nmasters = 0;
520 		quorum_tid = 0;
521 
522 		for (i = 0; i < cluster->nchains; ++i) {
523 			if (cluster->pmp->pfs_types[i] !=
524 			    HAMMER2_PFSTYPE_MASTER) {
525 				continue;
526 			}
527 			chain = cluster->array[i].chain;
528 
529 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530 				/*
531 				 * Invalid as in unsynchronized, cannot be
532 				 * used to calculate the quorum.
533 				 */
534 			} else if (chain == NULL && quorum_tid == 0) {
535 				/*
536 				 * NULL chain on master matches NULL chains
537 				 * on other masters.
538 				 */
539 				++nmasters;
540 			} else if (quorum_tid < last_best_quorum_tid &&
541 				   chain != NULL &&
542 				   (quorum_tid < chain->bref.modify_tid ||
543 				    nmasters == 0)) {
544 				/*
545 				 * Better TID located, reset nmasters count.
546 				 */
547 				nmasters = 1;
548 				quorum_tid = chain->bref.modify_tid;
549 			} else if (chain &&
550 				   quorum_tid == chain->bref.modify_tid) {
551 				/*
552 				 * TID matches current collection.
553 				 */
554 				++nmasters;
555 			}
556 		}
557 		if (nmasters >= nquorum)
558 			break;
559 		last_best_quorum_tid = quorum_tid;
560 	}
561 
562 	/*
563 	 * Pass 3
564 	 *
565 	 * NOTE: A NULL chain is not necessarily an error, it could be
566 	 *	 e.g. a lookup failure or the end of an iteration.
567 	 *	 Process normally.
568 	 */
569 	for (i = 0; i < cluster->nchains; ++i) {
570 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
571 		chain = cluster->array[i].chain;
572 		if (chain && chain->error) {
573 			if (cluster->focus == NULL || cluster->focus == chain) {
574 				/* error will be overridden by valid focus */
575 				cluster->error = chain->error;
576 			}
577 			continue;
578 		}
579 
580 		switch (cluster->pmp->pfs_types[i]) {
581 		case HAMMER2_PFSTYPE_MASTER:
582 			/*
583 			 * We must have enough up-to-date masters to reach
584 			 * a quorum and the master modify_tid must match
585 			 * the quorum's modify_tid.
586 			 *
587 			 * Do not select an errored or out-of-sync master.
588 			 */
589 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
590 				nflags |= HAMMER2_CLUSTER_UNHARD;
591 			} else if (nmasters >= nquorum &&
592 				   (chain == NULL || chain->error == 0) &&
593 				   ((chain == NULL && quorum_tid == 0) ||
594 				    (chain != NULL && quorum_tid ==
595 						  chain->bref.modify_tid))) {
596 				nflags |= HAMMER2_CLUSTER_WRHARD;
597 				nflags |= HAMMER2_CLUSTER_RDHARD;
598 				if (!smpresent) {
599 					cluster->array[i].flags |=
600 							HAMMER2_CITEM_FEMOD;
601 				}
602 				if (cluster->focus == NULL ||
603 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
604 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
605 					cluster->focus_index = i;
606 					cluster->focus = chain; /* NULL ok */
607 					cluster->error = chain ? chain->error :
608 								 0;
609 				}
610 			} else if (chain == NULL || chain->error == 0) {
611 				nflags |= HAMMER2_CLUSTER_UNHARD;
612 			}
613 			break;
614 		case HAMMER2_PFSTYPE_SLAVE:
615 			/*
616 			 * We must have enough up-to-date masters to reach
617 			 * a quorum and the slave modify_tid must match the
618 			 * quorum's modify_tid.
619 			 *
620 			 * Do not select an errored slave.
621 			 */
622 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
623 				nflags |= HAMMER2_CLUSTER_UNHARD;
624 			} else if (nmasters >= nquorum &&
625 				   (chain == NULL || chain->error == 0) &&
626 				   ((chain == NULL && quorum_tid == 0) ||
627 				    (chain && quorum_tid ==
628 					      chain->bref.modify_tid))) {
629 				++nslaves;
630 				nflags |= HAMMER2_CLUSTER_RDHARD;
631 #if 0
632 				/* XXX optimize for RESOLVE_RDONLY */
633 				if (cluster->focus == NULL) {
634 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
635 					cluster->focus_index = i;
636 					cluster->focus = chain; /* NULL ok */
637 					cluster->error = chain ? chain->error :
638 								 0;
639 				}
640 #endif
641 			} else if (chain == NULL || chain->error == 0) {
642 				nflags |= HAMMER2_CLUSTER_UNSOFT;
643 			}
644 			break;
645 		case HAMMER2_PFSTYPE_SOFT_MASTER:
646 			/*
647 			 * Directly mounted soft master always wins.  There
648 			 * should be only one.
649 			 */
650 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
651 			cluster->focus_index = i;
652 			cluster->focus = chain;
653 			cluster->error = chain ? chain->error : 0;
654 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
655 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
656 			break;
657 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
658 			/*
659 			 * Directly mounted soft slave always wins.  There
660 			 * should be only one.
661 			 */
662 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
663 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
664 				cluster->focus_index = i;
665 				cluster->focus = chain;
666 				cluster->error = chain ? chain->error : 0;
667 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
668 			}
669 			break;
670 		case HAMMER2_PFSTYPE_SUPROOT:
671 			/*
672 			 * spmp (degenerate case)
673 			 */
674 			KKASSERT(i == 0);
675 			cluster->focus_index = i;
676 			cluster->focus = chain;
677 			cluster->error = chain ? chain->error : 0;
678 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
679 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
680 			break;
681 		default:
682 			break;
683 		}
684 	}
685 
686 	/*
687 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
688 	 * is bad or if we are at the PFS root (the bref won't match at
689 	 * the PFS root, obviously).
690 	 */
691 	focus = cluster->focus;
692 	if (focus) {
693 		cluster->ddflag =
694 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
695 	} else {
696 		cluster->ddflag = 0;
697 		goto skip4;
698 	}
699 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
700 		goto skip4;
701 
702 	/*
703 	 * Pass 4
704 	 *
705 	 * Validate the elements that were not marked invalid.  They should
706 	 * match.
707 	 */
708 	for (i = 0; i < cluster->nchains; ++i) {
709 		int ddflag;
710 
711 		chain = cluster->array[i].chain;
712 
713 		if (chain == NULL)
714 			continue;
715 		if (chain == focus)
716 			continue;
717 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
718 			continue;
719 
720 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
721 		if (chain->bref.type != focus->bref.type ||
722 		    chain->bref.key != focus->bref.key ||
723 		    chain->bref.keybits != focus->bref.keybits ||
724 		    chain->bref.modify_tid != focus->bref.modify_tid ||
725 		    chain->bytes != focus->bytes ||
726 		    ddflag != cluster->ddflag) {
727 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
728 			if (hammer2_debug & 1)
729 			kprintf("cluster_resolve: matching modify_tid failed "
730 				"bref test: idx=%d type=%02x/%02x "
731 				"key=%016jx/%d-%016jx/%d "
732 				"mod=%016jx/%016jx bytes=%u/%u\n",
733 				i,
734 				chain->bref.type, focus->bref.type,
735 				chain->bref.key, chain->bref.keybits,
736 				focus->bref.key, focus->bref.keybits,
737 				chain->bref.modify_tid, focus->bref.modify_tid,
738 				chain->bytes, focus->bytes);
739 			if (hammer2_debug & 0x4000)
740 				panic("cluster_resolve");
741 			/* flag issue and force resync? */
742 		}
743 	}
744 skip4:
745 
746 	if (ttlslaves == 0)
747 		nflags |= HAMMER2_CLUSTER_NOSOFT;
748 	if (ttlmasters == 0)
749 		nflags |= HAMMER2_CLUSTER_NOHARD;
750 
751 	/*
752 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
753 	 * all available nodes (even if 0 are available) are fully
754 	 * synchronized.  This is used by the synchronization thread to
755 	 * determine if there is work it could potentially accomplish.
756 	 */
757 	if (nslaves == ttlslaves)
758 		nflags |= HAMMER2_CLUSTER_SSYNCED;
759 	if (nmasters == ttlmasters)
760 		nflags |= HAMMER2_CLUSTER_MSYNCED;
761 
762 	/*
763 	 * Determine if the cluster was successfully locked for the
764 	 * requested operation and generate an error code.  The cluster
765 	 * will not be locked (or ref'd) if an error is returned.
766 	 *
767 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
768 	 * to determine if reading or writing is possible.  If writing, the
769 	 * cluster still requires a call to hammer2_cluster_modify() first.
770 	 */
771 	atomic_set_int(&cluster->flags, nflags);
772 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
773 }
774 
775 /*
776  * This is used by the sync thread to force non-NULL elements of a copy
777  * of the pmp->iroot cluster to be good which is required to prime the
778  * sync.
779  */
780 void
781 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
782 {
783 	int i;
784 
785 	for (i = 0; i < cluster->nchains; ++i) {
786 		if (cluster->array[i].chain)
787 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
788 	}
789 }
790 
791 /*
792  * Copy a cluster, returned a ref'd cluster.  All underlying chains
793  * are also ref'd, but not locked.  Focus state is also copied.
794  *
795  * Original cluster does not have to be locked but usually is.
796  * New cluster will not be flagged as locked.
797  *
798  * Callers using this function to initialize a new cluster from an inode
799  * generally lock and resolve the resulting cluster.
800  *
801  * Callers which use this function to save/restore a cluster structure
802  * generally retain the focus state and do not re-resolve it.  Caller should
803  * not try to re-resolve internal (cparent) node state during an iteration
804  * as the individual tracking elements of cparent in an iteration may not
805  * match even though they are correct.
806  */
807 hammer2_cluster_t *
808 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
809 {
810 	hammer2_pfs_t *pmp = ocluster->pmp;
811 	hammer2_cluster_t *ncluster;
812 	hammer2_chain_t *chain;
813 	int i;
814 
815 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
816 	ncluster->pmp = pmp;
817 	ncluster->nchains = ocluster->nchains;
818 	ncluster->refs = 1;
819 
820 	for (i = 0; i < ocluster->nchains; ++i) {
821 		chain = ocluster->array[i].chain;
822 		ncluster->array[i].chain = chain;
823 		ncluster->array[i].flags = ocluster->array[i].flags;
824 		if (chain)
825 			hammer2_chain_ref(chain);
826 	}
827 	ncluster->focus_index = ocluster->focus_index;
828 	ncluster->focus = ocluster->focus;
829 	ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
830 					      HAMMER2_CLUSTER_INODE);
831 
832 	return (ncluster);
833 }
834 
835 /*
836  * Unlock a cluster.  Refcount and focus is maintained.
837  */
838 void
839 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
840 {
841 	hammer2_chain_t *chain;
842 	int i;
843 
844 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
845 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
846 			cluster);
847 	}
848 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
849 	KKASSERT(cluster->refs > 0);
850 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
851 
852 	for (i = 0; i < cluster->nchains; ++i) {
853 		if (i == idx)
854 			continue;
855 		chain = cluster->array[i].chain;
856 		if (chain)
857 			hammer2_chain_unlock(chain);
858 	}
859 }
860 
861 void
862 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
863 {
864 	hammer2_cluster_unlock_except(cluster, -1);
865 }
866 
867 /*
868  * Resize the cluster's physical storage allocation in-place.  This may
869  * replace the cluster's chains.
870  */
871 void
872 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
873 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
874 		       int nradix, int flags)
875 {
876 	hammer2_chain_t *chain;
877 	int i;
878 
879 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
880 	KKASSERT(cparent->nchains == cluster->nchains);
881 
882 	for (i = 0; i < cluster->nchains; ++i) {
883 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
884 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
885 			continue;
886 		}
887 		chain = cluster->array[i].chain;
888 		if (chain) {
889 			KKASSERT(cparent->array[i].chain);
890 			hammer2_chain_resize(trans, ip,
891 					     cparent->array[i].chain, chain,
892 					     nradix, flags);
893 		}
894 	}
895 }
896 
897 /*
898  * Set an inode's cluster modified, marking the related chains RW and
899  * duplicating them if necessary.
900  *
901  * The passed-in chain is a localized copy of the chain previously acquired
902  * when the inode was locked (and possilby replaced in the mean time), and
903  * must also be updated.  In fact, we update it first and then synchronize
904  * the inode's cluster cache.
905  */
906 hammer2_inode_data_t *
907 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
908 			  hammer2_cluster_t *cluster, int flags)
909 {
910 	hammer2_inode_modify(trans, ip);
911 	hammer2_cluster_modify(trans, cluster, flags);
912 	hammer2_inode_repoint(ip, NULL, cluster);
913 	return (&hammer2_cluster_wdata(cluster)->ipdata);
914 }
915 
916 /*
917  * Adjust the cluster's chains to allow modification and adjust the
918  * focus.  Data will be accessible on return.
919  *
920  * If our focused master errors on modify, re-resolve the cluster to
921  * try to select a different master.
922  */
923 void
924 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
925 		       int flags)
926 {
927 	hammer2_chain_t *chain;
928 	int resolve_again;
929 	int i;
930 
931 	resolve_again = 0;
932 	for (i = 0; i < cluster->nchains; ++i) {
933 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
934 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
935 			continue;
936 		}
937 		chain = cluster->array[i].chain;
938 		if (chain == NULL)
939 			continue;
940 		if (chain->error)
941 			continue;
942 		hammer2_chain_modify(trans, chain, flags);
943 		if (cluster->focus == chain && chain->error) {
944 			cluster->error = chain->error;
945 			resolve_again = 1;
946 		}
947 	}
948 	if (resolve_again)
949 		hammer2_cluster_resolve(cluster);
950 }
951 
952 /*
953  * Synchronize modifications from the focus to other chains in a cluster.
954  * Convenient because nominal API users can just modify the contents of the
955  * focus (at least for non-blockref data).
956  *
957  * Nominal front-end operations only edit non-block-table data in a single
958  * chain.  This code copies such modifications to the other chains in the
959  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
960  * by both the frontend and the backend and will explode in fireworks if
961  * blindly copied.
962  */
963 void
964 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
965 {
966 	hammer2_chain_t *focus;
967 	hammer2_chain_t *scan;
968 	const hammer2_inode_data_t *ripdata;
969 	hammer2_inode_data_t *wipdata;
970 	int i;
971 
972 	focus = cluster->focus;
973 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
974 
975 	for (i = 0; i < cluster->nchains; ++i) {
976 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
977 			continue;
978 		scan = cluster->array[i].chain;
979 		if (scan == NULL || scan == focus)
980 			continue;
981 		if (scan->error)
982 			continue;
983 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
984 		KKASSERT(focus->bytes == scan->bytes &&
985 			 focus->bref.type == scan->bref.type);
986 		switch(focus->bref.type) {
987 		case HAMMER2_BREF_TYPE_INODE:
988 			ripdata = &focus->data->ipdata;
989 			wipdata = &scan->data->ipdata;
990 			if ((ripdata->meta.op_flags &
991 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
992 				bcopy(ripdata, wipdata,
993 				      offsetof(hammer2_inode_data_t, u));
994 				break;
995 			}
996 			/* fall through to full copy */
997 		case HAMMER2_BREF_TYPE_DATA:
998 			bcopy(focus->data, scan->data, focus->bytes);
999 			break;
1000 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1001 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1002 		case HAMMER2_BREF_TYPE_FREEMAP:
1003 		case HAMMER2_BREF_TYPE_VOLUME:
1004 			panic("hammer2_cluster_modsync: illegal node type");
1005 			/* NOT REACHED */
1006 			break;
1007 		default:
1008 			panic("hammer2_cluster_modsync: unknown node type");
1009 			break;
1010 		}
1011 	}
1012 }
1013 
1014 /*
1015  * Lookup initialization/completion API.  Returns a locked, fully resolved
1016  * cluster with one ref.
1017  */
1018 hammer2_cluster_t *
1019 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1020 {
1021 	hammer2_cluster_t *cluster;
1022 
1023 	cluster = hammer2_cluster_copy(cparent);
1024 	if (flags & HAMMER2_LOOKUP_SHARED) {
1025 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1026 					      HAMMER2_RESOLVE_SHARED);
1027 	} else {
1028 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1029 	}
1030 	hammer2_cluster_resolve(cluster);
1031 
1032 	return (cluster);
1033 }
1034 
1035 void
1036 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1037 {
1038 	if (cparent) {
1039 		hammer2_cluster_unlock(cparent);
1040 		hammer2_cluster_drop(cparent);
1041 	}
1042 }
1043 
1044 /*
1045  * Locate first match or overlap under parent, return a new, locked, resolved
1046  * cluster with one ref.
1047  *
1048  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1049  */
1050 hammer2_cluster_t *
1051 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1052 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1053 {
1054 	hammer2_pfs_t *pmp;
1055 	hammer2_cluster_t *cluster;
1056 	hammer2_chain_t *chain;
1057 	hammer2_key_t key_accum;
1058 	hammer2_key_t key_next;
1059 	int null_count;
1060 	int rflags;
1061 	int i;
1062 
1063 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1064 
1065 	pmp = cparent->pmp;				/* can be NULL */
1066 	key_accum = *key_nextp;
1067 	null_count = 0;
1068 	if (flags & HAMMER2_LOOKUP_SHARED)
1069 		rflags = HAMMER2_RESOLVE_SHARED;
1070 	else
1071 		rflags = 0;
1072 
1073 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1074 	cluster->pmp = pmp;				/* can be NULL */
1075 	cluster->refs = 1;
1076 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1077 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1078 
1079 	/*
1080 	 * Iterating earlier cluster elements with later elements still
1081 	 * locked is a problem, so we have to unlock the parent and then
1082 	 * re-lock as we go.
1083 	 */
1084 	hammer2_cluster_unlock(cparent);
1085 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1086 
1087 	/*
1088 	 * Pass-1, issue lookups.
1089 	 */
1090 	for (i = 0; i < cparent->nchains; ++i) {
1091 		cluster->array[i].flags = cparent->array[i].flags;
1092 		key_next = *key_nextp;
1093 
1094 		/*
1095 		 * Always relock the parent as we go.
1096 		 */
1097 		if (cparent->array[i].chain) {
1098 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1099 		}
1100 
1101 		/*
1102 		 * Nothing to base the lookup, or parent was not synchronized.
1103 		 */
1104 		if (cparent->array[i].chain == NULL ||
1105 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1106 			++null_count;
1107 			continue;
1108 		}
1109 
1110 		chain = hammer2_chain_lookup(&cparent->array[i].chain,
1111 					     &key_next,
1112 					     key_beg, key_end,
1113 					     &cparent->array[i].cache_index,
1114 					     flags);
1115 		cluster->array[i].chain = chain;
1116 		if (chain == NULL) {
1117 			++null_count;
1118 		}
1119 		if (key_accum > key_next)
1120 			key_accum = key_next;
1121 	}
1122 
1123 	/*
1124 	 * Cleanup
1125 	 */
1126 	cluster->nchains = i;
1127 	*key_nextp = key_accum;
1128 
1129 	/*
1130 	 * The cluster must be resolved, out of sync elements may be present.
1131 	 *
1132 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1133 	 */
1134 	if (null_count != i)
1135 		hammer2_cluster_resolve(cluster);
1136 	if (null_count == i ||
1137 	    (cluster->focus == NULL &&
1138 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1139 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1140 			hammer2_cluster_unlock(cluster);
1141 		hammer2_cluster_drop(cluster);
1142 		cluster = NULL;
1143 	}
1144 
1145 	return (cluster);
1146 }
1147 
1148 /*
1149  * Locate next match or overlap under parent, replace the passed-in cluster.
1150  * The returned cluster is a new, locked, resolved cluster with one ref.
1151  *
1152  * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1153  */
1154 hammer2_cluster_t *
1155 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1156 		     hammer2_key_t *key_nextp,
1157 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1158 {
1159 	hammer2_chain_t *ochain;
1160 	hammer2_chain_t *nchain;
1161 	hammer2_key_t key_accum;
1162 	hammer2_key_t key_next;
1163 	int parent_index;
1164 	int cluster_index;
1165 	int null_count;
1166 	int rflags;
1167 	int i;
1168 
1169 	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1170 
1171 	key_accum = *key_nextp;
1172 	null_count = 0;
1173 	parent_index = cparent->focus_index;	/* save prior focus */
1174 	cluster_index = cluster->focus_index;
1175 	if (flags & HAMMER2_LOOKUP_SHARED)
1176 		rflags = HAMMER2_RESOLVE_SHARED;
1177 	else
1178 		rflags = 0;
1179 
1180 	cluster->focus = NULL;		/* XXX needed any more? */
1181 	/*cparent->focus = NULL;*/
1182 	cluster->focus_index = 0;	/* XXX needed any more? */
1183 	/*cparent->focus_index = 0;*/
1184 
1185 	cluster->ddflag = 0;
1186 
1187 	/*
1188 	 * The parent is always locked on entry, the iterator may be locked
1189 	 * depending on flags.
1190 	 *
1191 	 * We must temporarily unlock the passed-in clusters to avoid a
1192 	 * deadlock between elements of the cluster with other threads.
1193 	 * We will fixup the lock in the loop.
1194 	 *
1195 	 * Note that this will clear the focus.
1196 	 *
1197 	 * Reflag the clusters as locked, because we will relock them
1198 	 * as we go.
1199 	 */
1200 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1201 		hammer2_cluster_unlock(cluster);
1202 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1203 	}
1204 	hammer2_cluster_unlock(cparent);
1205 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1206 
1207 	for (i = 0; i < cparent->nchains; ++i) {
1208 		key_next = *key_nextp;
1209 		ochain = cluster->array[i].chain;
1210 
1211 		/*
1212 		 * Always relock the parent as we go.
1213 		 */
1214 		if (cparent->array[i].chain)
1215 			hammer2_chain_lock(cparent->array[i].chain, rflags);
1216 
1217 		/*
1218 		 * Nothing to iterate from.  These cases can occur under
1219 		 * normal operations.  For example, during synchronization
1220 		 * a slave might reach the end of its scan while records
1221 		 * are still left on the master(s).
1222 		 */
1223 		if (ochain == NULL) {
1224 			++null_count;
1225 			continue;
1226 		}
1227 		if (cparent->array[i].chain == NULL ||
1228 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1229 		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1230 			/* ochain has not yet been relocked */
1231 			hammer2_chain_drop(ochain);
1232 			cluster->array[i].chain = NULL;
1233 			++null_count;
1234 			continue;
1235 		}
1236 
1237 		/*
1238 		 * Relock the child if necessary.  Parent and child will then
1239 		 * be locked as expected by hammer2_chain_next() and flags.
1240 		 */
1241 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1242 			hammer2_chain_lock(ochain, rflags);
1243 		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1244 					    &key_next, key_beg, key_end,
1245 					    &cparent->array[i].cache_index,
1246 					    flags);
1247 		/* ochain now invalid but can still be used for focus check */
1248 		if (parent_index == i) {
1249 			cparent->focus_index = i;
1250 			cparent->focus = cparent->array[i].chain;
1251 		}
1252 
1253 		cluster->array[i].chain = nchain;
1254 		if (nchain == NULL) {
1255 			++null_count;
1256 		}
1257 		if (key_accum > key_next)
1258 			key_accum = key_next;
1259 	}
1260 
1261 	/*
1262 	 * Cleanup
1263 	 */
1264 	cluster->nchains = i;
1265 	*key_nextp = key_accum;
1266 
1267 	/*
1268 	 * The cluster must be resolved, out of sync elements may be present.
1269 	 *
1270 	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1271 	 */
1272 	if (null_count != i)
1273 		hammer2_cluster_resolve(cluster);
1274 	if (null_count == i ||
1275 	    (cluster->focus == NULL &&
1276 	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1277 		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1278 			hammer2_cluster_unlock(cluster);
1279 		hammer2_cluster_drop(cluster);
1280 		cluster = NULL;
1281 	}
1282 	return(cluster);
1283 }
1284 
1285 /*
1286  * Advance just one chain in the cluster and recalculate the invalid bit.
1287  * The cluster index is allowed to be flagged invalid on input and is
1288  * recalculated on return.
1289  *
1290  * (used during synchronization to advance past a chain being deleted).
1291  *
1292  * The chain being advanced must not be the focus and the clusters in
1293  * question must have already passed normal cluster_lookup/cluster_next
1294  * checks.
1295  *
1296  * The cluster always remains intact on return, so void function.
1297  */
1298 void
1299 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1300 				  hammer2_cluster_t *cluster,
1301 				  hammer2_key_t *key_nextp,
1302 				  hammer2_key_t key_beg,
1303 				  hammer2_key_t key_end,
1304 				  int i, int flags)
1305 {
1306 	hammer2_chain_t *ochain;
1307 	hammer2_chain_t *nchain;
1308 	hammer2_chain_t *focus;
1309 	hammer2_key_t key_accum;
1310 	hammer2_key_t key_next;
1311 	int ddflag;
1312 
1313 	key_accum = *key_nextp;
1314 	key_next = *key_nextp;
1315 	ochain = cluster->array[i].chain;
1316 	if (ochain == NULL)
1317 		goto done;
1318 	KKASSERT(ochain != cluster->focus);
1319 
1320 	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1321 				    &key_next, key_beg, key_end,
1322 				    &cparent->array[i].cache_index,
1323 				    flags);
1324 	/* ochain now invalid */
1325 	if (cparent->focus_index == i)
1326 		cparent->focus = cparent->array[i].chain;
1327 
1328 	/*
1329 	 * Install nchain.  Note that nchain can be NULL, and can also
1330 	 * be in an unlocked state depending on flags.
1331 	 */
1332 	cluster->array[i].chain = nchain;
1333 	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1334 
1335 	if (key_accum > key_next)
1336 		key_accum = key_next;
1337 
1338 	focus = cluster->focus;
1339 	if (focus == NULL)
1340 		goto done;
1341 	if (nchain == NULL)
1342 		goto done;
1343 #if 0
1344 	if (nchain == focus)	/* ASSERTED NOT TRUE */
1345 		...
1346 #endif
1347 	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1348 	if (nchain->bref.type != focus->bref.type ||
1349 	    nchain->bref.key != focus->bref.key ||
1350 	    nchain->bref.keybits != focus->bref.keybits ||
1351 	    nchain->bref.modify_tid != focus->bref.modify_tid ||
1352 	    nchain->bytes != focus->bytes ||
1353 	    ddflag != cluster->ddflag) {
1354 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1355 	}
1356 
1357 done:
1358 	*key_nextp = key_accum;
1359 #if 0
1360 	/*
1361 	 * For now don't re-resolve cluster->flags.
1362 	 */
1363 	hammer2_cluster_resolve(cluster);
1364 #endif
1365 }
1366 
1367 /*
1368  * Create a new cluster using the specified key
1369  */
1370 int
1371 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1372 		     hammer2_cluster_t **clusterp,
1373 		     hammer2_key_t key, int keybits,
1374 		     int type, size_t bytes, int flags)
1375 {
1376 	hammer2_cluster_t *cluster;
1377 	hammer2_pfs_t *pmp;
1378 	int error;
1379 	int i;
1380 
1381 	pmp = trans->pmp;				/* can be NULL */
1382 
1383 	if ((cluster = *clusterp) == NULL) {
1384 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1385 				  M_WAITOK | M_ZERO);
1386 		cluster->pmp = pmp;			/* can be NULL */
1387 		cluster->refs = 1;
1388 		cluster->flags = HAMMER2_CLUSTER_LOCKED;
1389 	}
1390 	cluster->focus_index = 0;
1391 	cluster->focus = NULL;
1392 
1393 	/*
1394 	 * NOTE: cluster->array[] entries can initially be NULL.  If
1395 	 *	 *clusterp is supplied, skip NULL entries, otherwise
1396 	 *	 create new chains.
1397 	 */
1398 	for (i = 0; i < cparent->nchains; ++i) {
1399 		if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1400 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1401 			continue;
1402 		}
1403 		if (*clusterp) {
1404 			if ((cluster->array[i].flags &
1405 			     HAMMER2_CITEM_FEMOD) == 0) {
1406 				cluster->array[i].flags |=
1407 						HAMMER2_CITEM_INVALID;
1408 				continue;
1409 			}
1410 			if (cluster->array[i].chain == NULL)
1411 				continue;
1412 		}
1413 		error = hammer2_chain_create(trans, &cparent->array[i].chain,
1414 					     &cluster->array[i].chain, pmp,
1415 					     key, keybits,
1416 					     type, bytes, flags);
1417 		if (cparent->focus_index == i)
1418 			cparent->focus = cparent->array[i].chain;
1419 		KKASSERT(error == 0);
1420 		if (cluster->focus == NULL) {
1421 			cluster->focus_index = i;
1422 			cluster->focus = cluster->array[i].chain;
1423 		}
1424 		if (cparent->focus == cparent->array[i].chain) {
1425 			cluster->focus_index = i;
1426 			cluster->focus = cluster->array[i].chain;
1427 		}
1428 	}
1429 	cluster->nchains = i;
1430 	*clusterp = cluster;
1431 	hammer2_cluster_resolve(cluster);
1432 
1433 	return error;
1434 }
1435 
1436 /*
1437  * Rename a cluster to a new parent.
1438  *
1439  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1440  *	    So the data_off field is not relevant.  Only the key and
1441  *	    keybits are used.
1442  */
1443 void
1444 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1445 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1446 		       int flags)
1447 {
1448 	hammer2_chain_t *chain;
1449 	hammer2_blockref_t xbref;
1450 	int i;
1451 
1452 #if 0
1453 	cluster->focus = NULL;
1454 	cparent->focus = NULL;
1455 	cluster->focus_index = 0;
1456 	cparent->focus_index = 0;
1457 #endif
1458 
1459 	for (i = 0; i < cluster->nchains; ++i) {
1460 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1461 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1462 			continue;
1463 		}
1464 		chain = cluster->array[i].chain;
1465 		if (chain) {
1466 			if (bref) {
1467 				xbref = chain->bref;
1468 				xbref.key = bref->key;
1469 				xbref.keybits = bref->keybits;
1470 				hammer2_chain_rename(trans, &xbref,
1471 						     &cparent->array[i].chain,
1472 						     chain, flags);
1473 			} else {
1474 				hammer2_chain_rename(trans, NULL,
1475 						     &cparent->array[i].chain,
1476 						     chain, flags);
1477 			}
1478 			if (cparent->focus_index == i)
1479 				cparent->focus = cparent->array[i].chain;
1480 			KKASSERT(cluster->array[i].chain == chain); /*remove*/
1481 		}
1482 	}
1483 }
1484 
1485 /*
1486  * Mark a cluster deleted
1487  */
1488 void
1489 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1490 		       hammer2_cluster_t *cluster, int flags)
1491 {
1492 	hammer2_chain_t *chain;
1493 	hammer2_chain_t *parent;
1494 	int i;
1495 
1496 	if (cparent == NULL) {
1497 		kprintf("cparent is NULL\n");
1498 		return;
1499 	}
1500 
1501 	for (i = 0; i < cluster->nchains; ++i) {
1502 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1503 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1504 			continue;
1505 		}
1506 		parent = cparent->array[i].chain;
1507 		chain = cluster->array[i].chain;
1508 		if (chain == NULL)
1509 			continue;
1510 		if (chain->parent != parent) {
1511 			kprintf("hammer2_cluster_delete: parent "
1512 				"mismatch chain=%p parent=%p against=%p\n",
1513 				chain, chain->parent, parent);
1514 		} else {
1515 			hammer2_chain_delete(trans, parent, chain, flags);
1516 		}
1517 	}
1518 }
1519 
1520 /*
1521  * Create a snapshot of the specified {parent, ochain} with the specified
1522  * label.  The originating hammer2_inode must be exclusively locked for
1523  * safety.
1524  *
1525  * The ioctl code has already synced the filesystem.
1526  */
1527 int
1528 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1529 		       hammer2_ioc_pfs_t *pfs)
1530 {
1531 	hammer2_dev_t *hmp;
1532 	hammer2_cluster_t *ncluster;
1533 	const hammer2_inode_data_t *ripdata;
1534 	hammer2_inode_data_t *wipdata;
1535 	hammer2_chain_t *nchain;
1536 	hammer2_inode_t *nip;
1537 	size_t name_len;
1538 	hammer2_key_t lhc;
1539 	struct vattr vat;
1540 #if 0
1541 	uuid_t opfs_clid;
1542 #endif
1543 	int error;
1544 	int i;
1545 
1546 	kprintf("snapshot %s\n", pfs->name);
1547 
1548 	name_len = strlen(pfs->name);
1549 	lhc = hammer2_dirhash(pfs->name, name_len);
1550 
1551 	/*
1552 	 * Get the clid
1553 	 */
1554 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1555 #if 0
1556 	opfs_clid = ripdata->meta.pfs_clid;
1557 #endif
1558 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1559 
1560 	/*
1561 	 * Create the snapshot directory under the super-root
1562 	 *
1563 	 * Set PFS type, generate a unique filesystem id, and generate
1564 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1565 	 * which theoretically allows the snapshot to be used as part of
1566 	 * the same cluster (perhaps as a cache).
1567 	 *
1568 	 * Copy the (flushed) blockref array.  Theoretically we could use
1569 	 * chain_duplicate() but it becomes difficult to disentangle
1570 	 * the shared core so for now just brute-force it.
1571 	 */
1572 	VATTR_NULL(&vat);
1573 	vat.va_type = VDIR;
1574 	vat.va_mode = 0755;
1575 	ncluster = NULL;
1576 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1577 				   proc0.p_ucred, pfs->name, name_len,
1578 				   &ncluster,
1579 				   HAMMER2_INSERT_PFSROOT, &error);
1580 
1581 	if (nip) {
1582 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1583 		wipdata->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
1584 		wipdata->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1585 		wipdata->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
1586 		kern_uuidgen(&wipdata->meta.pfs_fsid, 1);
1587 
1588 		/*
1589 		 * Give the snapshot its own private cluster.  As a snapshot
1590 		 * no further synchronization with the original cluster will
1591 		 * be done.
1592 		 */
1593 #if 0
1594 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1595 			wipdata->meta.pfs_clid = opfs_clid;
1596 		else
1597 			kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1598 #endif
1599 		kern_uuidgen(&wipdata->meta.pfs_clid, 1);
1600 
1601 		for (i = 0; i < ncluster->nchains; ++i) {
1602 			if ((ncluster->array[i].flags &
1603 			     HAMMER2_CITEM_FEMOD) == 0) {
1604 				ncluster->array[i].flags |=
1605 					HAMMER2_CITEM_INVALID;
1606 				continue;
1607 			}
1608 			nchain = ncluster->array[i].chain;
1609 			if (nchain)
1610 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1611 		}
1612 
1613 		/* XXX hack blockset copy */
1614 		/* XXX doesn't work with real cluster */
1615 		KKASSERT(ocluster->nchains == 1);
1616 		wipdata->u.blockset = ripdata->u.blockset;
1617 		hammer2_cluster_modsync(ncluster);
1618 		for (i = 0; i < ncluster->nchains; ++i) {
1619 			nchain = ncluster->array[i].chain;
1620 			if (nchain)
1621 				hammer2_flush(trans, nchain, 1);
1622 		}
1623 		hammer2_inode_unlock(nip, ncluster);
1624 	}
1625 	return (error);
1626 }
1627 
1628 /*
1629  * Return locked parent cluster given a locked child.  The child remains
1630  * locked on return.  The new parent's focus follows the child's focus
1631  * and the parent is always resolved.
1632  *
1633  * We must temporarily unlock the passed-in cluster to avoid a deadlock
1634  * between elements of the cluster.
1635  *
1636  * We must not try to hammer2_cluster_resolve() cparent.  The individual
1637  * parent chains for the nodes are the correct parents for the cluster but
1638  * do not necessarily match, so resolve would likely implode.
1639  */
1640 hammer2_cluster_t *
1641 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1642 {
1643 	hammer2_cluster_t *cparent;
1644 	int i;
1645 
1646 	cparent = hammer2_cluster_copy(cluster);
1647 	hammer2_cluster_unlock(cluster);
1648 
1649 	for (i = 0; i < cparent->nchains; ++i) {
1650 		hammer2_chain_t *chain;
1651 		hammer2_chain_t *rchain;
1652 
1653 		/*
1654 		 * Calculate parent for each element.  Old chain has an extra
1655 		 * ref for cparent but the lock remains with cluster.
1656 		 */
1657 		chain = cparent->array[i].chain;
1658 		if (chain == NULL)
1659 			continue;
1660 		while ((rchain = chain->parent) != NULL) {
1661 			hammer2_chain_ref(rchain);
1662 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1663 			if (chain->parent == rchain)
1664 				break;
1665 			hammer2_chain_unlock(rchain);
1666 			hammer2_chain_drop(rchain);
1667 		}
1668 		cparent->array[i].chain = rchain;
1669 		hammer2_chain_drop(chain);
1670 	}
1671 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1672 	/* hammer2_cluster_resolve(cparent); */
1673 	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1674 
1675 	return cparent;
1676 }
1677 
1678 /************************************************************************
1679  *			        CLUSTER I/O 				*
1680  ************************************************************************
1681  *
1682  *
1683  * WARNING! blockref[] array data is not universal.  These functions should
1684  *	    only be used to access universal data.
1685  *
1686  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1687  *	    complete if necessary.  The I/O's should have already been
1688  *	    initiated by the cluster_lock/chain_lock operation.
1689  *
1690  *	    The cluster must already be in a modified state before wdata
1691  *	    is called.  The data will already be available for this case.
1692  */
1693 const hammer2_media_data_t *
1694 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1695 {
1696 	return(cluster->focus->data);
1697 }
1698 
1699 hammer2_media_data_t *
1700 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1701 {
1702 	KKASSERT(hammer2_cluster_modified(cluster));
1703 	return(cluster->focus->data);
1704 }
1705 
1706 /*
1707  * Load cluster data asynchronously with callback.
1708  *
1709  * The callback is made for the first validated data found, or NULL
1710  * if no valid data is available.
1711  *
1712  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1713  *	 in the inode with an exclusive lock held), the chain structure may be
1714  *	 shared.
1715  */
1716 void
1717 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1718 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1719 {
1720 	hammer2_chain_t *chain;
1721 	hammer2_iocb_t *iocb;
1722 	hammer2_dev_t *hmp;
1723 	hammer2_blockref_t *bref;
1724 	int i;
1725 
1726 	i = cluster->focus_index;
1727 	chain = cluster->focus;
1728 
1729 	iocb = &cluster->iocb;
1730 	iocb->callback = callback;
1731 	iocb->dio = NULL;		/* for already-validated case */
1732 	iocb->cluster = cluster;
1733 	iocb->chain = chain;
1734 	iocb->ptr = ptr;
1735 	iocb->lbase = (off_t)i;
1736 	iocb->flags = 0;
1737 	iocb->error = 0;
1738 
1739 	/*
1740 	 * Data already validated
1741 	 */
1742 	if (chain->data) {
1743 		callback(iocb);
1744 		return;
1745 	}
1746 
1747 	/*
1748 	 * We must resolve to a device buffer, either by issuing I/O or
1749 	 * by creating a zero-fill element.  We do not mark the buffer
1750 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1751 	 * API must still be used to do that).
1752 	 *
1753 	 * The device buffer is variable-sized in powers of 2 down
1754 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1755 	 * chunk always contains buffers of the same size. (XXX)
1756 	 *
1757 	 * The minimum physical IO size may be larger than the variable
1758 	 * block size.
1759 	 *
1760 	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1761 	 *	      matches hammer2_devblksize()?  Or does the freemap's
1762 	 *	      pre-zeroing handle the case for us?
1763 	 */
1764 	bref = &chain->bref;
1765 	hmp = chain->hmp;
1766 
1767 #if 0
1768 	/* handled by callback? <- TODO XXX even needed for loads? */
1769 	/*
1770 	 * The getblk() optimization for a 100% overwrite can only be used
1771 	 * if the physical block size matches the request.
1772 	 */
1773 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1774 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1775 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1776 		KKASSERT(error == 0);
1777 		iocb->dio = dio;
1778 		callback(iocb);
1779 		return;
1780 	}
1781 #endif
1782 
1783 	/*
1784 	 * Otherwise issue a read
1785 	 */
1786 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1787 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1788 }
1789