xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision adb2b6d689f2a8205f4f9f4fb7139f1010bbe51e)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125 	hammer2_chain_t *chain;
126 	int i;
127 
128 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 	for (i = 0; i < cluster->nchains; ++i) {
130 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 			continue;
132 		chain = cluster->array[i].chain;
133 		if (chain == NULL)
134 			continue;
135 		if (chain->error)
136 			continue;
137 		if (chain->bytes != bytes)
138 			return 1;
139 	}
140 	return 0;
141 }
142 
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 	if (cluster->error == 0)
154 		return(cluster->focus->bref.type);
155 	return 0;
156 }
157 
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 	if (cluster->error == 0)
168 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169 	return 0;
170 }
171 
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 	if (cluster->error == 0) {
187 		*bref = cluster->focus->bref;
188 		bref->data_off = 0;
189 	} else {
190 		bzero(bref, sizeof(*bref));
191 	}
192 }
193 
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int flags;
208 	int i;
209 
210 	flags = 0;
211 	for (i = 0; i < cluster->nchains; ++i) {
212 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213 			continue;
214 		chain = cluster->array[i].chain;
215 		if (chain)
216 			flags |= chain->flags;
217 	}
218 	return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220 
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230 	hammer2_chain_t *chain;
231 	int i;
232 
233 	for (i = 0; i < cluster->nchains; ++i) {
234 		chain = cluster->array[i].chain;
235 		if (chain)
236 			atomic_set_int(&chain->flags, flags);
237 	}
238 }
239 
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249 	hammer2_chain_t *chain;
250 	int i;
251 
252 	for (i = 0; i < cluster->nchains; ++i) {
253 		chain = cluster->array[i].chain;
254 		if (chain)
255 			atomic_clear_int(&chain->flags, flags);
256 	}
257 }
258 
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271 	hammer2_chain_t *chain;
272 	int i;
273 
274 	for (i = 0; i < cluster->nchains; ++i) {
275 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276 			continue;
277 		chain = cluster->array[i].chain;
278 		if (chain == NULL)
279 			continue;
280 		if (chain->error)
281 			continue;
282 		hammer2_chain_setflush(trans, chain);
283 	}
284 }
285 
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294 				hammer2_cluster_t *cluster,
295 				int check_algo)
296 {
297 	hammer2_chain_t *chain;
298 	int i;
299 
300 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301 	for (i = 0; i < cluster->nchains; ++i) {
302 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304 			continue;
305 		}
306 		chain = cluster->array[i].chain;
307 		if (chain == NULL)
308 			continue;
309 		if (chain->error)
310 			continue;
311 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312 		chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313 		chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314 	}
315 }
316 
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330 	hammer2_cluster_t *cluster;
331 
332 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333 	cluster->array[0].chain = chain;
334 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335 	cluster->nchains = 1;
336 	cluster->focus = chain;
337 	cluster->focus_index = 0;
338 	cluster->pmp = chain->pmp;
339 	cluster->refs = 1;
340 	cluster->error = chain->error;
341 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
342 			 HAMMER2_CLUSTER_WRHARD |
343 			 HAMMER2_CLUSTER_RDHARD |
344 			 HAMMER2_CLUSTER_MSYNCED |
345 			 HAMMER2_CLUSTER_SSYNCED;
346 
347 	return cluster;
348 }
349 
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359 	atomic_add_int(&cluster->refs, 1);
360 #if 0
361 	hammer2_chain_t *chain;
362 	int i;
363 
364 	for (i = 0; i < cluster->nchains; ++i) {
365 		chain = cluster->array[i].chain;
366 		if (chain)
367 			hammer2_chain_ref(chain);
368 	}
369 #endif
370 }
371 
372 /*
373  * Drop the caller's reference to the cluster.  When the ref count drops to
374  * zero this function frees the cluster and drops all underlying chains.
375  *
376  * In-progress read I/Os are typically detached from the cluster once the
377  * first one returns (the remaining stay attached to the DIOs but are then
378  * ignored and drop naturally).
379  */
380 void
381 hammer2_cluster_drop(hammer2_cluster_t *cluster)
382 {
383 	hammer2_chain_t *chain;
384 	int i;
385 
386 	KKASSERT(cluster->refs > 0);
387 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388 		cluster->focus = NULL;		/* safety XXX chg to assert */
389 		cluster->focus_index = 0;
390 
391 		for (i = 0; i < cluster->nchains; ++i) {
392 			chain = cluster->array[i].chain;
393 			if (chain) {
394 				hammer2_chain_drop(chain);
395 				cluster->array[i].chain = NULL; /* safety */
396 			}
397 		}
398 		cluster->nchains = 0;				/* safety */
399 
400 		kfree(cluster, M_HAMMER2);
401 		/* cluster is invalid */
402 	}
403 }
404 
405 void
406 hammer2_cluster_wait(hammer2_cluster_t *cluster)
407 {
408 	tsleep(cluster->focus, 0, "h2clcw", 1);
409 }
410 
411 /*
412  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
413  * and then locks them, modified by various RESOLVE flags.
414  *
415  * The act of locking a cluster sets its focus.  Note that cluster elements
416  * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus.  Locking a
417  * cluster does not adjust this flag since exact matches only matter for leafs
418  * (parents can depend on minor differences in topology).
419  *
420  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
421  * operations.  Typically this is only set on a quorum of MASTERs or
422  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
423  * is present, this bit is *not* set on a quorum of MASTERs.  The
424  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
425  * that create/modify/delete elements use it.
426  *
427  * The chains making up the cluster may be narrowed down based on quorum
428  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
429  * narrowed down to a single chain as long as the entire subtopology is known
430  * to be intact.  So, for example, we can narrow a read-only op to a single
431  * fast SLAVE but if we focus a CACHE chain we must still retain at least
432  * a SLAVE to ensure that the subtopology can be accessed.
433  *
434  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
435  * to be maintained once the topology is validated as-of the top level of
436  * the operation.
437  *
438  * If a failure occurs the operation must be aborted by higher-level code and
439  * retried. XXX
440  */
441 void
442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
443 {
444 	hammer2_chain_t *chain;
445 	int i;
446 
447 	/* cannot be on inode-embedded cluster template, must be on copy */
448 	KKASSERT(cluster->refs > 0);
449 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
450 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
451 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
452 			cluster);
453 	} else {
454 		KKASSERT(cluster->focus == NULL);
455 	}
456 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
457 
458 	/*
459 	 * Lock chains and resolve state.
460 	 */
461 	for (i = 0; i < cluster->nchains; ++i) {
462 		chain = cluster->array[i].chain;
463 		if (chain == NULL)
464 			continue;
465 		hammer2_chain_lock(chain, how);
466 	}
467 
468 	hammer2_cluster_resolve(cluster);
469 }
470 
471 void
472 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
473 {
474 	hammer2_chain_t *chain;
475 	hammer2_pfs_t *pmp;
476 	hammer2_tid_t quorum_tid;
477 	hammer2_tid_t last_best_quorum_tid;
478 	int focus_pfs_type;
479 	uint32_t nflags;
480 	int ttlmasters;
481 	int ttlslaves;
482 	int nmasters;
483 	int nslaves;
484 	int nquorum;
485 	int smpresent;
486 	int i;
487 
488 	cluster->error = 0;
489 	cluster->focus = NULL;
490 
491 	focus_pfs_type = 0;
492 	nflags = 0;
493 	ttlmasters = 0;
494 	ttlslaves = 0;
495 	nmasters = 0;
496 	nslaves = 0;
497 
498 	/*
499 	 * Calculate quorum
500 	 */
501 	pmp = cluster->pmp;
502 	KKASSERT(pmp != NULL || cluster->nchains == 0);
503 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
504 	smpresent = 0;
505 
506 	/*
507 	 * Pass 1
508 	 *
509 	 * NOTE: A NULL chain is not necessarily an error, it could be
510 	 *	 e.g. a lookup failure or the end of an iteration.
511 	 *	 Process normally.
512 	 */
513 	for (i = 0; i < cluster->nchains; ++i) {
514 		chain = cluster->array[i].chain;
515 		if (chain && chain->error) {
516 			if (cluster->focus == NULL || cluster->focus == chain) {
517 				/* error will be overridden by valid focus */
518 				cluster->error = chain->error;
519 			}
520 
521 			/*
522 			 * Must count total masters and slaves whether the
523 			 * chain is errored or not.
524 			 */
525 			switch (cluster->pmp->pfs_types[i]) {
526 			case HAMMER2_PFSTYPE_MASTER:
527 				++ttlmasters;
528 				break;
529 			case HAMMER2_PFSTYPE_SLAVE:
530 				++ttlslaves;
531 				break;
532 			}
533 			continue;
534 		}
535 		switch (cluster->pmp->pfs_types[i]) {
536 		case HAMMER2_PFSTYPE_MASTER:
537 			++ttlmasters;
538 			break;
539 		case HAMMER2_PFSTYPE_SLAVE:
540 			++ttlslaves;
541 			break;
542 		case HAMMER2_PFSTYPE_SOFT_MASTER:
543 			nflags |= HAMMER2_CLUSTER_WRSOFT;
544 			nflags |= HAMMER2_CLUSTER_RDSOFT;
545 			smpresent = 1;
546 			break;
547 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
548 			nflags |= HAMMER2_CLUSTER_RDSOFT;
549 			break;
550 		case HAMMER2_PFSTYPE_SUPROOT:
551 			/*
552 			 * Degenerate cluster representing the super-root
553 			 * topology on a single device.  Fake stuff so
554 			 * cluster ops work as expected.
555 			 */
556 			nflags |= HAMMER2_CLUSTER_WRHARD;
557 			nflags |= HAMMER2_CLUSTER_RDHARD;
558 			cluster->focus_index = i;
559 			cluster->focus = chain;
560 			cluster->error = chain ? chain->error : 0;
561 			break;
562 		default:
563 			break;
564 		}
565 	}
566 
567 	/*
568 	 * Pass 2
569 	 *
570 	 * Resolve masters.  Calculate nmasters for the highest matching
571 	 * TID, if a quorum cannot be attained try the next lower matching
572 	 * TID until we exhaust TIDs.
573 	 *
574 	 * NOTE: A NULL chain is not necessarily an error, it could be
575 	 *	 e.g. a lookup failure or the end of an iteration.
576 	 *	 Process normally.
577 	 */
578 	last_best_quorum_tid = HAMMER2_TID_MAX;
579 	quorum_tid = 0;		/* fix gcc warning */
580 
581 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
582 		nmasters = 0;
583 		quorum_tid = 0;
584 
585 		for (i = 0; i < cluster->nchains; ++i) {
586 			if (cluster->pmp->pfs_types[i] !=
587 			    HAMMER2_PFSTYPE_MASTER) {
588 				continue;
589 			}
590 			chain = cluster->array[i].chain;
591 
592 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
593 				/*
594 				 * Invalid as in unsynchronized, cannot be
595 				 * used to calculate the quorum.
596 				 */
597 			} else if (chain == NULL && quorum_tid == 0) {
598 				/*
599 				 * NULL chain on master matches NULL chains
600 				 * on other masters.
601 				 */
602 				++nmasters;
603 			} else if (quorum_tid < last_best_quorum_tid &&
604 				   chain != NULL &&
605 				   (quorum_tid < chain->bref.modify_tid ||
606 				    nmasters == 0)) {
607 				/*
608 				 * Better TID located, reset nmasters count.
609 				 */
610 				nmasters = 1;
611 				quorum_tid = chain->bref.modify_tid;
612 			} else if (chain &&
613 				   quorum_tid == chain->bref.modify_tid) {
614 				/*
615 				 * TID matches current collection.
616 				 */
617 				++nmasters;
618 			}
619 		}
620 		if (nmasters >= nquorum)
621 			break;
622 		last_best_quorum_tid = quorum_tid;
623 	}
624 
625 	/*
626 	 * Pass 3
627 	 *
628 	 * NOTE: A NULL chain is not necessarily an error, it could be
629 	 *	 e.g. a lookup failure or the end of an iteration.
630 	 *	 Process normally.
631 	 */
632 	for (i = 0; i < cluster->nchains; ++i) {
633 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
634 		chain = cluster->array[i].chain;
635 		if (chain && chain->error) {
636 			if (cluster->focus == NULL || cluster->focus == chain) {
637 				/* error will be overridden by valid focus */
638 				cluster->error = chain->error;
639 			}
640 			continue;
641 		}
642 
643 		switch (cluster->pmp->pfs_types[i]) {
644 		case HAMMER2_PFSTYPE_MASTER:
645 			/*
646 			 * We must have enough up-to-date masters to reach
647 			 * a quorum and the master modify_tid must match
648 			 * the quorum's modify_tid.
649 			 *
650 			 * Do not select an errored or out-of-sync master.
651 			 */
652 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
653 				nflags |= HAMMER2_CLUSTER_UNHARD;
654 			} else if (nmasters >= nquorum &&
655 				   (chain == NULL || chain->error == 0) &&
656 				   ((chain == NULL && quorum_tid == 0) ||
657 				    (chain != NULL && quorum_tid ==
658 						  chain->bref.modify_tid))) {
659 				nflags |= HAMMER2_CLUSTER_WRHARD;
660 				nflags |= HAMMER2_CLUSTER_RDHARD;
661 				if (!smpresent) {
662 					cluster->array[i].flags |=
663 							HAMMER2_CITEM_FEMOD;
664 				}
665 				if (cluster->focus == NULL ||
666 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
667 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
668 					cluster->focus_index = i;
669 					cluster->focus = chain; /* NULL ok */
670 					cluster->error = chain ? chain->error :
671 								 0;
672 				}
673 			} else if (chain == NULL || chain->error == 0) {
674 				nflags |= HAMMER2_CLUSTER_UNHARD;
675 			}
676 			break;
677 		case HAMMER2_PFSTYPE_SLAVE:
678 			/*
679 			 * We must have enough up-to-date masters to reach
680 			 * a quorum and the slave modify_tid must match the
681 			 * quorum's modify_tid.
682 			 *
683 			 * Do not select an errored slave.
684 			 */
685 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
686 				nflags |= HAMMER2_CLUSTER_UNHARD;
687 			} else if (nmasters >= nquorum &&
688 				   (chain == NULL || chain->error == 0) &&
689 				   ((chain == NULL && quorum_tid == 0) ||
690 				    (chain && quorum_tid ==
691 					      chain->bref.modify_tid))) {
692 				++nslaves;
693 				nflags |= HAMMER2_CLUSTER_RDHARD;
694 #if 0
695 				/* XXX optimize for RESOLVE_RDONLY */
696 				if (cluster->focus == NULL) {
697 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
698 					cluster->focus_index = i;
699 					cluster->focus = chain; /* NULL ok */
700 					cluster->error = chain ? chain->error :
701 								 0;
702 				}
703 #endif
704 			} else if (chain == NULL || chain->error == 0) {
705 				nflags |= HAMMER2_CLUSTER_UNSOFT;
706 			}
707 			break;
708 		case HAMMER2_PFSTYPE_SOFT_MASTER:
709 			/*
710 			 * Directly mounted soft master always wins.  There
711 			 * should be only one.
712 			 */
713 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
714 			cluster->focus_index = i;
715 			cluster->focus = chain;
716 			cluster->error = chain ? chain->error : 0;
717 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
718 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
719 			break;
720 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
721 			/*
722 			 * Directly mounted soft slave always wins.  There
723 			 * should be only one.
724 			 */
725 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
726 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
727 				cluster->focus_index = i;
728 				cluster->focus = chain;
729 				cluster->error = chain ? chain->error : 0;
730 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
731 			}
732 			break;
733 		case HAMMER2_PFSTYPE_SUPROOT:
734 			/*
735 			 * spmp (degenerate case)
736 			 */
737 			KKASSERT(i == 0);
738 			cluster->focus_index = i;
739 			cluster->focus = chain;
740 			cluster->error = chain ? chain->error : 0;
741 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
742 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
743 			break;
744 		default:
745 			break;
746 		}
747 	}
748 
749 	if (ttlslaves == 0)
750 		nflags |= HAMMER2_CLUSTER_NOSOFT;
751 	if (ttlmasters == 0)
752 		nflags |= HAMMER2_CLUSTER_NOHARD;
753 
754 	/*
755 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
756 	 * all available nodes (even if 0 are available) are fully
757 	 * synchronized.  This is used by the synchronization thread to
758 	 * determine if there is work it could potentially accomplish.
759 	 */
760 	if (nslaves == ttlslaves)
761 		nflags |= HAMMER2_CLUSTER_SSYNCED;
762 	if (nmasters == ttlmasters)
763 		nflags |= HAMMER2_CLUSTER_MSYNCED;
764 
765 	/*
766 	 * Determine if the cluster was successfully locked for the
767 	 * requested operation and generate an error code.  The cluster
768 	 * will not be locked (or ref'd) if an error is returned.
769 	 *
770 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
771 	 * to determine if reading or writing is possible.  If writing, the
772 	 * cluster still requires a call to hammer2_cluster_modify() first.
773 	 */
774 	atomic_set_int(&cluster->flags, nflags);
775 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
776 }
777 
778 /*
779  * Copy a cluster, returned a ref'd cluster.  All underlying chains
780  * are also ref'd, but not locked.
781  *
782  * The cluster focus is not set because the cluster is not yet locked
783  * (and the originating cluster does not have to be locked either).
784  */
785 hammer2_cluster_t *
786 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
787 {
788 	hammer2_pfs_t *pmp = ocluster->pmp;
789 	hammer2_cluster_t *ncluster;
790 	hammer2_chain_t *chain;
791 	int i;
792 
793 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
794 	ncluster->pmp = pmp;
795 	ncluster->nchains = ocluster->nchains;
796 	ncluster->refs = 1;
797 	ncluster->flags = 0;	/* cluster not locked */
798 
799 	for (i = 0; i < ocluster->nchains; ++i) {
800 		chain = ocluster->array[i].chain;
801 		ncluster->array[i].chain = chain;
802 		if (chain)
803 			hammer2_chain_ref(chain);
804 	}
805 	return (ncluster);
806 }
807 
808 /*
809  * Unlock and deref a cluster.  The cluster is destroyed if this is the
810  * last ref.
811  */
812 void
813 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
814 {
815 	hammer2_chain_t *chain;
816 	int i;
817 
818 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
819 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
820 			cluster);
821 	}
822 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
823 	KKASSERT(cluster->refs > 0);
824 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
825 
826 	for (i = 0; i < cluster->nchains; ++i) {
827 		chain = cluster->array[i].chain;
828 		if (chain)
829 			hammer2_chain_unlock(chain);
830 	}
831 	cluster->focus_index = 0;
832 	cluster->focus = NULL;
833 }
834 
835 /*
836  * Resize the cluster's physical storage allocation in-place.  This may
837  * replace the cluster's chains.
838  */
839 void
840 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
841 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
842 		       int nradix, int flags)
843 {
844 	hammer2_chain_t *chain;
845 	int i;
846 
847 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
848 	KKASSERT(cparent->nchains == cluster->nchains);
849 
850 	for (i = 0; i < cluster->nchains; ++i) {
851 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
852 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
853 			continue;
854 		}
855 		chain = cluster->array[i].chain;
856 		if (chain) {
857 			KKASSERT(cparent->array[i].chain);
858 			hammer2_chain_resize(trans, ip,
859 					     cparent->array[i].chain, chain,
860 					     nradix, flags);
861 		}
862 	}
863 }
864 
865 /*
866  * Set an inode's cluster modified, marking the related chains RW and
867  * duplicating them if necessary.
868  *
869  * The passed-in chain is a localized copy of the chain previously acquired
870  * when the inode was locked (and possilby replaced in the mean time), and
871  * must also be updated.  In fact, we update it first and then synchronize
872  * the inode's cluster cache.
873  */
874 hammer2_inode_data_t *
875 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
876 			  hammer2_cluster_t *cluster, int flags)
877 {
878 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
879 	hammer2_cluster_modify(trans, cluster, flags);
880 
881 	hammer2_inode_repoint(ip, NULL, cluster);
882 	if (ip->vp)
883 		vsetisdirty(ip->vp);
884 	return (&hammer2_cluster_wdata(cluster)->ipdata);
885 }
886 
887 /*
888  * Adjust the cluster's chains to allow modification and adjust the
889  * focus.  Data will be accessible on return.
890  *
891  * If our focused master errors on modify, re-resolve the cluster to
892  * try to select a different master.
893  */
894 void
895 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
896 		       int flags)
897 {
898 	hammer2_chain_t *chain;
899 	int resolve_again;
900 	int i;
901 
902 	resolve_again = 0;
903 	for (i = 0; i < cluster->nchains; ++i) {
904 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
905 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
906 			continue;
907 		}
908 		chain = cluster->array[i].chain;
909 		if (chain == NULL)
910 			continue;
911 		if (chain->error)
912 			continue;
913 		hammer2_chain_modify(trans, chain, flags);
914 		if (cluster->focus == chain && chain->error) {
915 			cluster->error = chain->error;
916 			resolve_again = 1;
917 		}
918 	}
919 	if (resolve_again)
920 		hammer2_cluster_resolve(cluster);
921 }
922 
923 /*
924  * Synchronize modifications from the focus to other chains in a cluster.
925  * Convenient because nominal API users can just modify the contents of the
926  * focus (at least for non-blockref data).
927  *
928  * Nominal front-end operations only edit non-block-table data in a single
929  * chain.  This code copies such modifications to the other chains in the
930  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
931  * by both the frontend and the backend and will explode in fireworks if
932  * blindly copied.
933  */
934 void
935 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
936 {
937 	hammer2_chain_t *focus;
938 	hammer2_chain_t *scan;
939 	const hammer2_inode_data_t *ripdata;
940 	hammer2_inode_data_t *wipdata;
941 	int i;
942 
943 	focus = cluster->focus;
944 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
945 
946 	for (i = 0; i < cluster->nchains; ++i) {
947 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
948 			continue;
949 		scan = cluster->array[i].chain;
950 		if (scan == NULL || scan == focus)
951 			continue;
952 		if (scan->error)
953 			continue;
954 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
955 		KKASSERT(focus->bytes == scan->bytes &&
956 			 focus->bref.type == scan->bref.type);
957 		switch(focus->bref.type) {
958 		case HAMMER2_BREF_TYPE_INODE:
959 			ripdata = &focus->data->ipdata;
960 			wipdata = &scan->data->ipdata;
961 			if ((ripdata->op_flags &
962 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
963 				bcopy(ripdata, wipdata,
964 				      offsetof(hammer2_inode_data_t, u));
965 				break;
966 			}
967 			/* fall through to full copy */
968 		case HAMMER2_BREF_TYPE_DATA:
969 			bcopy(focus->data, scan->data, focus->bytes);
970 			break;
971 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
972 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
973 		case HAMMER2_BREF_TYPE_FREEMAP:
974 		case HAMMER2_BREF_TYPE_VOLUME:
975 			panic("hammer2_cluster_modsync: illegal node type");
976 			/* NOT REACHED */
977 			break;
978 		default:
979 			panic("hammer2_cluster_modsync: unknown node type");
980 			break;
981 		}
982 	}
983 }
984 
985 /*
986  * Lookup initialization/completion API.  Returns a locked cluster with 1 ref.
987  */
988 hammer2_cluster_t *
989 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
990 {
991 	hammer2_cluster_t *cluster;
992 
993 	cluster = hammer2_cluster_copy(cparent);
994 	if (flags & HAMMER2_LOOKUP_SHARED) {
995 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
996 					      HAMMER2_RESOLVE_SHARED);
997 	} else {
998 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
999 	}
1000 	return (cluster);
1001 }
1002 
1003 void
1004 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1005 {
1006 	if (cparent) {
1007 		hammer2_cluster_unlock(cparent);
1008 		hammer2_cluster_drop(cparent);
1009 	}
1010 }
1011 
1012 /*
1013  * Locate first match or overlap under parent, return a new cluster
1014  */
1015 hammer2_cluster_t *
1016 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1017 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1018 {
1019 	hammer2_pfs_t *pmp;
1020 	hammer2_cluster_t *cluster;
1021 	hammer2_chain_t *chain;
1022 	hammer2_chain_t *focus;
1023 	hammer2_key_t key_accum;
1024 	hammer2_key_t key_next;
1025 	int null_count;
1026 	int i;
1027 
1028 	pmp = cparent->pmp;				/* can be NULL */
1029 	key_accum = *key_nextp;
1030 	null_count = 0;
1031 
1032 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1033 	cluster->pmp = pmp;				/* can be NULL */
1034 	cluster->refs = 1;
1035 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1036 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1037 
1038 	/*
1039 	 * Pass-1, issue lookup and find focus.
1040 	 */
1041 	for (i = 0; i < cparent->nchains; ++i) {
1042 		cluster->array[i].flags = cparent->array[i].flags;
1043 		key_next = *key_nextp;
1044 
1045 		/*
1046 		 * Nothing to base the lookup, or parent was not synchronized.
1047 		 */
1048 		if (cparent->array[i].chain == NULL ||
1049 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1050 			++null_count;
1051 			continue;
1052 		}
1053 
1054 		chain = hammer2_chain_lookup(&cparent->array[i].chain,
1055 					     &key_next,
1056 					     key_beg, key_end,
1057 					     &cparent->array[i].cache_index,
1058 					     flags);
1059 		if (cparent->focus_index == i)
1060 			cparent->focus = cparent->array[i].chain;
1061 		cluster->array[i].chain = chain;
1062 		if (chain == NULL) {
1063 			++null_count;
1064 		} else if (chain->error) {
1065 			/*
1066 			 * Leave errored chain in cluster, but it cannot be
1067 			 * the cluster's focus.  It is still possible for an
1068 			 * error'd chain to be synchronized (since we have
1069 			 * the bref), synchronization state will be handled
1070 			 * in pass-2.
1071 			 */
1072 			;
1073 		} else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1074 			/*
1075 			 * Leave unsynchronized chain in cluster, but it cannot
1076 			 * be the cluster's focus.
1077 			 */
1078 			;
1079 		} else {
1080 			int ddflag = (chain->bref.type ==
1081 				      HAMMER2_BREF_TYPE_INODE);
1082 
1083 			if (cluster->focus == NULL) {
1084 				cluster->focus_index = i;
1085 				cluster->focus = chain;
1086 				cluster->ddflag = ddflag;
1087 			}
1088 			if (cparent->focus == cparent->array[i].chain) {
1089 				cluster->focus_index = i;
1090 				cluster->focus = chain;
1091 				cluster->ddflag = ddflag;
1092 			}
1093 		}
1094 		if (key_accum > key_next)
1095 			key_accum = key_next;
1096 	}
1097 
1098 	/*
1099 	 * Pass-2 invalidate mismatches
1100 	 */
1101 	focus = cluster->focus;
1102 	if (focus == NULL)
1103 		goto done;
1104 
1105 	for (i = 0; i < cparent->nchains; ++i) {
1106 		int ddflag;
1107 
1108 		chain = cluster->array[i].chain;
1109 
1110 		if (chain == NULL)
1111 			continue;
1112 		if (chain == focus)
1113 			continue;
1114 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1115 			continue;
1116 
1117 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1118 		if (chain->bref.type != focus->bref.type ||
1119 		    chain->bref.key != focus->bref.key ||
1120 		    chain->bref.keybits != focus->bref.keybits ||
1121 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1122 		    chain->bytes != focus->bytes ||
1123 		    ddflag != cluster->ddflag) {
1124 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1125 		}
1126 	}
1127 
1128 	/*
1129 	 * Resolve cluster flags.  A lookup or locking failure could wind
1130 	 * up changing the cluster.
1131 	 */
1132 done:
1133 	*key_nextp = key_accum;
1134 	cluster->nchains = i;
1135 	hammer2_cluster_resolve(cluster);
1136 
1137 	if (null_count == i) {
1138 		hammer2_cluster_drop(cluster);
1139 		cluster = NULL;
1140 	}
1141 
1142 	return (cluster);
1143 }
1144 
1145 /*
1146  * Locate next match or overlap under parent, replace cluster
1147  */
1148 hammer2_cluster_t *
1149 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1150 		     hammer2_key_t *key_nextp,
1151 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1152 {
1153 	hammer2_chain_t *ochain;
1154 	hammer2_chain_t *nchain;
1155 	hammer2_chain_t *focus;
1156 	hammer2_key_t key_accum;
1157 	hammer2_key_t key_next;
1158 	int null_count;
1159 	int i;
1160 
1161 	key_accum = *key_nextp;
1162 	null_count = 0;
1163 	cluster->focus = NULL;
1164 	cparent->focus = NULL;
1165 	cluster->focus_index = 0;
1166 	cparent->focus_index = 0;
1167 
1168 	cluster->ddflag = 0;
1169 
1170 	for (i = 0; i < cparent->nchains; ++i) {
1171 		key_next = *key_nextp;
1172 		ochain = cluster->array[i].chain;
1173 
1174 		/*
1175 		 * Nothing to iterate from.  These cases can occur under
1176 		 * normal operations.  For example, during synchronization
1177 		 * a slave might reach the end of its scan while records
1178 		 * are still left on the master(s).
1179 		 */
1180 		if (ochain == NULL) {
1181 			++null_count;
1182 			continue;
1183 		}
1184 		if (cparent->array[i].chain == NULL ||
1185 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1186 		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1187 			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1188 				hammer2_chain_unlock(ochain);
1189 			hammer2_chain_drop(ochain);
1190 			cluster->array[i].chain = NULL;
1191 			++null_count;
1192 			continue;
1193 		}
1194 
1195 		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1196 					    &key_next, key_beg, key_end,
1197 					    &cparent->array[i].cache_index,
1198 					    flags);
1199 		if (cparent->focus_index == i)
1200 			cparent->focus = cparent->array[i].chain;
1201 		/* ochain now invalid but can still be used for focus check */
1202 
1203 		cluster->array[i].chain = nchain;
1204 		if (nchain == NULL) {
1205 			++null_count;
1206 		} else if (nchain->error) {
1207 			/*
1208 			 * Leave errored chain in cluster, but it cannot be
1209 			 * the cluster's focus.
1210 			 */
1211 			;
1212 		} else {
1213 			int ddflag = (nchain->bref.type ==
1214 				      HAMMER2_BREF_TYPE_INODE);
1215 
1216 			/*
1217 			 * Possible new focus.
1218 			 */
1219 			if (cluster->focus == NULL) {
1220 				cluster->ddflag = ddflag;
1221 				cluster->focus_index = i;
1222 				cluster->focus = nchain;
1223 			}
1224 
1225 			/*
1226 			 * Fixup pre-existing focus.
1227 			 */
1228 			if (cluster->focus == ochain) {
1229 				cluster->focus_index = i;
1230 				cluster->focus = nchain;
1231 			}
1232 		}
1233 		if (key_accum > key_next)
1234 			key_accum = key_next;
1235 	}
1236 
1237 	/*
1238 	 * Pass-2 invalidate mismatches
1239 	 */
1240 	focus = cluster->focus;
1241 	if (focus == NULL)
1242 		goto done;
1243 
1244 	for (i = 0; i < cparent->nchains; ++i) {
1245 		int ddflag;
1246 
1247 		nchain = cluster->array[i].chain;
1248 
1249 		if (nchain == NULL)
1250 			continue;
1251 		if (nchain == focus)
1252 			continue;
1253 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1254 			continue;
1255 
1256 		ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1257 		if (nchain->bref.type != focus->bref.type ||
1258 		    nchain->bref.key != focus->bref.key ||
1259 		    nchain->bref.keybits != focus->bref.keybits ||
1260 		    nchain->bref.modify_tid != focus->bref.modify_tid ||
1261 		    nchain->bytes != focus->bytes ||
1262 		    ddflag != cluster->ddflag) {
1263 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1264 		}
1265 	}
1266 
1267 done:
1268 	*key_nextp = key_accum;
1269 	cluster->nchains = i;
1270 	hammer2_cluster_resolve(cluster);
1271 
1272 	if (null_count == i) {
1273 		hammer2_cluster_drop(cluster);
1274 		cluster = NULL;
1275 	}
1276 	return(cluster);
1277 }
1278 
1279 /*
1280  * Advance just one chain in the cluster and recalculate the invalid bit.
1281  * The cluster index is allowed to be flagged invalid on input and is
1282  * recalculated on return.
1283  *
1284  * (used during synchronization to advance past a chain being deleted).
1285  *
1286  * The chain being advanced must not be the focus and the clusters in
1287  * question must have already passed normal cluster_lookup/cluster_next
1288  * checks.
1289  *
1290  * The cluster always remains intact on return, so void function.
1291  */
1292 void
1293 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1294 				  hammer2_cluster_t *cluster,
1295 				  hammer2_key_t *key_nextp,
1296 				  hammer2_key_t key_beg,
1297 				  hammer2_key_t key_end,
1298 				  int i, int flags)
1299 {
1300 	hammer2_chain_t *ochain;
1301 	hammer2_chain_t *nchain;
1302 	hammer2_chain_t *focus;
1303 	hammer2_key_t key_accum;
1304 	hammer2_key_t key_next;
1305 	int ddflag;
1306 
1307 	key_accum = *key_nextp;
1308 	key_next = *key_nextp;
1309 	ochain = cluster->array[i].chain;
1310 	if (ochain == NULL)
1311 		goto done;
1312 	KKASSERT(ochain != cluster->focus);
1313 
1314 	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1315 				    &key_next, key_beg, key_end,
1316 				    &cparent->array[i].cache_index,
1317 				    flags);
1318 	if (cparent->focus_index == i)
1319 		cparent->focus = cparent->array[i].chain;
1320 	/* ochain now invalid */
1321 
1322 	/*
1323 	 * Install nchain.  Note that nchain can be NULL, and can also
1324 	 * be in an unlocked state depending on flags.
1325 	 */
1326 	cluster->array[i].chain = nchain;
1327 	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1328 
1329 	if (key_accum > key_next)
1330 		key_accum = key_next;
1331 
1332 	focus = cluster->focus;
1333 	if (focus == NULL)
1334 		goto done;
1335 	if (nchain == NULL)
1336 		goto done;
1337 #if 0
1338 	if (nchain == focus)	/* ASSERTED NOT TRUE */
1339 		...
1340 #endif
1341 	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1342 	if (nchain->bref.type != focus->bref.type ||
1343 	    nchain->bref.key != focus->bref.key ||
1344 	    nchain->bref.keybits != focus->bref.keybits ||
1345 	    nchain->bref.modify_tid != focus->bref.modify_tid ||
1346 	    nchain->bytes != focus->bytes ||
1347 	    ddflag != cluster->ddflag) {
1348 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1349 	}
1350 
1351 done:
1352 	*key_nextp = key_accum;
1353 #if 0
1354 	/*
1355 	 * For now don't re-resolve cluster->flags.
1356 	 */
1357 	hammer2_cluster_resolve(cluster);
1358 #endif
1359 }
1360 
1361 /*
1362  * Create a new cluster using the specified key
1363  */
1364 int
1365 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1366 		     hammer2_cluster_t **clusterp,
1367 		     hammer2_key_t key, int keybits,
1368 		     int type, size_t bytes, int flags)
1369 {
1370 	hammer2_cluster_t *cluster;
1371 	hammer2_pfs_t *pmp;
1372 	int error;
1373 	int i;
1374 
1375 	pmp = trans->pmp;				/* can be NULL */
1376 
1377 	if ((cluster = *clusterp) == NULL) {
1378 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1379 				  M_WAITOK | M_ZERO);
1380 		cluster->pmp = pmp;			/* can be NULL */
1381 		cluster->refs = 1;
1382 		cluster->flags = HAMMER2_CLUSTER_LOCKED;
1383 	}
1384 	cluster->focus_index = 0;
1385 	cluster->focus = NULL;
1386 
1387 	/*
1388 	 * NOTE: cluster->array[] entries can initially be NULL.  If
1389 	 *	 *clusterp is supplied, skip NULL entries, otherwise
1390 	 *	 create new chains.
1391 	 */
1392 	for (i = 0; i < cparent->nchains; ++i) {
1393 		if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1394 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1395 			continue;
1396 		}
1397 		if (*clusterp) {
1398 			if ((cluster->array[i].flags &
1399 			     HAMMER2_CITEM_FEMOD) == 0) {
1400 				cluster->array[i].flags |=
1401 						HAMMER2_CITEM_INVALID;
1402 				continue;
1403 			}
1404 			if (cluster->array[i].chain == NULL)
1405 				continue;
1406 		}
1407 		error = hammer2_chain_create(trans, &cparent->array[i].chain,
1408 					     &cluster->array[i].chain, pmp,
1409 					     key, keybits,
1410 					     type, bytes, flags);
1411 		if (cparent->focus_index == i)
1412 			cparent->focus = cparent->array[i].chain;
1413 		KKASSERT(error == 0);
1414 		if (cluster->focus == NULL) {
1415 			cluster->focus_index = i;
1416 			cluster->focus = cluster->array[i].chain;
1417 		}
1418 		if (cparent->focus == cparent->array[i].chain) {
1419 			cluster->focus_index = i;
1420 			cluster->focus = cluster->array[i].chain;
1421 		}
1422 	}
1423 	cluster->nchains = i;
1424 	*clusterp = cluster;
1425 	hammer2_cluster_resolve(cluster);
1426 
1427 	return error;
1428 }
1429 
1430 /*
1431  * Rename a cluster to a new parent.
1432  *
1433  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1434  *	    So the data_off field is not relevant.  Only the key and
1435  *	    keybits are used.
1436  */
1437 void
1438 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1439 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1440 		       int flags)
1441 {
1442 	hammer2_chain_t *chain;
1443 	hammer2_blockref_t xbref;
1444 	int i;
1445 
1446 #if 0
1447 	cluster->focus = NULL;
1448 	cparent->focus = NULL;
1449 	cluster->focus_index = 0;
1450 	cparent->focus_index = 0;
1451 #endif
1452 
1453 	for (i = 0; i < cluster->nchains; ++i) {
1454 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1455 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1456 			continue;
1457 		}
1458 		chain = cluster->array[i].chain;
1459 		if (chain) {
1460 			if (bref) {
1461 				xbref = chain->bref;
1462 				xbref.key = bref->key;
1463 				xbref.keybits = bref->keybits;
1464 				hammer2_chain_rename(trans, &xbref,
1465 						     &cparent->array[i].chain,
1466 						     chain, flags);
1467 			} else {
1468 				hammer2_chain_rename(trans, NULL,
1469 						     &cparent->array[i].chain,
1470 						     chain, flags);
1471 			}
1472 			if (cparent->focus_index == i)
1473 				cparent->focus = cparent->array[i].chain;
1474 			KKASSERT(cluster->array[i].chain == chain); /*remove*/
1475 		}
1476 	}
1477 }
1478 
1479 /*
1480  * Mark a cluster deleted
1481  */
1482 void
1483 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1484 		       hammer2_cluster_t *cluster, int flags)
1485 {
1486 	hammer2_chain_t *chain;
1487 	hammer2_chain_t *parent;
1488 	int i;
1489 
1490 	if (cparent == NULL) {
1491 		kprintf("cparent is NULL\n");
1492 		return;
1493 	}
1494 
1495 	for (i = 0; i < cluster->nchains; ++i) {
1496 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1497 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1498 			continue;
1499 		}
1500 		parent = cparent->array[i].chain;
1501 		chain = cluster->array[i].chain;
1502 		if (chain == NULL)
1503 			continue;
1504 		if (chain->parent != parent) {
1505 			kprintf("hammer2_cluster_delete: parent "
1506 				"mismatch chain=%p parent=%p against=%p\n",
1507 				chain, chain->parent, parent);
1508 		} else {
1509 			hammer2_chain_delete(trans, parent, chain, flags);
1510 		}
1511 	}
1512 }
1513 
1514 /*
1515  * Create a snapshot of the specified {parent, ochain} with the specified
1516  * label.  The originating hammer2_inode must be exclusively locked for
1517  * safety.
1518  *
1519  * The ioctl code has already synced the filesystem.
1520  */
1521 int
1522 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1523 		       hammer2_ioc_pfs_t *pfs)
1524 {
1525 	hammer2_dev_t *hmp;
1526 	hammer2_cluster_t *ncluster;
1527 	const hammer2_inode_data_t *ripdata;
1528 	hammer2_inode_data_t *wipdata;
1529 	hammer2_chain_t *nchain;
1530 	hammer2_inode_t *nip;
1531 	size_t name_len;
1532 	hammer2_key_t lhc;
1533 	struct vattr vat;
1534 #if 0
1535 	uuid_t opfs_clid;
1536 #endif
1537 	int error;
1538 	int i;
1539 
1540 	kprintf("snapshot %s\n", pfs->name);
1541 
1542 	name_len = strlen(pfs->name);
1543 	lhc = hammer2_dirhash(pfs->name, name_len);
1544 
1545 	/*
1546 	 * Get the clid
1547 	 */
1548 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1549 #if 0
1550 	opfs_clid = ripdata->pfs_clid;
1551 #endif
1552 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1553 
1554 	/*
1555 	 * Create the snapshot directory under the super-root
1556 	 *
1557 	 * Set PFS type, generate a unique filesystem id, and generate
1558 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1559 	 * which theoretically allows the snapshot to be used as part of
1560 	 * the same cluster (perhaps as a cache).
1561 	 *
1562 	 * Copy the (flushed) blockref array.  Theoretically we could use
1563 	 * chain_duplicate() but it becomes difficult to disentangle
1564 	 * the shared core so for now just brute-force it.
1565 	 */
1566 	VATTR_NULL(&vat);
1567 	vat.va_type = VDIR;
1568 	vat.va_mode = 0755;
1569 	ncluster = NULL;
1570 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1571 				   proc0.p_ucred, pfs->name, name_len,
1572 				   &ncluster,
1573 				   HAMMER2_INSERT_PFSROOT, &error);
1574 
1575 	if (nip) {
1576 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1577 		wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1578 		wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1579 		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1580 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1581 
1582 		/*
1583 		 * Give the snapshot its own private cluster.  As a snapshot
1584 		 * no further synchronization with the original cluster will
1585 		 * be done.
1586 		 */
1587 #if 0
1588 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1589 			wipdata->pfs_clid = opfs_clid;
1590 		else
1591 			kern_uuidgen(&wipdata->pfs_clid, 1);
1592 #endif
1593 		kern_uuidgen(&wipdata->pfs_clid, 1);
1594 
1595 		for (i = 0; i < ncluster->nchains; ++i) {
1596 			if ((ncluster->array[i].flags &
1597 			     HAMMER2_CITEM_FEMOD) == 0) {
1598 				ncluster->array[i].flags |=
1599 					HAMMER2_CITEM_INVALID;
1600 				continue;
1601 			}
1602 			nchain = ncluster->array[i].chain;
1603 			if (nchain)
1604 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1605 		}
1606 #if 0
1607 		/* XXX can't set this unless we do an explicit flush, which
1608 		   we also need a pmp assigned to do, else the flush code
1609 		   won't flush ncluster because it thinks it is crossing a
1610 		   flush boundary */
1611 		hammer2_cluster_set_chainflags(ncluster,
1612 					       HAMMER2_CHAIN_PFSBOUNDARY);
1613 #endif
1614 
1615 		/* XXX hack blockset copy */
1616 		/* XXX doesn't work with real cluster */
1617 		KKASSERT(ocluster->nchains == 1);
1618 		wipdata->u.blockset = ripdata->u.blockset;
1619 		hammer2_cluster_modsync(ncluster);
1620 		for (i = 0; i < ncluster->nchains; ++i) {
1621 			nchain = ncluster->array[i].chain;
1622 			if (nchain)
1623 				hammer2_flush(trans, nchain);
1624 		}
1625 		hammer2_inode_unlock(nip, ncluster);
1626 	}
1627 	return (error);
1628 }
1629 
1630 /*
1631  * Return locked parent cluster given a locked child.  The child remains
1632  * locked on return.  The new parent's focus follows the child's focus
1633  * and the parent is always resolved.
1634  */
1635 hammer2_cluster_t *
1636 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1637 {
1638 	hammer2_cluster_t *cparent;
1639 	int i;
1640 
1641 	cparent = hammer2_cluster_copy(cluster);
1642 
1643 	for (i = 0; i < cparent->nchains; ++i) {
1644 		hammer2_chain_t *chain;
1645 		hammer2_chain_t *rchain;
1646 
1647 		/*
1648 		 * Calculate parent for each element.  Old chain has an extra
1649 		 * ref for cparent but the lock remains with cluster.
1650 		 */
1651 		chain = cparent->array[i].chain;
1652 		if (chain == NULL)
1653 			continue;
1654 		while ((rchain = chain->parent) != NULL) {
1655 			hammer2_chain_ref(rchain);
1656 			hammer2_chain_unlock(chain);
1657 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1658 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1659 			if (chain->parent == rchain)
1660 				break;
1661 			hammer2_chain_unlock(rchain);
1662 			hammer2_chain_drop(rchain);
1663 		}
1664 		if (cluster->focus == chain) {
1665 			cparent->focus_index = i;
1666 			cparent->focus = rchain;
1667 		}
1668 		cparent->array[i].chain = rchain;
1669 		hammer2_chain_drop(chain);
1670 	}
1671 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1672 	hammer2_cluster_resolve(cparent);
1673 
1674 	return cparent;
1675 }
1676 
1677 /************************************************************************
1678  *			        CLUSTER I/O 				*
1679  ************************************************************************
1680  *
1681  *
1682  * WARNING! blockref[] array data is not universal.  These functions should
1683  *	    only be used to access universal data.
1684  *
1685  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1686  *	    complete if necessary.  The I/O's should have already been
1687  *	    initiated by the cluster_lock/chain_lock operation.
1688  *
1689  *	    The cluster must already be in a modified state before wdata
1690  *	    is called.  The data will already be available for this case.
1691  */
1692 const hammer2_media_data_t *
1693 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1694 {
1695 	return(cluster->focus->data);
1696 }
1697 
1698 hammer2_media_data_t *
1699 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1700 {
1701 	KKASSERT(hammer2_cluster_modified(cluster));
1702 	return(cluster->focus->data);
1703 }
1704 
1705 /*
1706  * Load cluster data asynchronously with callback.
1707  *
1708  * The callback is made for the first validated data found, or NULL
1709  * if no valid data is available.
1710  *
1711  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1712  *	 in the inode with an exclusive lock held), the chain structure may be
1713  *	 shared.
1714  */
1715 void
1716 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1717 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1718 {
1719 	hammer2_chain_t *chain;
1720 	hammer2_iocb_t *iocb;
1721 	hammer2_dev_t *hmp;
1722 	hammer2_blockref_t *bref;
1723 	int i;
1724 
1725 	/*
1726 	 * Try to find a chain whos data is already resolved.  If none can
1727 	 * be found, start with the first chain.
1728 	 */
1729 	chain = NULL;
1730 	for (i = 0; i < cluster->nchains; ++i) {
1731 		chain = cluster->array[i].chain;
1732 		if (chain && chain->data)
1733 			break;
1734 	}
1735 	if (i == cluster->nchains) {
1736 		chain = cluster->array[0].chain;
1737 		i = 0;
1738 	}
1739 
1740 	iocb = &cluster->iocb;
1741 	iocb->callback = callback;
1742 	iocb->dio = NULL;		/* for already-validated case */
1743 	iocb->cluster = cluster;
1744 	iocb->chain = chain;
1745 	iocb->ptr = ptr;
1746 	iocb->lbase = (off_t)i;
1747 	iocb->flags = 0;
1748 	iocb->error = 0;
1749 
1750 	/*
1751 	 * Data already validated
1752 	 */
1753 	if (chain->data) {
1754 		callback(iocb);
1755 		return;
1756 	}
1757 
1758 	/*
1759 	 * We must resolve to a device buffer, either by issuing I/O or
1760 	 * by creating a zero-fill element.  We do not mark the buffer
1761 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1762 	 * API must still be used to do that).
1763 	 *
1764 	 * The device buffer is variable-sized in powers of 2 down
1765 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1766 	 * chunk always contains buffers of the same size. (XXX)
1767 	 *
1768 	 * The minimum physical IO size may be larger than the variable
1769 	 * block size.
1770 	 *
1771 	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1772 	 *	      matches hammer2_devblksize()?  Or does the freemap's
1773 	 *	      pre-zeroing handle the case for us?
1774 	 */
1775 	bref = &chain->bref;
1776 	hmp = chain->hmp;
1777 
1778 #if 0
1779 	/* handled by callback? <- TODO XXX even needed for loads? */
1780 	/*
1781 	 * The getblk() optimization for a 100% overwrite can only be used
1782 	 * if the physical block size matches the request.
1783 	 */
1784 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1785 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1786 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1787 		KKASSERT(error == 0);
1788 		iocb->dio = dio;
1789 		callback(iocb);
1790 		return;
1791 	}
1792 #endif
1793 
1794 	/*
1795 	 * Otherwise issue a read
1796 	 */
1797 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1798 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1799 }
1800