xref: /dflybsd-src/usr.sbin/makefs/hammer2/hammer2_cluster.c (revision 8a656edff0678e34ac00175186cb6850eadc9441)
1*2d60b848STomohiro Kusumi /*
2*2d60b848STomohiro Kusumi  * SPDX-License-Identifier: BSD-3-Clause
3*2d60b848STomohiro Kusumi  *
4*2d60b848STomohiro Kusumi  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5*2d60b848STomohiro Kusumi  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
6*2d60b848STomohiro Kusumi  *
7*2d60b848STomohiro Kusumi  * This code is derived from software contributed to The DragonFly Project
8*2d60b848STomohiro Kusumi  * by Matthew Dillon <dillon@dragonflybsd.org>
9*2d60b848STomohiro Kusumi  *
10*2d60b848STomohiro Kusumi  * Redistribution and use in source and binary forms, with or without
11*2d60b848STomohiro Kusumi  * modification, are permitted provided that the following conditions
12*2d60b848STomohiro Kusumi  * are met:
13*2d60b848STomohiro Kusumi  *
14*2d60b848STomohiro Kusumi  * 1. Redistributions of source code must retain the above copyright
15*2d60b848STomohiro Kusumi  *    notice, this list of conditions and the following disclaimer.
16*2d60b848STomohiro Kusumi  * 2. Redistributions in binary form must reproduce the above copyright
17*2d60b848STomohiro Kusumi  *    notice, this list of conditions and the following disclaimer in
18*2d60b848STomohiro Kusumi  *    the documentation and/or other materials provided with the
19*2d60b848STomohiro Kusumi  *    distribution.
20*2d60b848STomohiro Kusumi  * 3. Neither the name of The DragonFly Project nor the names of its
21*2d60b848STomohiro Kusumi  *    contributors may be used to endorse or promote products derived
22*2d60b848STomohiro Kusumi  *    from this software without specific, prior written permission.
23*2d60b848STomohiro Kusumi  *
24*2d60b848STomohiro Kusumi  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25*2d60b848STomohiro Kusumi  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26*2d60b848STomohiro Kusumi  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27*2d60b848STomohiro Kusumi  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28*2d60b848STomohiro Kusumi  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29*2d60b848STomohiro Kusumi  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30*2d60b848STomohiro Kusumi  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31*2d60b848STomohiro Kusumi  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32*2d60b848STomohiro Kusumi  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33*2d60b848STomohiro Kusumi  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34*2d60b848STomohiro Kusumi  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35*2d60b848STomohiro Kusumi  * SUCH DAMAGE.
36*2d60b848STomohiro Kusumi  */
37*2d60b848STomohiro Kusumi /*
38*2d60b848STomohiro Kusumi  * The cluster module collects multiple chains representing the same
39*2d60b848STomohiro Kusumi  * information from different nodes into a single entity.  It allows direct
40*2d60b848STomohiro Kusumi  * access to media data as long as it is not blockref array data (which
41*2d60b848STomohiro Kusumi  * will obviously have to be different at each node).
42*2d60b848STomohiro Kusumi  *
43*2d60b848STomohiro Kusumi  * This module also handles I/O dispatch, status rollup, and various
44*2d60b848STomohiro Kusumi  * mastership arrangements including quorum operations.  It effectively
45*2d60b848STomohiro Kusumi  * presents one topology to the vnops layer.
46*2d60b848STomohiro Kusumi  *
47*2d60b848STomohiro Kusumi  * Many of the API calls mimic chain API calls but operate on clusters
48*2d60b848STomohiro Kusumi  * instead of chains.  Please see hammer2_chain.c for more complete code
49*2d60b848STomohiro Kusumi  * documentation of the API functions.
50*2d60b848STomohiro Kusumi  *
51*2d60b848STomohiro Kusumi  * WARNING! This module is *extremely* complex.  It must issue asynchronous
52*2d60b848STomohiro Kusumi  *	    locks and I/O, do quorum and/or master-slave processing, and
53*2d60b848STomohiro Kusumi  *	    it must operate properly even if some nodes are broken (which
54*2d60b848STomohiro Kusumi  *	    can also mean indefinite locks).
55*2d60b848STomohiro Kusumi  *
56*2d60b848STomohiro Kusumi  *				CLUSTER OPERATIONS
57*2d60b848STomohiro Kusumi  *
58*2d60b848STomohiro Kusumi  * Cluster operations can be broken down into three pieces:
59*2d60b848STomohiro Kusumi  *
60*2d60b848STomohiro Kusumi  * (1) Chain locking and data retrieval.
61*2d60b848STomohiro Kusumi  *
62*2d60b848STomohiro Kusumi  *	- Most complex functions, quorum management on transaction ids.
63*2d60b848STomohiro Kusumi  *
64*2d60b848STomohiro Kusumi  *	- Locking and data accesses must be internally asynchronous.
65*2d60b848STomohiro Kusumi  *
66*2d60b848STomohiro Kusumi  *	- Validate and manage cache coherency primitives (cache state
67*2d60b848STomohiro Kusumi  *	  is stored in chain topologies but must be validated by these
68*2d60b848STomohiro Kusumi  *	  functions).
69*2d60b848STomohiro Kusumi  *
70*2d60b848STomohiro Kusumi  * (2) Lookups and Scans
71*2d60b848STomohiro Kusumi  *		hammer2_cluster_lookup()
72*2d60b848STomohiro Kusumi  *		hammer2_cluster_next()
73*2d60b848STomohiro Kusumi  *
74*2d60b848STomohiro Kusumi  *	- Depend on locking & data retrieval functions, but still complex.
75*2d60b848STomohiro Kusumi  *
76*2d60b848STomohiro Kusumi  *	- Must do quorum management on transaction ids.
77*2d60b848STomohiro Kusumi  *
78*2d60b848STomohiro Kusumi  *	- Lookup and Iteration ops Must be internally asynchronous.
79*2d60b848STomohiro Kusumi  *
80*2d60b848STomohiro Kusumi  * (3) Modifying Operations
81*2d60b848STomohiro Kusumi  *		hammer2_cluster_create()
82*2d60b848STomohiro Kusumi  *
83*2d60b848STomohiro Kusumi  *	- Can usually punt on failures, operation continues unless quorum
84*2d60b848STomohiro Kusumi  *	  is lost.  If quorum is lost, must wait for resynchronization
85*2d60b848STomohiro Kusumi  *	  (depending on the management mode).
86*2d60b848STomohiro Kusumi  *
87*2d60b848STomohiro Kusumi  *	- Must disconnect node on failures (also not flush), remount, and
88*2d60b848STomohiro Kusumi  *	  resynchronize.
89*2d60b848STomohiro Kusumi  *
 *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
 *	  with (the protocol is at a higher level than block-level).
93*2d60b848STomohiro Kusumi  *
94*2d60b848STomohiro Kusumi  *	- Multiple local disk nodes (i.e. block devices) are another matter.
95*2d60b848STomohiro Kusumi  *	  Chain operations have to be dispatched to per-node threads (xN)
96*2d60b848STomohiro Kusumi  *	  because we can't asynchronize potentially very complex chain
97*2d60b848STomohiro Kusumi  *	  operations in hammer2_chain.c (it would be a huge mess).
98*2d60b848STomohiro Kusumi  *
99*2d60b848STomohiro Kusumi  *	  (these threads are also used to terminate incoming kdmsg ops from
100*2d60b848STomohiro Kusumi  *	  other machines).
101*2d60b848STomohiro Kusumi  *
102*2d60b848STomohiro Kusumi  *	- Single-node filesystems do not use threads and will simply call
103*2d60b848STomohiro Kusumi  *	  hammer2_chain.c functions directly.  This short-cut is handled
104*2d60b848STomohiro Kusumi  *	  at the base of each cluster function.
105*2d60b848STomohiro Kusumi  */
106*2d60b848STomohiro Kusumi /*
107*2d60b848STomohiro Kusumi #include <sys/cdefs.h>
108*2d60b848STomohiro Kusumi #include <sys/param.h>
109*2d60b848STomohiro Kusumi #include <sys/systm.h>
110*2d60b848STomohiro Kusumi #include <sys/types.h>
111*2d60b848STomohiro Kusumi */
112*2d60b848STomohiro Kusumi 
113*2d60b848STomohiro Kusumi #include "hammer2.h"
114*2d60b848STomohiro Kusumi 
115*2d60b848STomohiro Kusumi /*
116*2d60b848STomohiro Kusumi  * Returns the bref type of the cluster's foucs.
117*2d60b848STomohiro Kusumi  *
118*2d60b848STomohiro Kusumi  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
119*2d60b848STomohiro Kusumi  * The cluster must be locked.
120*2d60b848STomohiro Kusumi  */
121*2d60b848STomohiro Kusumi uint8_t
hammer2_cluster_type(hammer2_cluster_t * cluster)122*2d60b848STomohiro Kusumi hammer2_cluster_type(hammer2_cluster_t *cluster)
123*2d60b848STomohiro Kusumi {
124*2d60b848STomohiro Kusumi 	if (cluster->error == 0) {
125*2d60b848STomohiro Kusumi 		KKASSERT(cluster->focus != NULL);
126*2d60b848STomohiro Kusumi 		return(cluster->focus->bref.type);
127*2d60b848STomohiro Kusumi 	}
128*2d60b848STomohiro Kusumi 	return 0;
129*2d60b848STomohiro Kusumi }
130*2d60b848STomohiro Kusumi 
131*2d60b848STomohiro Kusumi /*
132*2d60b848STomohiro Kusumi  * Returns the bref of the cluster's focus, sans any data-offset information
133*2d60b848STomohiro Kusumi  * (since offset information is per-node and wouldn't be useful).
134*2d60b848STomohiro Kusumi  *
135*2d60b848STomohiro Kusumi  * Callers use this function to access modify_tid, mirror_tid, type,
136*2d60b848STomohiro Kusumi  * key, and keybits.
137*2d60b848STomohiro Kusumi  *
138*2d60b848STomohiro Kusumi  * If the cluster is errored, returns an empty bref.
139*2d60b848STomohiro Kusumi  * The cluster must be locked.
140*2d60b848STomohiro Kusumi  */
141*2d60b848STomohiro Kusumi void
hammer2_cluster_bref(hammer2_cluster_t * cluster,hammer2_blockref_t * bref)142*2d60b848STomohiro Kusumi hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
143*2d60b848STomohiro Kusumi {
144*2d60b848STomohiro Kusumi 	if (cluster->error == 0) {
145*2d60b848STomohiro Kusumi 		KKASSERT(cluster->focus != NULL);
146*2d60b848STomohiro Kusumi 		*bref = cluster->focus->bref;
147*2d60b848STomohiro Kusumi 		bref->data_off = 0;
148*2d60b848STomohiro Kusumi 	} else {
149*2d60b848STomohiro Kusumi 		bzero(bref, sizeof(*bref));
150*2d60b848STomohiro Kusumi 	}
151*2d60b848STomohiro Kusumi }
152*2d60b848STomohiro Kusumi 
153*2d60b848STomohiro Kusumi /*
154*2d60b848STomohiro Kusumi  * Create a degenerate cluster with one ref from a single locked chain.
155*2d60b848STomohiro Kusumi  * The returned cluster will be focused on the chain and inherit its
156*2d60b848STomohiro Kusumi  * error state.
157*2d60b848STomohiro Kusumi  *
158*2d60b848STomohiro Kusumi  * The chain's lock and reference are transfered to the new cluster, so
159*2d60b848STomohiro Kusumi  * the caller should not try to unlock the chain separately.
160*2d60b848STomohiro Kusumi  *
161*2d60b848STomohiro Kusumi  * We fake the flags.
162*2d60b848STomohiro Kusumi  */
163*2d60b848STomohiro Kusumi void
hammer2_dummy_xop_from_chain(hammer2_xop_head_t * xop,hammer2_chain_t * chain)164*2d60b848STomohiro Kusumi hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
165*2d60b848STomohiro Kusumi {
166*2d60b848STomohiro Kusumi 	hammer2_cluster_t *cluster;
167*2d60b848STomohiro Kusumi 
168*2d60b848STomohiro Kusumi 	bzero(xop, sizeof(*xop));
169*2d60b848STomohiro Kusumi 
170*2d60b848STomohiro Kusumi 	cluster = &xop->cluster;
171*2d60b848STomohiro Kusumi 	cluster->array[0].chain = chain;
172*2d60b848STomohiro Kusumi 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
173*2d60b848STomohiro Kusumi 	cluster->nchains = 1;
174*2d60b848STomohiro Kusumi 	cluster->focus = chain;
175*2d60b848STomohiro Kusumi 	cluster->focus_index = 0;
176*2d60b848STomohiro Kusumi 	cluster->pmp = chain->pmp;
177*2d60b848STomohiro Kusumi 	cluster->refs = 1;
178*2d60b848STomohiro Kusumi 	cluster->error = chain->error;
179*2d60b848STomohiro Kusumi 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
180*2d60b848STomohiro Kusumi 			 HAMMER2_CLUSTER_WRHARD |
181*2d60b848STomohiro Kusumi 			 HAMMER2_CLUSTER_RDHARD |
182*2d60b848STomohiro Kusumi 			 HAMMER2_CLUSTER_MSYNCED |
183*2d60b848STomohiro Kusumi 			 HAMMER2_CLUSTER_SSYNCED;
184*2d60b848STomohiro Kusumi }
185*2d60b848STomohiro Kusumi 
186*2d60b848STomohiro Kusumi /*
187*2d60b848STomohiro Kusumi  * Add a reference to a cluster and its underlying chains.
188*2d60b848STomohiro Kusumi  *
189*2d60b848STomohiro Kusumi  * We must also ref the underlying chains in order to allow ref/unlock
190*2d60b848STomohiro Kusumi  * sequences to later re-lock.
191*2d60b848STomohiro Kusumi  */
192*2d60b848STomohiro Kusumi void
hammer2_cluster_ref(hammer2_cluster_t * cluster)193*2d60b848STomohiro Kusumi hammer2_cluster_ref(hammer2_cluster_t *cluster)
194*2d60b848STomohiro Kusumi {
195*2d60b848STomohiro Kusumi 	atomic_add_int(&cluster->refs, 1);
196*2d60b848STomohiro Kusumi }
197*2d60b848STomohiro Kusumi 
198*2d60b848STomohiro Kusumi /*
199*2d60b848STomohiro Kusumi  * Drop the caller's reference to the cluster.  When the ref count drops to
200*2d60b848STomohiro Kusumi  * zero this function frees the cluster and drops all underlying chains.
201*2d60b848STomohiro Kusumi  *
202*2d60b848STomohiro Kusumi  * In-progress read I/Os are typically detached from the cluster once the
203*2d60b848STomohiro Kusumi  * first one returns (the remaining stay attached to the DIOs but are then
204*2d60b848STomohiro Kusumi  * ignored and drop naturally).
205*2d60b848STomohiro Kusumi  */
206*2d60b848STomohiro Kusumi void
hammer2_cluster_drop(hammer2_cluster_t * cluster)207*2d60b848STomohiro Kusumi hammer2_cluster_drop(hammer2_cluster_t *cluster)
208*2d60b848STomohiro Kusumi {
209*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
210*2d60b848STomohiro Kusumi 	int i;
211*2d60b848STomohiro Kusumi 
212*2d60b848STomohiro Kusumi 	KKASSERT(cluster->refs > 0);
213*2d60b848STomohiro Kusumi 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
214*2d60b848STomohiro Kusumi 		cluster->focus = NULL;		/* safety XXX chg to assert */
215*2d60b848STomohiro Kusumi 		cluster->focus_index = 0;
216*2d60b848STomohiro Kusumi 
217*2d60b848STomohiro Kusumi 		for (i = 0; i < cluster->nchains; ++i) {
218*2d60b848STomohiro Kusumi 			chain = cluster->array[i].chain;
219*2d60b848STomohiro Kusumi 			if (chain) {
220*2d60b848STomohiro Kusumi 				hammer2_chain_drop(chain);
221*2d60b848STomohiro Kusumi 				cluster->array[i].chain = NULL; /* safety */
222*2d60b848STomohiro Kusumi 			}
223*2d60b848STomohiro Kusumi 		}
224*2d60b848STomohiro Kusumi 		cluster->nchains = 0;				/* safety */
225*2d60b848STomohiro Kusumi 
226*2d60b848STomohiro Kusumi 		kfree(cluster, M_HAMMER2);
227*2d60b848STomohiro Kusumi 		/* cluster is invalid */
228*2d60b848STomohiro Kusumi 	}
229*2d60b848STomohiro Kusumi }
230*2d60b848STomohiro Kusumi 
231*2d60b848STomohiro Kusumi /*
232*2d60b848STomohiro Kusumi  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
233*2d60b848STomohiro Kusumi  *
234*2d60b848STomohiro Kusumi  * WARNING! This function expects the caller to handle resolution of the
235*2d60b848STomohiro Kusumi  *	    cluster.  We never re-resolve the cluster in this function,
236*2d60b848STomohiro Kusumi  *	    because it might be used to temporarily unlock/relock a cparent
237*2d60b848STomohiro Kusumi  *	    in an iteration or recursrion, and the cparents elements do not
238*2d60b848STomohiro Kusumi  *	    necessarily match.
239*2d60b848STomohiro Kusumi  */
240*2d60b848STomohiro Kusumi void
hammer2_cluster_lock(hammer2_cluster_t * cluster,int how)241*2d60b848STomohiro Kusumi hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
242*2d60b848STomohiro Kusumi {
243*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
244*2d60b848STomohiro Kusumi 	int i;
245*2d60b848STomohiro Kusumi 
246*2d60b848STomohiro Kusumi 	/* cannot be on inode-embedded cluster template, must be on copy */
247*2d60b848STomohiro Kusumi 	KKASSERT(cluster->refs > 0);
248*2d60b848STomohiro Kusumi 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
249*2d60b848STomohiro Kusumi 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
250*2d60b848STomohiro Kusumi 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
251*2d60b848STomohiro Kusumi 			cluster);
252*2d60b848STomohiro Kusumi 	}
253*2d60b848STomohiro Kusumi 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
254*2d60b848STomohiro Kusumi 
255*2d60b848STomohiro Kusumi 	/*
256*2d60b848STomohiro Kusumi 	 * Lock chains and resolve state.
257*2d60b848STomohiro Kusumi 	 */
258*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
259*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
260*2d60b848STomohiro Kusumi 		if (chain == NULL)
261*2d60b848STomohiro Kusumi 			continue;
262*2d60b848STomohiro Kusumi 		hammer2_chain_lock(chain, how);
263*2d60b848STomohiro Kusumi 	}
264*2d60b848STomohiro Kusumi }
265*2d60b848STomohiro Kusumi 
266*2d60b848STomohiro Kusumi void
hammer2_cluster_unhold(hammer2_cluster_t * cluster)267*2d60b848STomohiro Kusumi hammer2_cluster_unhold(hammer2_cluster_t *cluster)
268*2d60b848STomohiro Kusumi {
269*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
270*2d60b848STomohiro Kusumi 	int i;
271*2d60b848STomohiro Kusumi 
272*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
273*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
274*2d60b848STomohiro Kusumi 		if (chain == NULL)
275*2d60b848STomohiro Kusumi 			continue;
276*2d60b848STomohiro Kusumi 		hammer2_chain_unhold(chain);
277*2d60b848STomohiro Kusumi 	}
278*2d60b848STomohiro Kusumi }
279*2d60b848STomohiro Kusumi 
280*2d60b848STomohiro Kusumi void
hammer2_cluster_rehold(hammer2_cluster_t * cluster)281*2d60b848STomohiro Kusumi hammer2_cluster_rehold(hammer2_cluster_t *cluster)
282*2d60b848STomohiro Kusumi {
283*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
284*2d60b848STomohiro Kusumi 	int i;
285*2d60b848STomohiro Kusumi 
286*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
287*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
288*2d60b848STomohiro Kusumi 		if (chain == NULL)
289*2d60b848STomohiro Kusumi 			continue;
290*2d60b848STomohiro Kusumi 		hammer2_chain_rehold(chain);
291*2d60b848STomohiro Kusumi 	}
292*2d60b848STomohiro Kusumi }
293*2d60b848STomohiro Kusumi 
294*2d60b848STomohiro Kusumi /*
295*2d60b848STomohiro Kusumi  * This is used by the XOPS subsystem to calculate the state of
296*2d60b848STomohiro Kusumi  * the collection and tell hammer2_xop_collect() what to do with it.
297*2d60b848STomohiro Kusumi  * The collection can be in various states of desynchronization, the
298*2d60b848STomohiro Kusumi  * caller specifically wants to resolve the passed-in key.
299*2d60b848STomohiro Kusumi  *
300*2d60b848STomohiro Kusumi  * Return values (HAMMER2_ERROR_*):
301*2d60b848STomohiro Kusumi  *
302*2d60b848STomohiro Kusumi  *	0		- Quorum agreement, key is valid
303*2d60b848STomohiro Kusumi  *
304*2d60b848STomohiro Kusumi  *	ENOENT		- Quorum agreement, end of scan
305*2d60b848STomohiro Kusumi  *
306*2d60b848STomohiro Kusumi  *	ESRCH		- Quorum agreement, key is INVALID (caller should
307*2d60b848STomohiro Kusumi  *			  skip key).
308*2d60b848STomohiro Kusumi  *
309*2d60b848STomohiro Kusumi  *	EIO		- Quorum agreement but all elements had errors.
310*2d60b848STomohiro Kusumi  *
311*2d60b848STomohiro Kusumi  *	EDEADLK		- No quorum agreement possible for key, a repair
312*2d60b848STomohiro Kusumi  *			  may be needed.  Caller has to decide what to do,
313*2d60b848STomohiro Kusumi  *			  possibly iterating the key or generating an EIO.
314*2d60b848STomohiro Kusumi  *
315*2d60b848STomohiro Kusumi  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
316*2d60b848STomohiro Kusumi  *			  possible if caller waits for more responses.  Caller
317*2d60b848STomohiro Kusumi  *			  should not iterate key.
318*2d60b848STomohiro Kusumi  *
319*2d60b848STomohiro Kusumi  *	CHECK		- CRC check error
320*2d60b848STomohiro Kusumi  *
321*2d60b848STomohiro Kusumi  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
322*2d60b848STomohiro Kusumi  *
323*2d60b848STomohiro Kusumi  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
324*2d60b848STomohiro Kusumi  */
325*2d60b848STomohiro Kusumi int
hammer2_cluster_check(hammer2_cluster_t * cluster,hammer2_key_t key,int flags)326*2d60b848STomohiro Kusumi hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
327*2d60b848STomohiro Kusumi {
328*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
329*2d60b848STomohiro Kusumi 	hammer2_chain_t *focus;
330*2d60b848STomohiro Kusumi 	hammer2_pfs_t *pmp;
331*2d60b848STomohiro Kusumi 	hammer2_tid_t quorum_tid;
332*2d60b848STomohiro Kusumi 	hammer2_tid_t last_best_quorum_tid;
333*2d60b848STomohiro Kusumi 	uint32_t nflags;
334*2d60b848STomohiro Kusumi 	int ttlmasters;
335*2d60b848STomohiro Kusumi 	int ttlslaves;
336*2d60b848STomohiro Kusumi 	int nmasters;
337*2d60b848STomohiro Kusumi 	int nmasters_keymatch;
338*2d60b848STomohiro Kusumi 	int nslaves;
339*2d60b848STomohiro Kusumi 	int nquorum;
340*2d60b848STomohiro Kusumi 	int umasters;	/* unknown masters (still in progress) */
341*2d60b848STomohiro Kusumi 	int error;
342*2d60b848STomohiro Kusumi 	int i;
343*2d60b848STomohiro Kusumi 
344*2d60b848STomohiro Kusumi 	cluster->error = 0;
345*2d60b848STomohiro Kusumi 	cluster->focus = NULL;
346*2d60b848STomohiro Kusumi 
347*2d60b848STomohiro Kusumi 	pmp = cluster->pmp;
348*2d60b848STomohiro Kusumi 	KKASSERT(pmp != NULL || cluster->nchains == 0);
349*2d60b848STomohiro Kusumi 
350*2d60b848STomohiro Kusumi 	/*
351*2d60b848STomohiro Kusumi 	 * Calculate quorum
352*2d60b848STomohiro Kusumi 	 */
353*2d60b848STomohiro Kusumi 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
354*2d60b848STomohiro Kusumi 	nflags = 0;
355*2d60b848STomohiro Kusumi 	ttlmasters = 0;
356*2d60b848STomohiro Kusumi 	ttlslaves = 0;
357*2d60b848STomohiro Kusumi 
358*2d60b848STomohiro Kusumi 	/*
359*2d60b848STomohiro Kusumi 	 * Pass 1
360*2d60b848STomohiro Kusumi 	 *
361*2d60b848STomohiro Kusumi 	 * NOTE: A NULL chain is not necessarily an error, it could be
362*2d60b848STomohiro Kusumi 	 *	 e.g. a lookup failure or the end of an iteration.
363*2d60b848STomohiro Kusumi 	 *	 Process normally.
364*2d60b848STomohiro Kusumi 	 */
365*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
366*2d60b848STomohiro Kusumi 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
367*2d60b848STomohiro Kusumi 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
368*2d60b848STomohiro Kusumi 
369*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
370*2d60b848STomohiro Kusumi 		error = cluster->array[i].error;
371*2d60b848STomohiro Kusumi 		if (chain && error) {
372*2d60b848STomohiro Kusumi 			if (cluster->focus == NULL || cluster->focus == chain) {
373*2d60b848STomohiro Kusumi 				/* error will be overridden by valid focus */
374*2d60b848STomohiro Kusumi 				/* XXX */
375*2d60b848STomohiro Kusumi 			}
376*2d60b848STomohiro Kusumi 
377*2d60b848STomohiro Kusumi 			/*
378*2d60b848STomohiro Kusumi 			 * Must count total masters and slaves whether the
379*2d60b848STomohiro Kusumi 			 * chain is errored or not.
380*2d60b848STomohiro Kusumi 			 */
381*2d60b848STomohiro Kusumi 			switch (cluster->pmp->pfs_types[i]) {
382*2d60b848STomohiro Kusumi 			case HAMMER2_PFSTYPE_SUPROOT:
383*2d60b848STomohiro Kusumi 			case HAMMER2_PFSTYPE_MASTER:
384*2d60b848STomohiro Kusumi 				++ttlmasters;
385*2d60b848STomohiro Kusumi 				break;
386*2d60b848STomohiro Kusumi 			case HAMMER2_PFSTYPE_SLAVE:
387*2d60b848STomohiro Kusumi 				++ttlslaves;
388*2d60b848STomohiro Kusumi 				break;
389*2d60b848STomohiro Kusumi 			}
390*2d60b848STomohiro Kusumi 			continue;
391*2d60b848STomohiro Kusumi 		}
392*2d60b848STomohiro Kusumi 		switch (cluster->pmp->pfs_types[i]) {
393*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_MASTER:
394*2d60b848STomohiro Kusumi 			++ttlmasters;
395*2d60b848STomohiro Kusumi 			break;
396*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SLAVE:
397*2d60b848STomohiro Kusumi 			++ttlslaves;
398*2d60b848STomohiro Kusumi 			break;
399*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SOFT_MASTER:
400*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_WRSOFT;
401*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDSOFT;
402*2d60b848STomohiro Kusumi 			break;
403*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
404*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDSOFT;
405*2d60b848STomohiro Kusumi 			break;
406*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SUPROOT:
407*2d60b848STomohiro Kusumi 			/*
408*2d60b848STomohiro Kusumi 			 * Degenerate cluster representing the super-root
409*2d60b848STomohiro Kusumi 			 * topology on a single device.  Fake stuff so
410*2d60b848STomohiro Kusumi 			 * cluster ops work as expected.
411*2d60b848STomohiro Kusumi 			 */
412*2d60b848STomohiro Kusumi 			++ttlmasters;
413*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_WRHARD;
414*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDHARD;
415*2d60b848STomohiro Kusumi 			cluster->focus_index = i;
416*2d60b848STomohiro Kusumi 			cluster->focus = chain;
417*2d60b848STomohiro Kusumi 			cluster->error = error;
418*2d60b848STomohiro Kusumi 			break;
419*2d60b848STomohiro Kusumi 		default:
420*2d60b848STomohiro Kusumi 			break;
421*2d60b848STomohiro Kusumi 		}
422*2d60b848STomohiro Kusumi 	}
423*2d60b848STomohiro Kusumi 
424*2d60b848STomohiro Kusumi 	/*
425*2d60b848STomohiro Kusumi 	 * Pass 2
426*2d60b848STomohiro Kusumi 	 *
427*2d60b848STomohiro Kusumi 	 * Resolve nmasters		- master nodes fully match
428*2d60b848STomohiro Kusumi 	 *
429*2d60b848STomohiro Kusumi 	 * Resolve umasters		- master nodes operation still
430*2d60b848STomohiro Kusumi 	 *				  in progress
431*2d60b848STomohiro Kusumi 	 *
432*2d60b848STomohiro Kusumi 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
433*2d60b848STomohiro Kusumi 	 *				  key and may or may not match
434*2d60b848STomohiro Kusumi 	 *				  the quorum-agreed tid.
435*2d60b848STomohiro Kusumi 	 *
436*2d60b848STomohiro Kusumi 	 * The quorum-agreed TID is the highest matching TID.
437*2d60b848STomohiro Kusumi 	 */
438*2d60b848STomohiro Kusumi 	last_best_quorum_tid = HAMMER2_TID_MAX;
439*2d60b848STomohiro Kusumi 	umasters = 0;
440*2d60b848STomohiro Kusumi 	nmasters = 0;
441*2d60b848STomohiro Kusumi 	nmasters_keymatch = 0;
442*2d60b848STomohiro Kusumi 	quorum_tid = 0;		/* fix gcc warning */
443*2d60b848STomohiro Kusumi 
444*2d60b848STomohiro Kusumi 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
445*2d60b848STomohiro Kusumi 		umasters = 0;
446*2d60b848STomohiro Kusumi 		nmasters = 0;
447*2d60b848STomohiro Kusumi 		nmasters_keymatch = 0;
448*2d60b848STomohiro Kusumi 		quorum_tid = 0;
449*2d60b848STomohiro Kusumi 
450*2d60b848STomohiro Kusumi 		for (i = 0; i < cluster->nchains; ++i) {
451*2d60b848STomohiro Kusumi 			/* XXX SOFT smpresent handling */
452*2d60b848STomohiro Kusumi 			switch(cluster->pmp->pfs_types[i]) {
453*2d60b848STomohiro Kusumi 			case HAMMER2_PFSTYPE_MASTER:
454*2d60b848STomohiro Kusumi 			case HAMMER2_PFSTYPE_SUPROOT:
455*2d60b848STomohiro Kusumi 				break;
456*2d60b848STomohiro Kusumi 			default:
457*2d60b848STomohiro Kusumi 				continue;
458*2d60b848STomohiro Kusumi 			}
459*2d60b848STomohiro Kusumi 
460*2d60b848STomohiro Kusumi 			chain = cluster->array[i].chain;
461*2d60b848STomohiro Kusumi 			error = cluster->array[i].error;
462*2d60b848STomohiro Kusumi 
463*2d60b848STomohiro Kusumi 			/*
464*2d60b848STomohiro Kusumi 			 * Skip elements still in progress.  umasters keeps
465*2d60b848STomohiro Kusumi 			 * track of masters that might still be in-progress.
466*2d60b848STomohiro Kusumi 			 */
467*2d60b848STomohiro Kusumi 			if (chain == NULL && (cluster->array[i].flags &
468*2d60b848STomohiro Kusumi 					      HAMMER2_CITEM_NULL) == 0) {
469*2d60b848STomohiro Kusumi 				++umasters;
470*2d60b848STomohiro Kusumi 				continue;
471*2d60b848STomohiro Kusumi 			}
472*2d60b848STomohiro Kusumi 
473*2d60b848STomohiro Kusumi 			/*
474*2d60b848STomohiro Kusumi 			 * Key match?
475*2d60b848STomohiro Kusumi 			 */
476*2d60b848STomohiro Kusumi 			if (flags & HAMMER2_CHECK_NULL) {
477*2d60b848STomohiro Kusumi 				if (chain == NULL) {
478*2d60b848STomohiro Kusumi 					++nmasters;
479*2d60b848STomohiro Kusumi 					++nmasters_keymatch;
480*2d60b848STomohiro Kusumi 					if (cluster->error == 0)
481*2d60b848STomohiro Kusumi 						cluster->error = error;
482*2d60b848STomohiro Kusumi 				}
483*2d60b848STomohiro Kusumi 			} else if (chain &&
484*2d60b848STomohiro Kusumi 				   (key == (hammer2_key_t)-1 ||
485*2d60b848STomohiro Kusumi 				    chain->bref.key == key)) {
486*2d60b848STomohiro Kusumi 				++nmasters_keymatch;
487*2d60b848STomohiro Kusumi 
488*2d60b848STomohiro Kusumi 				if (chain->bref.modify_tid <
489*2d60b848STomohiro Kusumi 				     last_best_quorum_tid &&
490*2d60b848STomohiro Kusumi 				    quorum_tid < chain->bref.modify_tid) {
491*2d60b848STomohiro Kusumi 					/*
492*2d60b848STomohiro Kusumi 					 * Select new TID as master if better
493*2d60b848STomohiro Kusumi 					 * than any found so far in this loop,
494*2d60b848STomohiro Kusumi 					 * as long as it does not reach the
495*2d60b848STomohiro Kusumi 					 * best tid found in the previous loop.
496*2d60b848STomohiro Kusumi 					 */
497*2d60b848STomohiro Kusumi 					nmasters = 0;
498*2d60b848STomohiro Kusumi 					quorum_tid = chain->bref.modify_tid;
499*2d60b848STomohiro Kusumi 				}
500*2d60b848STomohiro Kusumi 				if (quorum_tid == chain->bref.modify_tid) {
501*2d60b848STomohiro Kusumi 					/*
502*2d60b848STomohiro Kusumi 					 * TID matches current collection.
503*2d60b848STomohiro Kusumi 					 *
504*2d60b848STomohiro Kusumi 					 * (error handled in next pass)
505*2d60b848STomohiro Kusumi 					 */
506*2d60b848STomohiro Kusumi 					++nmasters;
507*2d60b848STomohiro Kusumi 					if (chain->error == 0) {
508*2d60b848STomohiro Kusumi 						cluster->focus = chain;
509*2d60b848STomohiro Kusumi 						cluster->focus_index = i;
510*2d60b848STomohiro Kusumi 					}
511*2d60b848STomohiro Kusumi 				}
512*2d60b848STomohiro Kusumi 			}
513*2d60b848STomohiro Kusumi 		}
514*2d60b848STomohiro Kusumi 		if (nmasters >= nquorum)
515*2d60b848STomohiro Kusumi 			break;
516*2d60b848STomohiro Kusumi 		last_best_quorum_tid = quorum_tid;
517*2d60b848STomohiro Kusumi 	}
518*2d60b848STomohiro Kusumi 
519*2d60b848STomohiro Kusumi 	/*
520*2d60b848STomohiro Kusumi 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
521*2d60b848STomohiro Kusumi 		nmasters, nquorum, nmasters_keymatch, umasters);
522*2d60b848STomohiro Kusumi 	*/
523*2d60b848STomohiro Kusumi 
524*2d60b848STomohiro Kusumi 	/*
525*2d60b848STomohiro Kusumi 	 * Early return if we do not have enough masters.
526*2d60b848STomohiro Kusumi 	 */
527*2d60b848STomohiro Kusumi 	if (nmasters < nquorum) {
528*2d60b848STomohiro Kusumi 		if (nmasters + umasters >= nquorum)
529*2d60b848STomohiro Kusumi 			return HAMMER2_ERROR_EINPROGRESS;
530*2d60b848STomohiro Kusumi 		if (nmasters_keymatch < nquorum)
531*2d60b848STomohiro Kusumi 			return HAMMER2_ERROR_ESRCH;
532*2d60b848STomohiro Kusumi 		return HAMMER2_ERROR_EDEADLK;
533*2d60b848STomohiro Kusumi 	}
534*2d60b848STomohiro Kusumi 
535*2d60b848STomohiro Kusumi 	/*
536*2d60b848STomohiro Kusumi 	 * Validated end of scan.
537*2d60b848STomohiro Kusumi 	 */
538*2d60b848STomohiro Kusumi 	if (flags & HAMMER2_CHECK_NULL) {
539*2d60b848STomohiro Kusumi 		if (cluster->error == 0)
540*2d60b848STomohiro Kusumi 			cluster->error = HAMMER2_ERROR_ENOENT;
541*2d60b848STomohiro Kusumi 		return cluster->error;
542*2d60b848STomohiro Kusumi 	}
543*2d60b848STomohiro Kusumi 
544*2d60b848STomohiro Kusumi 	/*
545*2d60b848STomohiro Kusumi 	 * If we have a NULL focus at this point the agreeing quorum all
546*2d60b848STomohiro Kusumi 	 * had chain errors.
547*2d60b848STomohiro Kusumi 	 */
548*2d60b848STomohiro Kusumi 	if (cluster->focus == NULL)
549*2d60b848STomohiro Kusumi 		return HAMMER2_ERROR_EIO;
550*2d60b848STomohiro Kusumi 
551*2d60b848STomohiro Kusumi 	/*
552*2d60b848STomohiro Kusumi 	 * Pass 3
553*2d60b848STomohiro Kusumi 	 *
554*2d60b848STomohiro Kusumi 	 * We have quorum agreement, validate elements, not end of scan.
555*2d60b848STomohiro Kusumi 	 */
556*2d60b848STomohiro Kusumi 	nslaves = 0;
557*2d60b848STomohiro Kusumi 	cluster->error = 0;
558*2d60b848STomohiro Kusumi 
559*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
560*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
561*2d60b848STomohiro Kusumi 		error = cluster->array[i].error;
562*2d60b848STomohiro Kusumi 		if (chain == NULL ||
563*2d60b848STomohiro Kusumi 		    chain->bref.key != key ||
564*2d60b848STomohiro Kusumi 		    chain->bref.modify_tid != quorum_tid) {
565*2d60b848STomohiro Kusumi 			continue;
566*2d60b848STomohiro Kusumi 		}
567*2d60b848STomohiro Kusumi 
568*2d60b848STomohiro Kusumi 		/*
569*2d60b848STomohiro Kusumi 		 * Quorum Match
570*2d60b848STomohiro Kusumi 		 *
571*2d60b848STomohiro Kusumi 		 * XXX for now, cumulative error.
572*2d60b848STomohiro Kusumi 		 */
573*2d60b848STomohiro Kusumi 		if (cluster->error == 0)
574*2d60b848STomohiro Kusumi 			cluster->error = error;
575*2d60b848STomohiro Kusumi 
576*2d60b848STomohiro Kusumi 		switch (cluster->pmp->pfs_types[i]) {
577*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_MASTER:
578*2d60b848STomohiro Kusumi 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
579*2d60b848STomohiro Kusumi 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
580*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_WRHARD;
581*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDHARD;
582*2d60b848STomohiro Kusumi 			break;
583*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SLAVE:
584*2d60b848STomohiro Kusumi 			/*
585*2d60b848STomohiro Kusumi 			 * We must have enough up-to-date masters to reach
586*2d60b848STomohiro Kusumi 			 * a quorum and the slave modify_tid must match the
587*2d60b848STomohiro Kusumi 			 * quorum's modify_tid.
588*2d60b848STomohiro Kusumi 			 *
589*2d60b848STomohiro Kusumi 			 * Do not select an errored slave.
590*2d60b848STomohiro Kusumi 			 */
591*2d60b848STomohiro Kusumi 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
592*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDHARD;
593*2d60b848STomohiro Kusumi 			++nslaves;
594*2d60b848STomohiro Kusumi 			break;
595*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SOFT_MASTER:
596*2d60b848STomohiro Kusumi 			/*
597*2d60b848STomohiro Kusumi 			 * Directly mounted soft master always wins.  There
598*2d60b848STomohiro Kusumi 			 * should be only one.
599*2d60b848STomohiro Kusumi 			 */
600*2d60b848STomohiro Kusumi 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
601*2d60b848STomohiro Kusumi 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
602*2d60b848STomohiro Kusumi 			break;
603*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
604*2d60b848STomohiro Kusumi 			/*
605*2d60b848STomohiro Kusumi 			 * Directly mounted soft slave always wins.  There
606*2d60b848STomohiro Kusumi 			 * should be only one.
607*2d60b848STomohiro Kusumi 			 *
608*2d60b848STomohiro Kusumi 			 * XXX
609*2d60b848STomohiro Kusumi 			 */
610*2d60b848STomohiro Kusumi 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
611*2d60b848STomohiro Kusumi 			break;
612*2d60b848STomohiro Kusumi 		case HAMMER2_PFSTYPE_SUPROOT:
613*2d60b848STomohiro Kusumi 			/*
614*2d60b848STomohiro Kusumi 			 * spmp (degenerate case)
615*2d60b848STomohiro Kusumi 			 */
616*2d60b848STomohiro Kusumi 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
617*2d60b848STomohiro Kusumi 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
618*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_WRHARD;
619*2d60b848STomohiro Kusumi 			nflags |= HAMMER2_CLUSTER_RDHARD;
620*2d60b848STomohiro Kusumi 			break;
621*2d60b848STomohiro Kusumi 		default:
622*2d60b848STomohiro Kusumi 			break;
623*2d60b848STomohiro Kusumi 		}
624*2d60b848STomohiro Kusumi 	}
625*2d60b848STomohiro Kusumi 
626*2d60b848STomohiro Kusumi 	/*
627*2d60b848STomohiro Kusumi 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
628*2d60b848STomohiro Kusumi 	 * is bad or if we are at the PFS root (the bref won't match at
629*2d60b848STomohiro Kusumi 	 * the PFS root, obviously).
630*2d60b848STomohiro Kusumi 	 *
631*2d60b848STomohiro Kusumi 	 * focus is probably not locked and it isn't safe to test its
632*2d60b848STomohiro Kusumi 	 * content (e.g. focus->data, focus->dio, other content).  We
633*2d60b848STomohiro Kusumi 	 * do not synchronize the dio to the cpu here.  In fact, in numerous
634*2d60b848STomohiro Kusumi 	 * situations the frontend doesn't even need to access its dio/data,
635*2d60b848STomohiro Kusumi 	 * so synchronizing it here would be wasteful.
636*2d60b848STomohiro Kusumi 	 */
637*2d60b848STomohiro Kusumi 	focus = cluster->focus;
638*2d60b848STomohiro Kusumi 	if (focus) {
639*2d60b848STomohiro Kusumi 		cluster->ddflag =
640*2d60b848STomohiro Kusumi 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
641*2d60b848STomohiro Kusumi 	} else {
642*2d60b848STomohiro Kusumi 		cluster->ddflag = 0;
643*2d60b848STomohiro Kusumi 		goto skip4;
644*2d60b848STomohiro Kusumi 	}
645*2d60b848STomohiro Kusumi 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
646*2d60b848STomohiro Kusumi 		goto skip4;
647*2d60b848STomohiro Kusumi 
648*2d60b848STomohiro Kusumi 	/*
649*2d60b848STomohiro Kusumi 	 * Pass 4
650*2d60b848STomohiro Kusumi 	 *
651*2d60b848STomohiro Kusumi 	 * Validate the elements that were not marked invalid.  They should
652*2d60b848STomohiro Kusumi 	 * match.
653*2d60b848STomohiro Kusumi 	 */
654*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
655*2d60b848STomohiro Kusumi 		int ddflag;
656*2d60b848STomohiro Kusumi 
657*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
658*2d60b848STomohiro Kusumi 
659*2d60b848STomohiro Kusumi 		if (chain == NULL)
660*2d60b848STomohiro Kusumi 			continue;
661*2d60b848STomohiro Kusumi 		if (chain == focus)
662*2d60b848STomohiro Kusumi 			continue;
663*2d60b848STomohiro Kusumi 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
664*2d60b848STomohiro Kusumi 			continue;
665*2d60b848STomohiro Kusumi 
666*2d60b848STomohiro Kusumi 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
667*2d60b848STomohiro Kusumi 		if (chain->bref.type != focus->bref.type ||
668*2d60b848STomohiro Kusumi 		    chain->bref.key != focus->bref.key ||
669*2d60b848STomohiro Kusumi 		    chain->bref.keybits != focus->bref.keybits ||
670*2d60b848STomohiro Kusumi 		    chain->bref.modify_tid != focus->bref.modify_tid ||
671*2d60b848STomohiro Kusumi 		    chain->bytes != focus->bytes ||
672*2d60b848STomohiro Kusumi 		    ddflag != cluster->ddflag) {
673*2d60b848STomohiro Kusumi 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
674*2d60b848STomohiro Kusumi 			if (hammer2_debug & 1)
675*2d60b848STomohiro Kusumi 			kprintf("cluster_check: matching modify_tid failed "
676*2d60b848STomohiro Kusumi 				"bref test: idx=%d type=%02x/%02x "
677*2d60b848STomohiro Kusumi 				"key=%016jx/%d-%016jx/%d "
678*2d60b848STomohiro Kusumi 				"mod=%016jx/%016jx bytes=%u/%u\n",
679*2d60b848STomohiro Kusumi 				i,
680*2d60b848STomohiro Kusumi 				chain->bref.type, focus->bref.type,
681*2d60b848STomohiro Kusumi 				chain->bref.key, chain->bref.keybits,
682*2d60b848STomohiro Kusumi 				focus->bref.key, focus->bref.keybits,
683*2d60b848STomohiro Kusumi 				chain->bref.modify_tid, focus->bref.modify_tid,
684*2d60b848STomohiro Kusumi 				chain->bytes, focus->bytes);
685*2d60b848STomohiro Kusumi 			if (hammer2_debug & 0x4000)
686*2d60b848STomohiro Kusumi 				panic("cluster_check");
687*2d60b848STomohiro Kusumi 			/* flag issue and force resync? */
688*2d60b848STomohiro Kusumi 		}
689*2d60b848STomohiro Kusumi 	}
690*2d60b848STomohiro Kusumi skip4:
691*2d60b848STomohiro Kusumi 
692*2d60b848STomohiro Kusumi 	if (ttlslaves == 0)
693*2d60b848STomohiro Kusumi 		nflags |= HAMMER2_CLUSTER_NOSOFT;
694*2d60b848STomohiro Kusumi 	if (ttlmasters == 0)
695*2d60b848STomohiro Kusumi 		nflags |= HAMMER2_CLUSTER_NOHARD;
696*2d60b848STomohiro Kusumi 
697*2d60b848STomohiro Kusumi 	/*
698*2d60b848STomohiro Kusumi 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
699*2d60b848STomohiro Kusumi 	 * all available nodes (even if 0 are available) are fully
700*2d60b848STomohiro Kusumi 	 * synchronized.  This is used by the synchronization thread to
701*2d60b848STomohiro Kusumi 	 * determine if there is work it could potentially accomplish.
702*2d60b848STomohiro Kusumi 	 */
703*2d60b848STomohiro Kusumi 	if (nslaves == ttlslaves)
704*2d60b848STomohiro Kusumi 		nflags |= HAMMER2_CLUSTER_SSYNCED;
705*2d60b848STomohiro Kusumi 	if (nmasters == ttlmasters)
706*2d60b848STomohiro Kusumi 		nflags |= HAMMER2_CLUSTER_MSYNCED;
707*2d60b848STomohiro Kusumi 
708*2d60b848STomohiro Kusumi 	/*
709*2d60b848STomohiro Kusumi 	 * Determine if the cluster was successfully locked for the
710*2d60b848STomohiro Kusumi 	 * requested operation and generate an error code.  The cluster
711*2d60b848STomohiro Kusumi 	 * will not be locked (or ref'd) if an error is returned.
712*2d60b848STomohiro Kusumi 	 */
713*2d60b848STomohiro Kusumi 	atomic_set_int(&cluster->flags, nflags);
714*2d60b848STomohiro Kusumi 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
715*2d60b848STomohiro Kusumi 
716*2d60b848STomohiro Kusumi 	return cluster->error;
717*2d60b848STomohiro Kusumi }
718*2d60b848STomohiro Kusumi 
719*2d60b848STomohiro Kusumi /*
720*2d60b848STomohiro Kusumi  * Unlock a cluster.  Refcount and focus is maintained.
721*2d60b848STomohiro Kusumi  */
722*2d60b848STomohiro Kusumi void
hammer2_cluster_unlock(hammer2_cluster_t * cluster)723*2d60b848STomohiro Kusumi hammer2_cluster_unlock(hammer2_cluster_t *cluster)
724*2d60b848STomohiro Kusumi {
725*2d60b848STomohiro Kusumi 	hammer2_chain_t *chain;
726*2d60b848STomohiro Kusumi 	int i;
727*2d60b848STomohiro Kusumi 
728*2d60b848STomohiro Kusumi 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
729*2d60b848STomohiro Kusumi 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
730*2d60b848STomohiro Kusumi 			cluster);
731*2d60b848STomohiro Kusumi 	}
732*2d60b848STomohiro Kusumi 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
733*2d60b848STomohiro Kusumi 	KKASSERT(cluster->refs > 0);
734*2d60b848STomohiro Kusumi 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
735*2d60b848STomohiro Kusumi 
736*2d60b848STomohiro Kusumi 	for (i = 0; i < cluster->nchains; ++i) {
737*2d60b848STomohiro Kusumi 		chain = cluster->array[i].chain;
738*2d60b848STomohiro Kusumi 		if (chain)
739*2d60b848STomohiro Kusumi 			hammer2_chain_unlock(chain);
740*2d60b848STomohiro Kusumi 	}
741*2d60b848STomohiro Kusumi }
742