1*2d60b848STomohiro Kusumi /*
2*2d60b848STomohiro Kusumi * SPDX-License-Identifier: BSD-3-Clause
3*2d60b848STomohiro Kusumi *
4*2d60b848STomohiro Kusumi * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5*2d60b848STomohiro Kusumi * Copyright (c) 2011-2022 The DragonFly Project. All rights reserved.
6*2d60b848STomohiro Kusumi *
7*2d60b848STomohiro Kusumi * This code is derived from software contributed to The DragonFly Project
8*2d60b848STomohiro Kusumi * by Matthew Dillon <dillon@dragonflybsd.org>
9*2d60b848STomohiro Kusumi *
10*2d60b848STomohiro Kusumi * Redistribution and use in source and binary forms, with or without
11*2d60b848STomohiro Kusumi * modification, are permitted provided that the following conditions
12*2d60b848STomohiro Kusumi * are met:
13*2d60b848STomohiro Kusumi *
14*2d60b848STomohiro Kusumi * 1. Redistributions of source code must retain the above copyright
15*2d60b848STomohiro Kusumi * notice, this list of conditions and the following disclaimer.
16*2d60b848STomohiro Kusumi * 2. Redistributions in binary form must reproduce the above copyright
17*2d60b848STomohiro Kusumi * notice, this list of conditions and the following disclaimer in
18*2d60b848STomohiro Kusumi * the documentation and/or other materials provided with the
19*2d60b848STomohiro Kusumi * distribution.
20*2d60b848STomohiro Kusumi * 3. Neither the name of The DragonFly Project nor the names of its
21*2d60b848STomohiro Kusumi * contributors may be used to endorse or promote products derived
22*2d60b848STomohiro Kusumi * from this software without specific, prior written permission.
23*2d60b848STomohiro Kusumi *
24*2d60b848STomohiro Kusumi * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25*2d60b848STomohiro Kusumi * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26*2d60b848STomohiro Kusumi * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27*2d60b848STomohiro Kusumi * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
28*2d60b848STomohiro Kusumi * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29*2d60b848STomohiro Kusumi * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30*2d60b848STomohiro Kusumi * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31*2d60b848STomohiro Kusumi * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32*2d60b848STomohiro Kusumi * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33*2d60b848STomohiro Kusumi * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34*2d60b848STomohiro Kusumi * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35*2d60b848STomohiro Kusumi * SUCH DAMAGE.
36*2d60b848STomohiro Kusumi */
37*2d60b848STomohiro Kusumi /*
38*2d60b848STomohiro Kusumi * The cluster module collects multiple chains representing the same
39*2d60b848STomohiro Kusumi * information from different nodes into a single entity. It allows direct
40*2d60b848STomohiro Kusumi * access to media data as long as it is not blockref array data (which
41*2d60b848STomohiro Kusumi * will obviously have to be different at each node).
42*2d60b848STomohiro Kusumi *
43*2d60b848STomohiro Kusumi * This module also handles I/O dispatch, status rollup, and various
44*2d60b848STomohiro Kusumi * mastership arrangements including quorum operations. It effectively
45*2d60b848STomohiro Kusumi * presents one topology to the vnops layer.
46*2d60b848STomohiro Kusumi *
47*2d60b848STomohiro Kusumi * Many of the API calls mimic chain API calls but operate on clusters
48*2d60b848STomohiro Kusumi * instead of chains. Please see hammer2_chain.c for more complete code
49*2d60b848STomohiro Kusumi * documentation of the API functions.
50*2d60b848STomohiro Kusumi *
51*2d60b848STomohiro Kusumi * WARNING! This module is *extremely* complex. It must issue asynchronous
52*2d60b848STomohiro Kusumi * locks and I/O, do quorum and/or master-slave processing, and
53*2d60b848STomohiro Kusumi * it must operate properly even if some nodes are broken (which
54*2d60b848STomohiro Kusumi * can also mean indefinite locks).
55*2d60b848STomohiro Kusumi *
56*2d60b848STomohiro Kusumi * CLUSTER OPERATIONS
57*2d60b848STomohiro Kusumi *
58*2d60b848STomohiro Kusumi * Cluster operations can be broken down into three pieces:
59*2d60b848STomohiro Kusumi *
60*2d60b848STomohiro Kusumi * (1) Chain locking and data retrieval.
61*2d60b848STomohiro Kusumi *
62*2d60b848STomohiro Kusumi * - Most complex functions, quorum management on transaction ids.
63*2d60b848STomohiro Kusumi *
64*2d60b848STomohiro Kusumi * - Locking and data accesses must be internally asynchronous.
65*2d60b848STomohiro Kusumi *
66*2d60b848STomohiro Kusumi * - Validate and manage cache coherency primitives (cache state
67*2d60b848STomohiro Kusumi * is stored in chain topologies but must be validated by these
68*2d60b848STomohiro Kusumi * functions).
69*2d60b848STomohiro Kusumi *
70*2d60b848STomohiro Kusumi * (2) Lookups and Scans
71*2d60b848STomohiro Kusumi * hammer2_cluster_lookup()
72*2d60b848STomohiro Kusumi * hammer2_cluster_next()
73*2d60b848STomohiro Kusumi *
74*2d60b848STomohiro Kusumi * - Depend on locking & data retrieval functions, but still complex.
75*2d60b848STomohiro Kusumi *
76*2d60b848STomohiro Kusumi * - Must do quorum management on transaction ids.
77*2d60b848STomohiro Kusumi *
 *	- Lookup and iteration ops must be internally asynchronous.
79*2d60b848STomohiro Kusumi *
80*2d60b848STomohiro Kusumi * (3) Modifying Operations
81*2d60b848STomohiro Kusumi * hammer2_cluster_create()
82*2d60b848STomohiro Kusumi *
83*2d60b848STomohiro Kusumi * - Can usually punt on failures, operation continues unless quorum
84*2d60b848STomohiro Kusumi * is lost. If quorum is lost, must wait for resynchronization
85*2d60b848STomohiro Kusumi * (depending on the management mode).
86*2d60b848STomohiro Kusumi *
87*2d60b848STomohiro Kusumi * - Must disconnect node on failures (also not flush), remount, and
88*2d60b848STomohiro Kusumi * resynchronize.
89*2d60b848STomohiro Kusumi *
90*2d60b848STomohiro Kusumi * - Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
 *	  with (the protocol is at a higher level than block-level).
93*2d60b848STomohiro Kusumi *
94*2d60b848STomohiro Kusumi * - Multiple local disk nodes (i.e. block devices) are another matter.
95*2d60b848STomohiro Kusumi * Chain operations have to be dispatched to per-node threads (xN)
96*2d60b848STomohiro Kusumi * because we can't asynchronize potentially very complex chain
97*2d60b848STomohiro Kusumi * operations in hammer2_chain.c (it would be a huge mess).
98*2d60b848STomohiro Kusumi *
99*2d60b848STomohiro Kusumi * (these threads are also used to terminate incoming kdmsg ops from
100*2d60b848STomohiro Kusumi * other machines).
101*2d60b848STomohiro Kusumi *
102*2d60b848STomohiro Kusumi * - Single-node filesystems do not use threads and will simply call
103*2d60b848STomohiro Kusumi * hammer2_chain.c functions directly. This short-cut is handled
104*2d60b848STomohiro Kusumi * at the base of each cluster function.
105*2d60b848STomohiro Kusumi */
106*2d60b848STomohiro Kusumi /*
107*2d60b848STomohiro Kusumi #include <sys/cdefs.h>
108*2d60b848STomohiro Kusumi #include <sys/param.h>
109*2d60b848STomohiro Kusumi #include <sys/systm.h>
110*2d60b848STomohiro Kusumi #include <sys/types.h>
111*2d60b848STomohiro Kusumi */
112*2d60b848STomohiro Kusumi
113*2d60b848STomohiro Kusumi #include "hammer2.h"
114*2d60b848STomohiro Kusumi
115*2d60b848STomohiro Kusumi /*
116*2d60b848STomohiro Kusumi * Returns the bref type of the cluster's foucs.
117*2d60b848STomohiro Kusumi *
118*2d60b848STomohiro Kusumi * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
119*2d60b848STomohiro Kusumi * The cluster must be locked.
120*2d60b848STomohiro Kusumi */
121*2d60b848STomohiro Kusumi uint8_t
hammer2_cluster_type(hammer2_cluster_t * cluster)122*2d60b848STomohiro Kusumi hammer2_cluster_type(hammer2_cluster_t *cluster)
123*2d60b848STomohiro Kusumi {
124*2d60b848STomohiro Kusumi if (cluster->error == 0) {
125*2d60b848STomohiro Kusumi KKASSERT(cluster->focus != NULL);
126*2d60b848STomohiro Kusumi return(cluster->focus->bref.type);
127*2d60b848STomohiro Kusumi }
128*2d60b848STomohiro Kusumi return 0;
129*2d60b848STomohiro Kusumi }
130*2d60b848STomohiro Kusumi
131*2d60b848STomohiro Kusumi /*
132*2d60b848STomohiro Kusumi * Returns the bref of the cluster's focus, sans any data-offset information
133*2d60b848STomohiro Kusumi * (since offset information is per-node and wouldn't be useful).
134*2d60b848STomohiro Kusumi *
135*2d60b848STomohiro Kusumi * Callers use this function to access modify_tid, mirror_tid, type,
136*2d60b848STomohiro Kusumi * key, and keybits.
137*2d60b848STomohiro Kusumi *
138*2d60b848STomohiro Kusumi * If the cluster is errored, returns an empty bref.
139*2d60b848STomohiro Kusumi * The cluster must be locked.
140*2d60b848STomohiro Kusumi */
141*2d60b848STomohiro Kusumi void
hammer2_cluster_bref(hammer2_cluster_t * cluster,hammer2_blockref_t * bref)142*2d60b848STomohiro Kusumi hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
143*2d60b848STomohiro Kusumi {
144*2d60b848STomohiro Kusumi if (cluster->error == 0) {
145*2d60b848STomohiro Kusumi KKASSERT(cluster->focus != NULL);
146*2d60b848STomohiro Kusumi *bref = cluster->focus->bref;
147*2d60b848STomohiro Kusumi bref->data_off = 0;
148*2d60b848STomohiro Kusumi } else {
149*2d60b848STomohiro Kusumi bzero(bref, sizeof(*bref));
150*2d60b848STomohiro Kusumi }
151*2d60b848STomohiro Kusumi }
152*2d60b848STomohiro Kusumi
153*2d60b848STomohiro Kusumi /*
154*2d60b848STomohiro Kusumi * Create a degenerate cluster with one ref from a single locked chain.
155*2d60b848STomohiro Kusumi * The returned cluster will be focused on the chain and inherit its
156*2d60b848STomohiro Kusumi * error state.
157*2d60b848STomohiro Kusumi *
158*2d60b848STomohiro Kusumi * The chain's lock and reference are transfered to the new cluster, so
159*2d60b848STomohiro Kusumi * the caller should not try to unlock the chain separately.
160*2d60b848STomohiro Kusumi *
161*2d60b848STomohiro Kusumi * We fake the flags.
162*2d60b848STomohiro Kusumi */
163*2d60b848STomohiro Kusumi void
hammer2_dummy_xop_from_chain(hammer2_xop_head_t * xop,hammer2_chain_t * chain)164*2d60b848STomohiro Kusumi hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
165*2d60b848STomohiro Kusumi {
166*2d60b848STomohiro Kusumi hammer2_cluster_t *cluster;
167*2d60b848STomohiro Kusumi
168*2d60b848STomohiro Kusumi bzero(xop, sizeof(*xop));
169*2d60b848STomohiro Kusumi
170*2d60b848STomohiro Kusumi cluster = &xop->cluster;
171*2d60b848STomohiro Kusumi cluster->array[0].chain = chain;
172*2d60b848STomohiro Kusumi cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
173*2d60b848STomohiro Kusumi cluster->nchains = 1;
174*2d60b848STomohiro Kusumi cluster->focus = chain;
175*2d60b848STomohiro Kusumi cluster->focus_index = 0;
176*2d60b848STomohiro Kusumi cluster->pmp = chain->pmp;
177*2d60b848STomohiro Kusumi cluster->refs = 1;
178*2d60b848STomohiro Kusumi cluster->error = chain->error;
179*2d60b848STomohiro Kusumi cluster->flags = HAMMER2_CLUSTER_LOCKED |
180*2d60b848STomohiro Kusumi HAMMER2_CLUSTER_WRHARD |
181*2d60b848STomohiro Kusumi HAMMER2_CLUSTER_RDHARD |
182*2d60b848STomohiro Kusumi HAMMER2_CLUSTER_MSYNCED |
183*2d60b848STomohiro Kusumi HAMMER2_CLUSTER_SSYNCED;
184*2d60b848STomohiro Kusumi }
185*2d60b848STomohiro Kusumi
186*2d60b848STomohiro Kusumi /*
187*2d60b848STomohiro Kusumi * Add a reference to a cluster and its underlying chains.
188*2d60b848STomohiro Kusumi *
189*2d60b848STomohiro Kusumi * We must also ref the underlying chains in order to allow ref/unlock
190*2d60b848STomohiro Kusumi * sequences to later re-lock.
191*2d60b848STomohiro Kusumi */
192*2d60b848STomohiro Kusumi void
hammer2_cluster_ref(hammer2_cluster_t * cluster)193*2d60b848STomohiro Kusumi hammer2_cluster_ref(hammer2_cluster_t *cluster)
194*2d60b848STomohiro Kusumi {
195*2d60b848STomohiro Kusumi atomic_add_int(&cluster->refs, 1);
196*2d60b848STomohiro Kusumi }
197*2d60b848STomohiro Kusumi
198*2d60b848STomohiro Kusumi /*
199*2d60b848STomohiro Kusumi * Drop the caller's reference to the cluster. When the ref count drops to
200*2d60b848STomohiro Kusumi * zero this function frees the cluster and drops all underlying chains.
201*2d60b848STomohiro Kusumi *
202*2d60b848STomohiro Kusumi * In-progress read I/Os are typically detached from the cluster once the
203*2d60b848STomohiro Kusumi * first one returns (the remaining stay attached to the DIOs but are then
204*2d60b848STomohiro Kusumi * ignored and drop naturally).
205*2d60b848STomohiro Kusumi */
206*2d60b848STomohiro Kusumi void
hammer2_cluster_drop(hammer2_cluster_t * cluster)207*2d60b848STomohiro Kusumi hammer2_cluster_drop(hammer2_cluster_t *cluster)
208*2d60b848STomohiro Kusumi {
209*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
210*2d60b848STomohiro Kusumi int i;
211*2d60b848STomohiro Kusumi
212*2d60b848STomohiro Kusumi KKASSERT(cluster->refs > 0);
213*2d60b848STomohiro Kusumi if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
214*2d60b848STomohiro Kusumi cluster->focus = NULL; /* safety XXX chg to assert */
215*2d60b848STomohiro Kusumi cluster->focus_index = 0;
216*2d60b848STomohiro Kusumi
217*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
218*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
219*2d60b848STomohiro Kusumi if (chain) {
220*2d60b848STomohiro Kusumi hammer2_chain_drop(chain);
221*2d60b848STomohiro Kusumi cluster->array[i].chain = NULL; /* safety */
222*2d60b848STomohiro Kusumi }
223*2d60b848STomohiro Kusumi }
224*2d60b848STomohiro Kusumi cluster->nchains = 0; /* safety */
225*2d60b848STomohiro Kusumi
226*2d60b848STomohiro Kusumi kfree(cluster, M_HAMMER2);
227*2d60b848STomohiro Kusumi /* cluster is invalid */
228*2d60b848STomohiro Kusumi }
229*2d60b848STomohiro Kusumi }
230*2d60b848STomohiro Kusumi
231*2d60b848STomohiro Kusumi /*
232*2d60b848STomohiro Kusumi * Lock a cluster. Cluster must already be referenced. Focus is maintained.
233*2d60b848STomohiro Kusumi *
234*2d60b848STomohiro Kusumi * WARNING! This function expects the caller to handle resolution of the
235*2d60b848STomohiro Kusumi * cluster. We never re-resolve the cluster in this function,
236*2d60b848STomohiro Kusumi * because it might be used to temporarily unlock/relock a cparent
237*2d60b848STomohiro Kusumi * in an iteration or recursrion, and the cparents elements do not
238*2d60b848STomohiro Kusumi * necessarily match.
239*2d60b848STomohiro Kusumi */
240*2d60b848STomohiro Kusumi void
hammer2_cluster_lock(hammer2_cluster_t * cluster,int how)241*2d60b848STomohiro Kusumi hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
242*2d60b848STomohiro Kusumi {
243*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
244*2d60b848STomohiro Kusumi int i;
245*2d60b848STomohiro Kusumi
246*2d60b848STomohiro Kusumi /* cannot be on inode-embedded cluster template, must be on copy */
247*2d60b848STomohiro Kusumi KKASSERT(cluster->refs > 0);
248*2d60b848STomohiro Kusumi KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
249*2d60b848STomohiro Kusumi if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
250*2d60b848STomohiro Kusumi panic("hammer2_cluster_lock: cluster %p already locked!\n",
251*2d60b848STomohiro Kusumi cluster);
252*2d60b848STomohiro Kusumi }
253*2d60b848STomohiro Kusumi atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
254*2d60b848STomohiro Kusumi
255*2d60b848STomohiro Kusumi /*
256*2d60b848STomohiro Kusumi * Lock chains and resolve state.
257*2d60b848STomohiro Kusumi */
258*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
259*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
260*2d60b848STomohiro Kusumi if (chain == NULL)
261*2d60b848STomohiro Kusumi continue;
262*2d60b848STomohiro Kusumi hammer2_chain_lock(chain, how);
263*2d60b848STomohiro Kusumi }
264*2d60b848STomohiro Kusumi }
265*2d60b848STomohiro Kusumi
266*2d60b848STomohiro Kusumi void
hammer2_cluster_unhold(hammer2_cluster_t * cluster)267*2d60b848STomohiro Kusumi hammer2_cluster_unhold(hammer2_cluster_t *cluster)
268*2d60b848STomohiro Kusumi {
269*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
270*2d60b848STomohiro Kusumi int i;
271*2d60b848STomohiro Kusumi
272*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
273*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
274*2d60b848STomohiro Kusumi if (chain == NULL)
275*2d60b848STomohiro Kusumi continue;
276*2d60b848STomohiro Kusumi hammer2_chain_unhold(chain);
277*2d60b848STomohiro Kusumi }
278*2d60b848STomohiro Kusumi }
279*2d60b848STomohiro Kusumi
280*2d60b848STomohiro Kusumi void
hammer2_cluster_rehold(hammer2_cluster_t * cluster)281*2d60b848STomohiro Kusumi hammer2_cluster_rehold(hammer2_cluster_t *cluster)
282*2d60b848STomohiro Kusumi {
283*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
284*2d60b848STomohiro Kusumi int i;
285*2d60b848STomohiro Kusumi
286*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
287*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
288*2d60b848STomohiro Kusumi if (chain == NULL)
289*2d60b848STomohiro Kusumi continue;
290*2d60b848STomohiro Kusumi hammer2_chain_rehold(chain);
291*2d60b848STomohiro Kusumi }
292*2d60b848STomohiro Kusumi }
293*2d60b848STomohiro Kusumi
294*2d60b848STomohiro Kusumi /*
295*2d60b848STomohiro Kusumi * This is used by the XOPS subsystem to calculate the state of
296*2d60b848STomohiro Kusumi * the collection and tell hammer2_xop_collect() what to do with it.
297*2d60b848STomohiro Kusumi * The collection can be in various states of desynchronization, the
298*2d60b848STomohiro Kusumi * caller specifically wants to resolve the passed-in key.
299*2d60b848STomohiro Kusumi *
300*2d60b848STomohiro Kusumi * Return values (HAMMER2_ERROR_*):
301*2d60b848STomohiro Kusumi *
302*2d60b848STomohiro Kusumi * 0 - Quorum agreement, key is valid
303*2d60b848STomohiro Kusumi *
304*2d60b848STomohiro Kusumi * ENOENT - Quorum agreement, end of scan
305*2d60b848STomohiro Kusumi *
306*2d60b848STomohiro Kusumi * ESRCH - Quorum agreement, key is INVALID (caller should
307*2d60b848STomohiro Kusumi * skip key).
308*2d60b848STomohiro Kusumi *
309*2d60b848STomohiro Kusumi * EIO - Quorum agreement but all elements had errors.
310*2d60b848STomohiro Kusumi *
311*2d60b848STomohiro Kusumi * EDEADLK - No quorum agreement possible for key, a repair
312*2d60b848STomohiro Kusumi * may be needed. Caller has to decide what to do,
313*2d60b848STomohiro Kusumi * possibly iterating the key or generating an EIO.
314*2d60b848STomohiro Kusumi *
315*2d60b848STomohiro Kusumi * EINPROGRESS - No quorum agreement yet, but agreement is still
316*2d60b848STomohiro Kusumi * possible if caller waits for more responses. Caller
317*2d60b848STomohiro Kusumi * should not iterate key.
318*2d60b848STomohiro Kusumi *
319*2d60b848STomohiro Kusumi * CHECK - CRC check error
320*2d60b848STomohiro Kusumi *
321*2d60b848STomohiro Kusumi * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
322*2d60b848STomohiro Kusumi *
323*2d60b848STomohiro Kusumi * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
324*2d60b848STomohiro Kusumi */
325*2d60b848STomohiro Kusumi int
hammer2_cluster_check(hammer2_cluster_t * cluster,hammer2_key_t key,int flags)326*2d60b848STomohiro Kusumi hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
327*2d60b848STomohiro Kusumi {
328*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
329*2d60b848STomohiro Kusumi hammer2_chain_t *focus;
330*2d60b848STomohiro Kusumi hammer2_pfs_t *pmp;
331*2d60b848STomohiro Kusumi hammer2_tid_t quorum_tid;
332*2d60b848STomohiro Kusumi hammer2_tid_t last_best_quorum_tid;
333*2d60b848STomohiro Kusumi uint32_t nflags;
334*2d60b848STomohiro Kusumi int ttlmasters;
335*2d60b848STomohiro Kusumi int ttlslaves;
336*2d60b848STomohiro Kusumi int nmasters;
337*2d60b848STomohiro Kusumi int nmasters_keymatch;
338*2d60b848STomohiro Kusumi int nslaves;
339*2d60b848STomohiro Kusumi int nquorum;
340*2d60b848STomohiro Kusumi int umasters; /* unknown masters (still in progress) */
341*2d60b848STomohiro Kusumi int error;
342*2d60b848STomohiro Kusumi int i;
343*2d60b848STomohiro Kusumi
344*2d60b848STomohiro Kusumi cluster->error = 0;
345*2d60b848STomohiro Kusumi cluster->focus = NULL;
346*2d60b848STomohiro Kusumi
347*2d60b848STomohiro Kusumi pmp = cluster->pmp;
348*2d60b848STomohiro Kusumi KKASSERT(pmp != NULL || cluster->nchains == 0);
349*2d60b848STomohiro Kusumi
350*2d60b848STomohiro Kusumi /*
351*2d60b848STomohiro Kusumi * Calculate quorum
352*2d60b848STomohiro Kusumi */
353*2d60b848STomohiro Kusumi nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
354*2d60b848STomohiro Kusumi nflags = 0;
355*2d60b848STomohiro Kusumi ttlmasters = 0;
356*2d60b848STomohiro Kusumi ttlslaves = 0;
357*2d60b848STomohiro Kusumi
358*2d60b848STomohiro Kusumi /*
359*2d60b848STomohiro Kusumi * Pass 1
360*2d60b848STomohiro Kusumi *
361*2d60b848STomohiro Kusumi * NOTE: A NULL chain is not necessarily an error, it could be
362*2d60b848STomohiro Kusumi * e.g. a lookup failure or the end of an iteration.
363*2d60b848STomohiro Kusumi * Process normally.
364*2d60b848STomohiro Kusumi */
365*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
366*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
367*2d60b848STomohiro Kusumi cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
368*2d60b848STomohiro Kusumi
369*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
370*2d60b848STomohiro Kusumi error = cluster->array[i].error;
371*2d60b848STomohiro Kusumi if (chain && error) {
372*2d60b848STomohiro Kusumi if (cluster->focus == NULL || cluster->focus == chain) {
373*2d60b848STomohiro Kusumi /* error will be overridden by valid focus */
374*2d60b848STomohiro Kusumi /* XXX */
375*2d60b848STomohiro Kusumi }
376*2d60b848STomohiro Kusumi
377*2d60b848STomohiro Kusumi /*
378*2d60b848STomohiro Kusumi * Must count total masters and slaves whether the
379*2d60b848STomohiro Kusumi * chain is errored or not.
380*2d60b848STomohiro Kusumi */
381*2d60b848STomohiro Kusumi switch (cluster->pmp->pfs_types[i]) {
382*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SUPROOT:
383*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_MASTER:
384*2d60b848STomohiro Kusumi ++ttlmasters;
385*2d60b848STomohiro Kusumi break;
386*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SLAVE:
387*2d60b848STomohiro Kusumi ++ttlslaves;
388*2d60b848STomohiro Kusumi break;
389*2d60b848STomohiro Kusumi }
390*2d60b848STomohiro Kusumi continue;
391*2d60b848STomohiro Kusumi }
392*2d60b848STomohiro Kusumi switch (cluster->pmp->pfs_types[i]) {
393*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_MASTER:
394*2d60b848STomohiro Kusumi ++ttlmasters;
395*2d60b848STomohiro Kusumi break;
396*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SLAVE:
397*2d60b848STomohiro Kusumi ++ttlslaves;
398*2d60b848STomohiro Kusumi break;
399*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SOFT_MASTER:
400*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_WRSOFT;
401*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDSOFT;
402*2d60b848STomohiro Kusumi break;
403*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SOFT_SLAVE:
404*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDSOFT;
405*2d60b848STomohiro Kusumi break;
406*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SUPROOT:
407*2d60b848STomohiro Kusumi /*
408*2d60b848STomohiro Kusumi * Degenerate cluster representing the super-root
409*2d60b848STomohiro Kusumi * topology on a single device. Fake stuff so
410*2d60b848STomohiro Kusumi * cluster ops work as expected.
411*2d60b848STomohiro Kusumi */
412*2d60b848STomohiro Kusumi ++ttlmasters;
413*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_WRHARD;
414*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDHARD;
415*2d60b848STomohiro Kusumi cluster->focus_index = i;
416*2d60b848STomohiro Kusumi cluster->focus = chain;
417*2d60b848STomohiro Kusumi cluster->error = error;
418*2d60b848STomohiro Kusumi break;
419*2d60b848STomohiro Kusumi default:
420*2d60b848STomohiro Kusumi break;
421*2d60b848STomohiro Kusumi }
422*2d60b848STomohiro Kusumi }
423*2d60b848STomohiro Kusumi
424*2d60b848STomohiro Kusumi /*
425*2d60b848STomohiro Kusumi * Pass 2
426*2d60b848STomohiro Kusumi *
427*2d60b848STomohiro Kusumi * Resolve nmasters - master nodes fully match
428*2d60b848STomohiro Kusumi *
429*2d60b848STomohiro Kusumi * Resolve umasters - master nodes operation still
430*2d60b848STomohiro Kusumi * in progress
431*2d60b848STomohiro Kusumi *
432*2d60b848STomohiro Kusumi * Resolve nmasters_keymatch - master nodes match the passed-in
433*2d60b848STomohiro Kusumi * key and may or may not match
434*2d60b848STomohiro Kusumi * the quorum-agreed tid.
435*2d60b848STomohiro Kusumi *
436*2d60b848STomohiro Kusumi * The quorum-agreed TID is the highest matching TID.
437*2d60b848STomohiro Kusumi */
438*2d60b848STomohiro Kusumi last_best_quorum_tid = HAMMER2_TID_MAX;
439*2d60b848STomohiro Kusumi umasters = 0;
440*2d60b848STomohiro Kusumi nmasters = 0;
441*2d60b848STomohiro Kusumi nmasters_keymatch = 0;
442*2d60b848STomohiro Kusumi quorum_tid = 0; /* fix gcc warning */
443*2d60b848STomohiro Kusumi
444*2d60b848STomohiro Kusumi while (nmasters < nquorum && last_best_quorum_tid != 0) {
445*2d60b848STomohiro Kusumi umasters = 0;
446*2d60b848STomohiro Kusumi nmasters = 0;
447*2d60b848STomohiro Kusumi nmasters_keymatch = 0;
448*2d60b848STomohiro Kusumi quorum_tid = 0;
449*2d60b848STomohiro Kusumi
450*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
451*2d60b848STomohiro Kusumi /* XXX SOFT smpresent handling */
452*2d60b848STomohiro Kusumi switch(cluster->pmp->pfs_types[i]) {
453*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_MASTER:
454*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SUPROOT:
455*2d60b848STomohiro Kusumi break;
456*2d60b848STomohiro Kusumi default:
457*2d60b848STomohiro Kusumi continue;
458*2d60b848STomohiro Kusumi }
459*2d60b848STomohiro Kusumi
460*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
461*2d60b848STomohiro Kusumi error = cluster->array[i].error;
462*2d60b848STomohiro Kusumi
463*2d60b848STomohiro Kusumi /*
464*2d60b848STomohiro Kusumi * Skip elements still in progress. umasters keeps
465*2d60b848STomohiro Kusumi * track of masters that might still be in-progress.
466*2d60b848STomohiro Kusumi */
467*2d60b848STomohiro Kusumi if (chain == NULL && (cluster->array[i].flags &
468*2d60b848STomohiro Kusumi HAMMER2_CITEM_NULL) == 0) {
469*2d60b848STomohiro Kusumi ++umasters;
470*2d60b848STomohiro Kusumi continue;
471*2d60b848STomohiro Kusumi }
472*2d60b848STomohiro Kusumi
473*2d60b848STomohiro Kusumi /*
474*2d60b848STomohiro Kusumi * Key match?
475*2d60b848STomohiro Kusumi */
476*2d60b848STomohiro Kusumi if (flags & HAMMER2_CHECK_NULL) {
477*2d60b848STomohiro Kusumi if (chain == NULL) {
478*2d60b848STomohiro Kusumi ++nmasters;
479*2d60b848STomohiro Kusumi ++nmasters_keymatch;
480*2d60b848STomohiro Kusumi if (cluster->error == 0)
481*2d60b848STomohiro Kusumi cluster->error = error;
482*2d60b848STomohiro Kusumi }
483*2d60b848STomohiro Kusumi } else if (chain &&
484*2d60b848STomohiro Kusumi (key == (hammer2_key_t)-1 ||
485*2d60b848STomohiro Kusumi chain->bref.key == key)) {
486*2d60b848STomohiro Kusumi ++nmasters_keymatch;
487*2d60b848STomohiro Kusumi
488*2d60b848STomohiro Kusumi if (chain->bref.modify_tid <
489*2d60b848STomohiro Kusumi last_best_quorum_tid &&
490*2d60b848STomohiro Kusumi quorum_tid < chain->bref.modify_tid) {
491*2d60b848STomohiro Kusumi /*
492*2d60b848STomohiro Kusumi * Select new TID as master if better
493*2d60b848STomohiro Kusumi * than any found so far in this loop,
494*2d60b848STomohiro Kusumi * as long as it does not reach the
495*2d60b848STomohiro Kusumi * best tid found in the previous loop.
496*2d60b848STomohiro Kusumi */
497*2d60b848STomohiro Kusumi nmasters = 0;
498*2d60b848STomohiro Kusumi quorum_tid = chain->bref.modify_tid;
499*2d60b848STomohiro Kusumi }
500*2d60b848STomohiro Kusumi if (quorum_tid == chain->bref.modify_tid) {
501*2d60b848STomohiro Kusumi /*
502*2d60b848STomohiro Kusumi * TID matches current collection.
503*2d60b848STomohiro Kusumi *
504*2d60b848STomohiro Kusumi * (error handled in next pass)
505*2d60b848STomohiro Kusumi */
506*2d60b848STomohiro Kusumi ++nmasters;
507*2d60b848STomohiro Kusumi if (chain->error == 0) {
508*2d60b848STomohiro Kusumi cluster->focus = chain;
509*2d60b848STomohiro Kusumi cluster->focus_index = i;
510*2d60b848STomohiro Kusumi }
511*2d60b848STomohiro Kusumi }
512*2d60b848STomohiro Kusumi }
513*2d60b848STomohiro Kusumi }
514*2d60b848STomohiro Kusumi if (nmasters >= nquorum)
515*2d60b848STomohiro Kusumi break;
516*2d60b848STomohiro Kusumi last_best_quorum_tid = quorum_tid;
517*2d60b848STomohiro Kusumi }
518*2d60b848STomohiro Kusumi
519*2d60b848STomohiro Kusumi /*
520*2d60b848STomohiro Kusumi kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
521*2d60b848STomohiro Kusumi nmasters, nquorum, nmasters_keymatch, umasters);
522*2d60b848STomohiro Kusumi */
523*2d60b848STomohiro Kusumi
524*2d60b848STomohiro Kusumi /*
525*2d60b848STomohiro Kusumi * Early return if we do not have enough masters.
526*2d60b848STomohiro Kusumi */
527*2d60b848STomohiro Kusumi if (nmasters < nquorum) {
528*2d60b848STomohiro Kusumi if (nmasters + umasters >= nquorum)
529*2d60b848STomohiro Kusumi return HAMMER2_ERROR_EINPROGRESS;
530*2d60b848STomohiro Kusumi if (nmasters_keymatch < nquorum)
531*2d60b848STomohiro Kusumi return HAMMER2_ERROR_ESRCH;
532*2d60b848STomohiro Kusumi return HAMMER2_ERROR_EDEADLK;
533*2d60b848STomohiro Kusumi }
534*2d60b848STomohiro Kusumi
535*2d60b848STomohiro Kusumi /*
536*2d60b848STomohiro Kusumi * Validated end of scan.
537*2d60b848STomohiro Kusumi */
538*2d60b848STomohiro Kusumi if (flags & HAMMER2_CHECK_NULL) {
539*2d60b848STomohiro Kusumi if (cluster->error == 0)
540*2d60b848STomohiro Kusumi cluster->error = HAMMER2_ERROR_ENOENT;
541*2d60b848STomohiro Kusumi return cluster->error;
542*2d60b848STomohiro Kusumi }
543*2d60b848STomohiro Kusumi
544*2d60b848STomohiro Kusumi /*
545*2d60b848STomohiro Kusumi * If we have a NULL focus at this point the agreeing quorum all
546*2d60b848STomohiro Kusumi * had chain errors.
547*2d60b848STomohiro Kusumi */
548*2d60b848STomohiro Kusumi if (cluster->focus == NULL)
549*2d60b848STomohiro Kusumi return HAMMER2_ERROR_EIO;
550*2d60b848STomohiro Kusumi
551*2d60b848STomohiro Kusumi /*
552*2d60b848STomohiro Kusumi * Pass 3
553*2d60b848STomohiro Kusumi *
554*2d60b848STomohiro Kusumi * We have quorum agreement, validate elements, not end of scan.
555*2d60b848STomohiro Kusumi */
556*2d60b848STomohiro Kusumi nslaves = 0;
557*2d60b848STomohiro Kusumi cluster->error = 0;
558*2d60b848STomohiro Kusumi
559*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
560*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
561*2d60b848STomohiro Kusumi error = cluster->array[i].error;
562*2d60b848STomohiro Kusumi if (chain == NULL ||
563*2d60b848STomohiro Kusumi chain->bref.key != key ||
564*2d60b848STomohiro Kusumi chain->bref.modify_tid != quorum_tid) {
565*2d60b848STomohiro Kusumi continue;
566*2d60b848STomohiro Kusumi }
567*2d60b848STomohiro Kusumi
568*2d60b848STomohiro Kusumi /*
569*2d60b848STomohiro Kusumi * Quorum Match
570*2d60b848STomohiro Kusumi *
571*2d60b848STomohiro Kusumi * XXX for now, cumulative error.
572*2d60b848STomohiro Kusumi */
573*2d60b848STomohiro Kusumi if (cluster->error == 0)
574*2d60b848STomohiro Kusumi cluster->error = error;
575*2d60b848STomohiro Kusumi
576*2d60b848STomohiro Kusumi switch (cluster->pmp->pfs_types[i]) {
577*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_MASTER:
578*2d60b848STomohiro Kusumi cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
579*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
580*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_WRHARD;
581*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDHARD;
582*2d60b848STomohiro Kusumi break;
583*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SLAVE:
584*2d60b848STomohiro Kusumi /*
585*2d60b848STomohiro Kusumi * We must have enough up-to-date masters to reach
586*2d60b848STomohiro Kusumi * a quorum and the slave modify_tid must match the
587*2d60b848STomohiro Kusumi * quorum's modify_tid.
588*2d60b848STomohiro Kusumi *
589*2d60b848STomohiro Kusumi * Do not select an errored slave.
590*2d60b848STomohiro Kusumi */
591*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
592*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDHARD;
593*2d60b848STomohiro Kusumi ++nslaves;
594*2d60b848STomohiro Kusumi break;
595*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SOFT_MASTER:
596*2d60b848STomohiro Kusumi /*
597*2d60b848STomohiro Kusumi * Directly mounted soft master always wins. There
598*2d60b848STomohiro Kusumi * should be only one.
599*2d60b848STomohiro Kusumi */
600*2d60b848STomohiro Kusumi cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
601*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
602*2d60b848STomohiro Kusumi break;
603*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SOFT_SLAVE:
604*2d60b848STomohiro Kusumi /*
605*2d60b848STomohiro Kusumi * Directly mounted soft slave always wins. There
606*2d60b848STomohiro Kusumi * should be only one.
607*2d60b848STomohiro Kusumi *
608*2d60b848STomohiro Kusumi * XXX
609*2d60b848STomohiro Kusumi */
610*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
611*2d60b848STomohiro Kusumi break;
612*2d60b848STomohiro Kusumi case HAMMER2_PFSTYPE_SUPROOT:
613*2d60b848STomohiro Kusumi /*
614*2d60b848STomohiro Kusumi * spmp (degenerate case)
615*2d60b848STomohiro Kusumi */
616*2d60b848STomohiro Kusumi cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
617*2d60b848STomohiro Kusumi cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
618*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_WRHARD;
619*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_RDHARD;
620*2d60b848STomohiro Kusumi break;
621*2d60b848STomohiro Kusumi default:
622*2d60b848STomohiro Kusumi break;
623*2d60b848STomohiro Kusumi }
624*2d60b848STomohiro Kusumi }
625*2d60b848STomohiro Kusumi
626*2d60b848STomohiro Kusumi /*
627*2d60b848STomohiro Kusumi * Focus now set, adjust ddflag. Skip this pass if the focus
628*2d60b848STomohiro Kusumi * is bad or if we are at the PFS root (the bref won't match at
629*2d60b848STomohiro Kusumi * the PFS root, obviously).
630*2d60b848STomohiro Kusumi *
631*2d60b848STomohiro Kusumi * focus is probably not locked and it isn't safe to test its
632*2d60b848STomohiro Kusumi * content (e.g. focus->data, focus->dio, other content). We
633*2d60b848STomohiro Kusumi * do not synchronize the dio to the cpu here. In fact, in numerous
634*2d60b848STomohiro Kusumi * situations the frontend doesn't even need to access its dio/data,
635*2d60b848STomohiro Kusumi * so synchronizing it here would be wasteful.
636*2d60b848STomohiro Kusumi */
637*2d60b848STomohiro Kusumi focus = cluster->focus;
638*2d60b848STomohiro Kusumi if (focus) {
639*2d60b848STomohiro Kusumi cluster->ddflag =
640*2d60b848STomohiro Kusumi (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
641*2d60b848STomohiro Kusumi } else {
642*2d60b848STomohiro Kusumi cluster->ddflag = 0;
643*2d60b848STomohiro Kusumi goto skip4;
644*2d60b848STomohiro Kusumi }
645*2d60b848STomohiro Kusumi if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
646*2d60b848STomohiro Kusumi goto skip4;
647*2d60b848STomohiro Kusumi
648*2d60b848STomohiro Kusumi /*
649*2d60b848STomohiro Kusumi * Pass 4
650*2d60b848STomohiro Kusumi *
651*2d60b848STomohiro Kusumi * Validate the elements that were not marked invalid. They should
652*2d60b848STomohiro Kusumi * match.
653*2d60b848STomohiro Kusumi */
654*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
655*2d60b848STomohiro Kusumi int ddflag;
656*2d60b848STomohiro Kusumi
657*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
658*2d60b848STomohiro Kusumi
659*2d60b848STomohiro Kusumi if (chain == NULL)
660*2d60b848STomohiro Kusumi continue;
661*2d60b848STomohiro Kusumi if (chain == focus)
662*2d60b848STomohiro Kusumi continue;
663*2d60b848STomohiro Kusumi if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
664*2d60b848STomohiro Kusumi continue;
665*2d60b848STomohiro Kusumi
666*2d60b848STomohiro Kusumi ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
667*2d60b848STomohiro Kusumi if (chain->bref.type != focus->bref.type ||
668*2d60b848STomohiro Kusumi chain->bref.key != focus->bref.key ||
669*2d60b848STomohiro Kusumi chain->bref.keybits != focus->bref.keybits ||
670*2d60b848STomohiro Kusumi chain->bref.modify_tid != focus->bref.modify_tid ||
671*2d60b848STomohiro Kusumi chain->bytes != focus->bytes ||
672*2d60b848STomohiro Kusumi ddflag != cluster->ddflag) {
673*2d60b848STomohiro Kusumi cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
674*2d60b848STomohiro Kusumi if (hammer2_debug & 1)
675*2d60b848STomohiro Kusumi kprintf("cluster_check: matching modify_tid failed "
676*2d60b848STomohiro Kusumi "bref test: idx=%d type=%02x/%02x "
677*2d60b848STomohiro Kusumi "key=%016jx/%d-%016jx/%d "
678*2d60b848STomohiro Kusumi "mod=%016jx/%016jx bytes=%u/%u\n",
679*2d60b848STomohiro Kusumi i,
680*2d60b848STomohiro Kusumi chain->bref.type, focus->bref.type,
681*2d60b848STomohiro Kusumi chain->bref.key, chain->bref.keybits,
682*2d60b848STomohiro Kusumi focus->bref.key, focus->bref.keybits,
683*2d60b848STomohiro Kusumi chain->bref.modify_tid, focus->bref.modify_tid,
684*2d60b848STomohiro Kusumi chain->bytes, focus->bytes);
685*2d60b848STomohiro Kusumi if (hammer2_debug & 0x4000)
686*2d60b848STomohiro Kusumi panic("cluster_check");
687*2d60b848STomohiro Kusumi /* flag issue and force resync? */
688*2d60b848STomohiro Kusumi }
689*2d60b848STomohiro Kusumi }
690*2d60b848STomohiro Kusumi skip4:
691*2d60b848STomohiro Kusumi
692*2d60b848STomohiro Kusumi if (ttlslaves == 0)
693*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_NOSOFT;
694*2d60b848STomohiro Kusumi if (ttlmasters == 0)
695*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_NOHARD;
696*2d60b848STomohiro Kusumi
697*2d60b848STomohiro Kusumi /*
698*2d60b848STomohiro Kusumi * Set SSYNCED or MSYNCED for slaves and masters respectively if
699*2d60b848STomohiro Kusumi * all available nodes (even if 0 are available) are fully
700*2d60b848STomohiro Kusumi * synchronized. This is used by the synchronization thread to
701*2d60b848STomohiro Kusumi * determine if there is work it could potentially accomplish.
702*2d60b848STomohiro Kusumi */
703*2d60b848STomohiro Kusumi if (nslaves == ttlslaves)
704*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_SSYNCED;
705*2d60b848STomohiro Kusumi if (nmasters == ttlmasters)
706*2d60b848STomohiro Kusumi nflags |= HAMMER2_CLUSTER_MSYNCED;
707*2d60b848STomohiro Kusumi
708*2d60b848STomohiro Kusumi /*
709*2d60b848STomohiro Kusumi * Determine if the cluster was successfully locked for the
710*2d60b848STomohiro Kusumi * requested operation and generate an error code. The cluster
711*2d60b848STomohiro Kusumi * will not be locked (or ref'd) if an error is returned.
712*2d60b848STomohiro Kusumi */
713*2d60b848STomohiro Kusumi atomic_set_int(&cluster->flags, nflags);
714*2d60b848STomohiro Kusumi atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
715*2d60b848STomohiro Kusumi
716*2d60b848STomohiro Kusumi return cluster->error;
717*2d60b848STomohiro Kusumi }
718*2d60b848STomohiro Kusumi
719*2d60b848STomohiro Kusumi /*
720*2d60b848STomohiro Kusumi * Unlock a cluster. Refcount and focus is maintained.
721*2d60b848STomohiro Kusumi */
722*2d60b848STomohiro Kusumi void
hammer2_cluster_unlock(hammer2_cluster_t * cluster)723*2d60b848STomohiro Kusumi hammer2_cluster_unlock(hammer2_cluster_t *cluster)
724*2d60b848STomohiro Kusumi {
725*2d60b848STomohiro Kusumi hammer2_chain_t *chain;
726*2d60b848STomohiro Kusumi int i;
727*2d60b848STomohiro Kusumi
728*2d60b848STomohiro Kusumi if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
729*2d60b848STomohiro Kusumi kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
730*2d60b848STomohiro Kusumi cluster);
731*2d60b848STomohiro Kusumi }
732*2d60b848STomohiro Kusumi KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
733*2d60b848STomohiro Kusumi KKASSERT(cluster->refs > 0);
734*2d60b848STomohiro Kusumi atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
735*2d60b848STomohiro Kusumi
736*2d60b848STomohiro Kusumi for (i = 0; i < cluster->nchains; ++i) {
737*2d60b848STomohiro Kusumi chain = cluster->array[i].chain;
738*2d60b848STomohiro Kusumi if (chain)
739*2d60b848STomohiro Kusumi hammer2_chain_unlock(chain);
740*2d60b848STomohiro Kusumi }
741*2d60b848STomohiro Kusumi }
742