xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision a680a824caa81f60b4d5944e34dfd86b656e862e)
1 /*
2  * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *	- The most complex functions; they perform quorum management on transaction ids.
60  *
61  *	- Locking and data accesses must be internally asynchronous.
62  *
63  *	- Validate and manage cache coherency primitives (cache state
64  *	  is stored in chain topologies but must be validated by these
65  *	  functions).
66  *
67  * (2) Lookups and Scans
68  *		hammer2_cluster_lookup()
69  *		hammer2_cluster_next()
70  *
71  *	- Depend on locking & data retrieval functions, but still complex.
72  *
73  *	- Must do quorum management on transaction ids.
74  *
75  *	- Lookup and iteration ops must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *		hammer2_cluster_create()
79  *
80  *	- Can usually punt on failures, operation continues unless quorum
81  *	  is lost.  If quorum is lost, must wait for resynchronization
82  *	  (depending on the management mode).
83  *
84  *	- Must disconnect the node on failures (and also not flush it),
85  *	  remount, and resynchronize.
86  *
87  *	- Network links (via kdmsg) are relatively easy to issue as the
88  *	  complex underworkings of hammer2_chain.c don't have to be messed
89  *	  with (the protocol is at a higher level than block-level).
90  *
91  *	- Multiple local disk nodes (i.e. block devices) are another matter.
92  *	  Chain operations have to be dispatched to per-node threads (xN)
93  *	  because we can't asynchronize potentially very complex chain
94  *	  operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *	  (these threads are also used to terminate incoming kdmsg ops from
97  *	  other machines).
98  *
99  *	- Single-node filesystems do not use threads and will simply call
100  *	  hammer2_chain.c functions directly.  This short-cut is handled
101  *	  at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109 
110 #include "hammer2.h"
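
/*
 * Illustrative sketch, not part of the driver (hypothetical helper kept
 * under #if 0): hammer2_cluster_resolve() and hammer2_cluster_check()
 * below both derive their quorum threshold as a simple majority of the
 * configured masters, pfs_nmasters / 2 + 1.
 */
#if 0
static int
hammer2_cluster_nquorum_sketch(hammer2_pfs_t *pmp)
{
	/*
	 * pfs_nmasters -> nquorum:  1->1, 2->2, 3->2, 4->3, 5->3
	 */
	return (pmp->pfs_nmasters / 2 + 1);
}
#endif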
111 
112 /*
113  * Returns the bref type of the cluster's focus.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121 	if (cluster->error == 0) {
122 		KKASSERT(cluster->focus != NULL);
123 		return(cluster->focus->bref.type);
124 	}
125 	return 0;
126 }
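
/*
 * Usage sketch (hypothetical helper, not compiled): with the cluster
 * locked, callers can branch on the focus bref type; 0
 * (HAMMER2_BREF_TYPE_EMPTY) indicates an errored cluster.
 */
#if 0
static void
hammer2_cluster_type_sketch(hammer2_cluster_t *cluster)
{
	if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
		/* focus is an inode chain */
	} else if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_EMPTY) {
		/* no usable focus, consult cluster->error */
	}
}
#endif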
127 
128 /*
129  * Returns the bref of the cluster's focus, sans any data-offset information
130  * (since offset information is per-node and wouldn't be useful).
131  *
132  * Callers use this function to access modify_tid, mirror_tid, type,
133  * key, and keybits.
134  *
135  * If the cluster is errored, returns an empty bref.
136  * The cluster must be locked.
137  */
138 void
139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
140 {
141 	if (cluster->error == 0) {
142 		KKASSERT(cluster->focus != NULL);
143 		*bref = cluster->focus->bref;
144 		bref->data_off = 0;
145 	} else {
146 		bzero(bref, sizeof(*bref));
147 	}
148 }
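
/*
 * Usage sketch (hypothetical helper, not compiled): per the comment
 * above, callers copy the focus bref to read modify_tid, key, and
 * keybits; data_off is always zeroed in the copy.
 */
#if 0
static hammer2_tid_t
hammer2_cluster_modify_tid_sketch(hammer2_cluster_t *cluster)
{
	hammer2_blockref_t bref;

	hammer2_cluster_bref(cluster, &bref);
	return (bref.modify_tid);	/* 0 if the cluster is errored */
}
#endif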
149 
150 /*
151  * Create a degenerate cluster with one ref from a single locked chain.
152  * The returned cluster will be focused on the chain and inherit its
153  * error state.
154  *
155  * The chain's lock and reference are transferred to the new cluster, so
156  * the caller should not try to unlock the chain separately.
157  *
158  * We fake the flags.
159  */
160 void
161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
162 {
163 	hammer2_cluster_t *cluster;
164 
165 	bzero(xop, sizeof(*xop));
166 
167 	cluster = &xop->cluster;
168 	cluster->array[0].chain = chain;
169 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
170 	cluster->nchains = 1;
171 	cluster->focus = chain;
172 	cluster->focus_index = 0;
173 	cluster->pmp = chain->pmp;
174 	cluster->refs = 1;
175 	cluster->error = chain->error;
176 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
177 			 HAMMER2_CLUSTER_WRHARD |
178 			 HAMMER2_CLUSTER_RDHARD |
179 			 HAMMER2_CLUSTER_MSYNCED |
180 			 HAMMER2_CLUSTER_SSYNCED;
181 }
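
/*
 * Usage sketch (hypothetical caller, not compiled): the dummy xop is
 * built around an already-locked chain so cluster-aware code can be
 * reused on a single chain.
 */
#if 0
static void
hammer2_dummy_xop_sketch(hammer2_chain_t *chain)
{
	hammer2_xop_head_t dummy;

	hammer2_dummy_xop_from_chain(&dummy, chain);
	/* ... operate on &dummy.cluster as a one-element cluster ... */
}
#endif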
182 
183 /*
184  * Add a reference to a cluster and its underlying chains.
185  *
186  * We must also ref the underlying chains in order to allow ref/unlock
187  * sequences to later re-lock.
188  */
189 void
190 hammer2_cluster_ref(hammer2_cluster_t *cluster)
191 {
192 	atomic_add_int(&cluster->refs, 1);
193 }
194 
195 /*
196  * Drop the caller's reference to the cluster.  When the ref count drops to
197  * zero this function frees the cluster and drops all underlying chains.
198  *
199  * In-progress read I/Os are typically detached from the cluster once the
200  * first one returns (the remaining stay attached to the DIOs but are then
201  * ignored and drop naturally).
202  */
203 void
204 hammer2_cluster_drop(hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int i;
208 
209 	KKASSERT(cluster->refs > 0);
210 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
211 		cluster->focus = NULL;		/* safety XXX chg to assert */
212 		cluster->focus_index = 0;
213 
214 		for (i = 0; i < cluster->nchains; ++i) {
215 			chain = cluster->array[i].chain;
216 			if (chain) {
217 				hammer2_chain_drop(chain);
218 				cluster->array[i].chain = NULL; /* safety */
219 			}
220 		}
221 		cluster->nchains = 0;				/* safety */
222 
223 		kfree(cluster, M_HAMMER2);
224 		/* cluster is invalid */
225 	}
226 }
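
/*
 * Usage sketch (hypothetical caller, not compiled): the ref/unlock
 * sequence mentioned above lets a caller temporarily unlock a cluster
 * and re-lock it later without tearing it down.  HAMMER2_RESOLVE_ALWAYS
 * is assumed as the lock mode here.
 */
#if 0
static void
hammer2_cluster_relock_sketch(hammer2_cluster_t *cluster)
{
	hammer2_cluster_ref(cluster);		/* take an extra ref */
	hammer2_cluster_unlock(cluster);
	/* ... cluster structure remains valid while unlocked ... */
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_drop(cluster);		/* drop the extra ref */
}
#endif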
227 
228 /*
229  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
230  *
231  * WARNING! This function expects the caller to handle resolution of the
232  *	    cluster.  We never re-resolve the cluster in this function,
233  *	    because it might be used to temporarily unlock/relock a cparent
234  *	    in an iteration or recursion, and the cparent's elements do not
235  *	    necessarily match.
236  */
237 void
238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
239 {
240 	hammer2_chain_t *chain;
241 	int i;
242 
243 	/* cannot be on inode-embedded cluster template, must be on copy */
244 	KKASSERT(cluster->refs > 0);
245 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
246 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
247 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
248 			cluster);
249 	}
250 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
251 
252 	/*
253 	 * Lock chains and resolve state.
254 	 */
255 	for (i = 0; i < cluster->nchains; ++i) {
256 		chain = cluster->array[i].chain;
257 		if (chain == NULL)
258 			continue;
259 		hammer2_chain_lock(chain, how);
260 	}
261 }
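
/*
 * Usage sketch (hypothetical caller, not compiled): per the WARNING
 * above, locking does not recompute cluster state; a caller that needs
 * the focus and quorum flags recomputed calls hammer2_cluster_resolve()
 * itself, and only where that is appropriate.
 */
#if 0
static void
hammer2_cluster_lock_resolve_sketch(hammer2_cluster_t *cluster)
{
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_resolve(cluster);	/* caller's responsibility */
	/* ... */
	hammer2_cluster_unlock(cluster);
}
#endif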
262 
263 /*
264  * Calculate the clustering state for the cluster and set its focus.
265  * This routine must be called with care.  For example, it should not
266  * normally be called after relocking a non-leaf cluster because parent
267  * clusters help iterations and each element might be at a slightly different
268  * indirect node (each node's topology is independently indexed).
269  *
270  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
271  * operations.  Typically this is only set on a quorum of MASTERs or
272  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
273  * is present, this bit is *not* set on a quorum of MASTERs.  The
274  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
275  * that create/modify/delete elements use it.
276  *
277  * The chains making up the cluster may be narrowed down based on quorum
278  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
279  * narrowed down to a single chain as long as the entire subtopology is known
280  * to be intact.  So, for example, we can narrow a read-only op to a single
281  * fast SLAVE but if we focus a CACHE chain we must still retain at least
282  * a SLAVE to ensure that the subtopology can be accessed.
283  *
284  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
285  * to be maintained once the topology is validated as-of the top level of
286  * the operation.
287  *
288  * If a failure occurs the operation must be aborted by higher-level code and
289  * retried. XXX
290  */
291 void
292 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
293 {
294 	hammer2_chain_t *chain;
295 	hammer2_chain_t *focus;
296 	hammer2_pfs_t *pmp;
297 	hammer2_tid_t quorum_tid;
298 	hammer2_tid_t last_best_quorum_tid;
299 	int focus_pfs_type;
300 	uint32_t nflags;
301 	int ttlmasters;
302 	int ttlslaves;
303 	int nmasters;
304 	int nslaves;
305 	int nquorum;
306 	int smpresent;
307 	int i;
308 
309 	cluster->error = 0;
310 	cluster->focus = NULL;
311 
312 	focus_pfs_type = 0;
313 	nflags = 0;
314 	ttlmasters = 0;
315 	ttlslaves = 0;
316 	nmasters = 0;
317 	nslaves = 0;
318 
319 	/*
320 	 * Calculate quorum
321 	 */
322 	pmp = cluster->pmp;
323 	KKASSERT(pmp != NULL || cluster->nchains == 0);
324 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
325 	smpresent = 0;
326 
327 	/*
328 	 * Pass 1
329 	 *
330 	 * NOTE: A NULL chain is not necessarily an error, it could be
331 	 *	 e.g. a lookup failure or the end of an iteration.
332 	 *	 Process normally.
333 	 */
334 	for (i = 0; i < cluster->nchains; ++i) {
335 		chain = cluster->array[i].chain;
336 		if (chain && chain->error) {
337 			if (cluster->focus == NULL || cluster->focus == chain) {
338 				/* error will be overridden by valid focus */
339 				cluster->error = chain->error;
340 			}
341 
342 			/*
343 			 * Must count total masters and slaves whether the
344 			 * chain is errored or not.
345 			 */
346 			switch (cluster->pmp->pfs_types[i]) {
347 			case HAMMER2_PFSTYPE_SUPROOT:
348 			case HAMMER2_PFSTYPE_MASTER:
349 				++ttlmasters;
350 				break;
351 			case HAMMER2_PFSTYPE_SLAVE:
352 				++ttlslaves;
353 				break;
354 			}
355 			continue;
356 		}
357 		switch (cluster->pmp->pfs_types[i]) {
358 		case HAMMER2_PFSTYPE_MASTER:
359 			++ttlmasters;
360 			break;
361 		case HAMMER2_PFSTYPE_SLAVE:
362 			++ttlslaves;
363 			break;
364 		case HAMMER2_PFSTYPE_SOFT_MASTER:
365 			nflags |= HAMMER2_CLUSTER_WRSOFT;
366 			nflags |= HAMMER2_CLUSTER_RDSOFT;
367 			smpresent = 1;
368 			break;
369 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
370 			nflags |= HAMMER2_CLUSTER_RDSOFT;
371 			break;
372 		case HAMMER2_PFSTYPE_SUPROOT:
373 			/*
374 			 * Degenerate cluster representing the super-root
375 			 * topology on a single device.  Fake stuff so
376 			 * cluster ops work as expected.
377 			 */
378 			nflags |= HAMMER2_CLUSTER_WRHARD;
379 			nflags |= HAMMER2_CLUSTER_RDHARD;
380 			cluster->focus_index = i;
381 			cluster->focus = chain;
382 			cluster->error = chain ? chain->error : 0;
383 			++ttlmasters;
384 			break;
385 		default:
386 			break;
387 		}
388 	}
389 
390 	/*
391 	 * Pass 2
392 	 *
393 	 * Resolve masters.  Calculate nmasters for the highest matching
394 	 * TID, if a quorum cannot be attained try the next lower matching
395 	 * TID until we exhaust TIDs.
396 	 *
397 	 * NOTE: A NULL chain is not necessarily an error, it could be
398 	 *	 e.g. a lookup failure or the end of an iteration.
399 	 *	 Process normally.
400 	 */
401 	last_best_quorum_tid = HAMMER2_TID_MAX;
402 	quorum_tid = 0;		/* fix gcc warning */
403 
404 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
405 		nmasters = 0;
406 		quorum_tid = 0;
407 
408 		for (i = 0; i < cluster->nchains; ++i) {
409 			switch (cluster->pmp->pfs_types[i]) {
410 			case HAMMER2_PFSTYPE_SUPROOT:
411 			case HAMMER2_PFSTYPE_MASTER:
412 				break;
413 			default:
414 				continue;
415 			}
416 			chain = cluster->array[i].chain;
417 
418 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
419 				/*
420 				 * Invalid as in unsynchronized, cannot be
421 				 * used to calculate the quorum.
422 				 */
423 			} else if (chain == NULL && quorum_tid == 0) {
424 				/*
425 				 * NULL chain on master matches NULL chains
426 				 * on other masters.
427 				 */
428 				++nmasters;
429 			} else if (quorum_tid < last_best_quorum_tid &&
430 				   chain != NULL &&
431 				   (quorum_tid < chain->bref.modify_tid ||
432 				    nmasters == 0)) {
433 				/*
434 				 * Better TID located, reset nmasters count.
435 				 */
436 				nmasters = 1;
437 				quorum_tid = chain->bref.modify_tid;
438 			} else if (chain &&
439 				   quorum_tid == chain->bref.modify_tid) {
440 				/*
441 				 * TID matches current collection.
442 				 */
443 				++nmasters;
444 			}
445 		}
446 		if (nmasters >= nquorum)
447 			break;
448 		last_best_quorum_tid = quorum_tid;
449 	}
450 
451 	/*
452 	 * Pass 3
453 	 *
454 	 * NOTE: A NULL chain is not necessarily an error, it could be
455 	 *	 e.g. a lookup failure or the end of an iteration.
456 	 *	 Process normally.
457 	 */
458 	for (i = 0; i < cluster->nchains; ++i) {
459 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
460 		chain = cluster->array[i].chain;
461 		if (chain && chain->error) {
462 			if (cluster->focus == NULL || cluster->focus == chain) {
463 				/* error will be overridden by valid focus */
464 				cluster->error = chain->error;
465 			}
466 			continue;
467 		}
468 
469 		switch (cluster->pmp->pfs_types[i]) {
470 		case HAMMER2_PFSTYPE_MASTER:
471 			/*
472 			 * We must have enough up-to-date masters to reach
473 			 * a quorum and the master modify_tid must match
474 			 * the quorum's modify_tid.
475 			 *
476 			 * Do not select an errored or out-of-sync master.
477 			 */
478 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
479 				nflags |= HAMMER2_CLUSTER_UNHARD;
480 			} else if (nmasters >= nquorum &&
481 				   (chain == NULL || chain->error == 0) &&
482 				   ((chain == NULL && quorum_tid == 0) ||
483 				    (chain != NULL && quorum_tid ==
484 						  chain->bref.modify_tid))) {
485 				nflags |= HAMMER2_CLUSTER_WRHARD;
486 				nflags |= HAMMER2_CLUSTER_RDHARD;
487 				if (!smpresent) {
488 					cluster->array[i].flags |=
489 							HAMMER2_CITEM_FEMOD;
490 				}
491 				if (cluster->focus == NULL ||
492 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
493 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
494 					cluster->focus_index = i;
495 					cluster->focus = chain; /* NULL ok */
496 					cluster->error = chain ? chain->error :
497 								 0;
498 				}
499 			} else if (chain == NULL || chain->error == 0) {
500 				nflags |= HAMMER2_CLUSTER_UNHARD;
501 			}
502 			break;
503 		case HAMMER2_PFSTYPE_SLAVE:
504 			/*
505 			 * We must have enough up-to-date masters to reach
506 			 * a quorum and the slave modify_tid must match the
507 			 * quorum's modify_tid.
508 			 *
509 			 * Do not select an errored slave.
510 			 */
511 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
512 				nflags |= HAMMER2_CLUSTER_UNHARD;
513 			} else if (nmasters >= nquorum &&
514 				   (chain == NULL || chain->error == 0) &&
515 				   ((chain == NULL && quorum_tid == 0) ||
516 				    (chain && quorum_tid ==
517 					      chain->bref.modify_tid))) {
518 				++nslaves;
519 				nflags |= HAMMER2_CLUSTER_RDHARD;
520 #if 0
521 				/* XXX optimize for RESOLVE_RDONLY */
522 				if (cluster->focus == NULL) {
523 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
524 					cluster->focus_index = i;
525 					cluster->focus = chain; /* NULL ok */
526 					cluster->error = chain ? chain->error :
527 								 0;
528 				}
529 #endif
530 			} else if (chain == NULL || chain->error == 0) {
531 				nflags |= HAMMER2_CLUSTER_UNSOFT;
532 			}
533 			break;
534 		case HAMMER2_PFSTYPE_SOFT_MASTER:
535 			/*
536 			 * Directly mounted soft master always wins.  There
537 			 * should be only one.
538 			 */
539 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
540 			cluster->focus_index = i;
541 			cluster->focus = chain;
542 			cluster->error = chain ? chain->error : 0;
543 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
544 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
545 			break;
546 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
547 			/*
548 			 * Directly mounted soft slave always wins.  There
549 			 * should be only one.
550 			 */
551 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
552 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
553 				cluster->focus_index = i;
554 				cluster->focus = chain;
555 				cluster->error = chain ? chain->error : 0;
556 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
557 			}
558 			break;
559 		case HAMMER2_PFSTYPE_SUPROOT:
560 			/*
561 			 * spmp (degenerate case)
562 			 */
563 			KKASSERT(i == 0);
564 			cluster->focus_index = i;
565 			cluster->focus = chain;
566 			cluster->error = chain ? chain->error : 0;
567 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
568 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
569 			break;
570 		default:
571 			break;
572 		}
573 	}
574 
575 	/*
576 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
577 	 * is bad or if we are at the PFS root (the bref won't match at
578 	 * the PFS root, obviously).
579 	 */
580 	focus = cluster->focus;
581 	if (focus) {
582 		cluster->ddflag =
583 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
584 	} else {
585 		cluster->ddflag = 0;
586 		goto skip4;
587 	}
588 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
589 		goto skip4;
590 
591 	/*
592 	 * Pass 4
593 	 *
594 	 * Validate the elements that were not marked invalid.  They should
595 	 * match.
596 	 */
597 	for (i = 0; i < cluster->nchains; ++i) {
598 		int ddflag;
599 
600 		chain = cluster->array[i].chain;
601 
602 		if (chain == NULL)
603 			continue;
604 		if (chain == focus)
605 			continue;
606 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
607 			continue;
608 
609 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
610 		if (chain->bref.type != focus->bref.type ||
611 		    chain->bref.key != focus->bref.key ||
612 		    chain->bref.keybits != focus->bref.keybits ||
613 		    chain->bref.modify_tid != focus->bref.modify_tid ||
614 		    chain->bytes != focus->bytes ||
615 		    ddflag != cluster->ddflag) {
616 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
617 			if (hammer2_debug & 1)
618 			kprintf("cluster_resolve: matching modify_tid failed "
619 				"bref test: idx=%d type=%02x/%02x "
620 				"key=%016jx/%d-%016jx/%d "
621 				"mod=%016jx/%016jx bytes=%u/%u\n",
622 				i,
623 				chain->bref.type, focus->bref.type,
624 				chain->bref.key, chain->bref.keybits,
625 				focus->bref.key, focus->bref.keybits,
626 				chain->bref.modify_tid, focus->bref.modify_tid,
627 				chain->bytes, focus->bytes);
628 			if (hammer2_debug & 0x4000)
629 				panic("cluster_resolve");
630 			/* flag issue and force resync? */
631 		}
632 	}
633 skip4:
634 
635 	if (ttlslaves == 0)
636 		nflags |= HAMMER2_CLUSTER_NOSOFT;
637 	if (ttlmasters == 0)
638 		nflags |= HAMMER2_CLUSTER_NOHARD;
639 
640 	/*
641 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
642 	 * all available nodes (even if 0 are available) are fully
643 	 * synchronized.  This is used by the synchronization thread to
644 	 * determine if there is work it could potentially accomplish.
645 	 */
646 	if (nslaves == ttlslaves)
647 		nflags |= HAMMER2_CLUSTER_SSYNCED;
648 	if (nmasters == ttlmasters)
649 		nflags |= HAMMER2_CLUSTER_MSYNCED;
650 
651 	/*
652 	 * Determine if the cluster was successfully locked for the
653 	 * requested operation and generate an error code.  The cluster
654 	 * will not be locked (or ref'd) if an error is returned.
655 	 */
656 	atomic_set_int(&cluster->flags, nflags);
657 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
658 }
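
/*
 * Usage sketch (hypothetical caller, not compiled): after resolution the
 * cluster flags summarize what the quorum permits, e.g. WRHARD/RDHARD
 * for hard (MASTER) access and MSYNCED/SSYNCED when all available
 * masters/slaves are synchronized.
 */
#if 0
static void
hammer2_cluster_resolve_flags_sketch(hammer2_cluster_t *cluster)
{
	hammer2_cluster_resolve(cluster);
	if ((cluster->flags & HAMMER2_CLUSTER_WRHARD) == 0) {
		/* no quorum of up-to-date masters, hard writes not possible */
	}
	if (cluster->flags & HAMMER2_CLUSTER_MSYNCED) {
		/* every available master is fully synchronized */
	}
}
#endif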
659 
660 /*
661  * This is used by the XOPS subsystem to calculate the state of
662  * the collection and tell hammer2_xop_collect() what to do with it.
663  * The collection can be in various states of desynchronization, the
664  * The collection can be in various states of desynchronization; the
665  *
666  * Return values:
667  *	0		- Quorum agreement, key is valid
668  *
669  *	ENOENT		- Quorum agreement, end of scan
670  *
671  *	ESRCH		- Quorum agreement, key is INVALID (caller should
672  *			  skip key).
673  *
674  *	EIO		- Quorum agreement but all elements had errors.
675  *
676  *	EDEADLK		- No quorum agreement possible for key, a repair
677  *			  may be needed.  Caller has to decide what to do,
678  *			  possibly iterating the key or generating an EIO.
679  *
680  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
681  *			  possible if caller waits for more responses.  Caller
682  *			  should not iterate key.
683  *
684  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
685  *
686  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
687  */
688 int
689 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
690 {
691 	hammer2_chain_t *chain;
692 	hammer2_chain_t *focus;
693 	hammer2_pfs_t *pmp;
694 	hammer2_tid_t quorum_tid;
695 	hammer2_tid_t last_best_quorum_tid;
696 	uint32_t nflags;
697 	int ttlmasters;
698 	int ttlslaves;
699 	int nmasters;
700 	int nmasters_keymatch;
701 	int nslaves;
702 	int nquorum;
703 	int umasters;	/* unknown masters (still in progress) */
704 	int smpresent;
705 	int error;
706 	int i;
707 
708 	cluster->error = 0;
709 	cluster->focus = NULL;
710 
711 	pmp = cluster->pmp;
712 	KKASSERT(pmp != NULL || cluster->nchains == 0);
713 
714 	/*
715 	 * Calculate quorum
716 	 */
717 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
718 	smpresent = 0;
719 	nflags = 0;
720 	ttlmasters = 0;
721 	ttlslaves = 0;
722 
723 	/*
724 	 * Pass 1
725 	 *
726 	 * NOTE: A NULL chain is not necessarily an error, it could be
727 	 *	 e.g. a lookup failure or the end of an iteration.
728 	 *	 Process normally.
729 	 */
730 	for (i = 0; i < cluster->nchains; ++i) {
731 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
732 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
733 
734 		chain = cluster->array[i].chain;
735 		error = cluster->array[i].error;
736 		if (chain && error) {
737 			if (cluster->focus == NULL || cluster->focus == chain) {
738 				/* error will be overridden by valid focus */
739 				/* XXX */
740 			}
741 
742 			/*
743 			 * Must count total masters and slaves whether the
744 			 * chain is errored or not.
745 			 */
746 			switch (cluster->pmp->pfs_types[i]) {
747 			case HAMMER2_PFSTYPE_SUPROOT:
748 			case HAMMER2_PFSTYPE_MASTER:
749 				++ttlmasters;
750 				break;
751 			case HAMMER2_PFSTYPE_SLAVE:
752 				++ttlslaves;
753 				break;
754 			}
755 			continue;
756 		}
757 		switch (cluster->pmp->pfs_types[i]) {
758 		case HAMMER2_PFSTYPE_MASTER:
759 			++ttlmasters;
760 			break;
761 		case HAMMER2_PFSTYPE_SLAVE:
762 			++ttlslaves;
763 			break;
764 		case HAMMER2_PFSTYPE_SOFT_MASTER:
765 			nflags |= HAMMER2_CLUSTER_WRSOFT;
766 			nflags |= HAMMER2_CLUSTER_RDSOFT;
767 			smpresent = 1;
768 			break;
769 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
770 			nflags |= HAMMER2_CLUSTER_RDSOFT;
771 			break;
772 		case HAMMER2_PFSTYPE_SUPROOT:
773 			/*
774 			 * Degenerate cluster representing the super-root
775 			 * topology on a single device.  Fake stuff so
776 			 * cluster ops work as expected.
777 			 */
778 			++ttlmasters;
779 			nflags |= HAMMER2_CLUSTER_WRHARD;
780 			nflags |= HAMMER2_CLUSTER_RDHARD;
781 			cluster->focus_index = i;
782 			cluster->focus = chain;
783 			cluster->error = error;
784 			break;
785 		default:
786 			break;
787 		}
788 	}
789 
790 	/*
791 	 * Pass 2
792 	 *
793 	 * Resolve nmasters		- master nodes fully match
794 	 *
795 	 * Resolve umasters		- master nodes operation still
796 	 *				  in progress
797 	 *
798 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
799 	 *				  key and may or may not match
800 	 *				  the quorum-agreed tid.
801 	 *
802 	 * The quorum-agreed TID is the highest matching TID.
803 	 */
804 	last_best_quorum_tid = HAMMER2_TID_MAX;
805 	umasters = 0;
806 	nmasters = 0;
807 	nmasters_keymatch = 0;
808 	quorum_tid = 0;		/* fix gcc warning */
809 
810 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
811 		umasters = 0;
812 		nmasters = 0;
813 		nmasters_keymatch = 0;
814 		quorum_tid = 0;
815 
816 		for (i = 0; i < cluster->nchains; ++i) {
817 			/* XXX SOFT smpresent handling */
818 			switch(cluster->pmp->pfs_types[i]) {
819 			case HAMMER2_PFSTYPE_MASTER:
820 			case HAMMER2_PFSTYPE_SUPROOT:
821 				break;
822 			default:
823 				continue;
824 			}
825 
826 			chain = cluster->array[i].chain;
827 			error = cluster->array[i].error;
828 
829 			/*
830 			 * Skip elements still in progress.  umasters keeps
831 			 * track of masters that might still be in-progress.
832 			 */
833 			if (chain == NULL && (cluster->array[i].flags &
834 					      HAMMER2_CITEM_NULL) == 0) {
835 				++umasters;
836 				continue;
837 			}
838 
839 			/*
840 			 * Key match?
841 			 */
842 			if (flags & HAMMER2_CHECK_NULL) {
843 				if (chain == NULL) {
844 					++nmasters;
845 					++nmasters_keymatch;
846 					if (cluster->error == 0)
847 						cluster->error = error;
848 				}
849 			} else if (chain &&
850 				   (key == (hammer2_key_t)-1 ||
851 				    chain->bref.key == key)) {
852 				++nmasters_keymatch;
853 
854 				if (chain->bref.modify_tid <
855 				     last_best_quorum_tid &&
856 				    quorum_tid < chain->bref.modify_tid) {
857 					/*
858 					 * Select new TID as master if better
859 					 * than any found so far in this loop,
860 					 * as long as it does not reach the
861 					 * best tid found in the previous loop.
862 					 */
863 					nmasters = 0;
864 					quorum_tid = chain->bref.modify_tid;
865 				}
866 				if (quorum_tid == chain->bref.modify_tid) {
867 					/*
868 					 * TID matches current collection.
869 					 *
870 					 * (error handled in next pass)
871 					 */
872 					++nmasters;
873 					if (chain->error == 0) {
874 						cluster->focus = chain;
875 						cluster->focus_index = i;
876 					}
877 				}
878 			}
879 		}
880 		if (nmasters >= nquorum)
881 			break;
882 		last_best_quorum_tid = quorum_tid;
883 	}
884 
885 	/*
886 	kprintf("nmasters %d/%d nmasters_keymatch=%d umasters=%d\n",
887 		nmasters, nquorum, nmasters_keymatch, umasters);
888 	*/
889 
890 	/*
891 	 * Early return if we do not have enough masters.
892 	 */
893 	if (nmasters < nquorum) {
894 		if (nmasters + umasters >= nquorum)
895 			return HAMMER2_ERROR_EINPROGRESS;
896 		if (nmasters_keymatch < nquorum)
897 			return HAMMER2_ERROR_ESRCH;
898 		return HAMMER2_ERROR_EDEADLK;
899 	}
900 
901 	/*
902 	 * Validated end of scan.
903 	 */
904 	if (flags & HAMMER2_CHECK_NULL) {
905 		if (cluster->error == 0)
906 			cluster->error = HAMMER2_ERROR_ENOENT;
907 		return cluster->error;
908 	}
909 
910 	/*
911 	 * If we have a NULL focus at this point the agreeing quorum all
912 	 * had chain errors.
913 	 */
914 	if (cluster->focus == NULL)
915 		return HAMMER2_ERROR_EIO;
916 
917 	/*
918 	 * Pass 3
919 	 *
920 	 * We have quorum agreement, validate elements, not end of scan.
921 	 */
922 	nslaves = 0;
923 	cluster->error = 0;
924 
925 	for (i = 0; i < cluster->nchains; ++i) {
926 		chain = cluster->array[i].chain;
927 		error = cluster->array[i].error;
928 		if (chain == NULL ||
929 		    chain->bref.key != key ||
930 		    chain->bref.modify_tid != quorum_tid) {
931 			continue;
932 		}
933 
934 		/*
935 		 * Quorum Match
936 		 *
937 		 * XXX for now, cumulative error.
938 		 */
939 		if (cluster->error == 0)
940 			cluster->error = error;
941 
942 		switch (cluster->pmp->pfs_types[i]) {
943 		case HAMMER2_PFSTYPE_MASTER:
944 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
945 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
946 			nflags |= HAMMER2_CLUSTER_WRHARD;
947 			nflags |= HAMMER2_CLUSTER_RDHARD;
948 			break;
949 		case HAMMER2_PFSTYPE_SLAVE:
950 			/*
951 			 * We must have enough up-to-date masters to reach
952 			 * a quorum and the slave modify_tid must match the
953 			 * quorum's modify_tid.
954 			 *
955 			 * Do not select an errored slave.
956 			 */
957 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
958 			nflags |= HAMMER2_CLUSTER_RDHARD;
959 			++nslaves;
960 			break;
961 		case HAMMER2_PFSTYPE_SOFT_MASTER:
962 			/*
963 			 * Directly mounted soft master always wins.  There
964 			 * should be only one.
965 			 */
966 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
967 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
968 			break;
969 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
970 			/*
971 			 * Directly mounted soft slave always wins.  There
972 			 * should be only one.
973 			 *
974 			 * XXX
975 			 */
976 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
977 			break;
978 		case HAMMER2_PFSTYPE_SUPROOT:
979 			/*
980 			 * spmp (degenerate case)
981 			 */
982 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
983 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
984 			nflags |= HAMMER2_CLUSTER_WRHARD;
985 			nflags |= HAMMER2_CLUSTER_RDHARD;
986 			break;
987 		default:
988 			break;
989 		}
990 	}
991 
992 	/*
993 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
994 	 * is bad or if we are at the PFS root (the bref won't match at
995 	 * the PFS root, obviously).
996 	 *
997 	 * focus is probably not locked and it isn't safe to test its
998 	 * content (e.g. focus->data, focus->dio, other content).  We
999 	 * do not synchronize the dio to the cpu here.  In fact, in numerous
1000 	 * situations the frontend doesn't even need to access its dio/data,
1001 	 * so synchronizing it here would be wasteful.
1002 	 */
1003 	focus = cluster->focus;
1004 	if (focus) {
1005 		cluster->ddflag =
1006 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1007 	} else {
1008 		cluster->ddflag = 0;
1009 		goto skip4;
1010 	}
1011 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1012 		goto skip4;
1013 
1014 	/*
1015 	 * Pass 4
1016 	 *
1017 	 * Validate the elements that were not marked invalid.  They should
1018 	 * match.
1019 	 */
1020 	for (i = 0; i < cluster->nchains; ++i) {
1021 		int ddflag;
1022 
1023 		chain = cluster->array[i].chain;
1024 
1025 		if (chain == NULL)
1026 			continue;
1027 		if (chain == focus)
1028 			continue;
1029 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1030 			continue;
1031 
1032 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1033 		if (chain->bref.type != focus->bref.type ||
1034 		    chain->bref.key != focus->bref.key ||
1035 		    chain->bref.keybits != focus->bref.keybits ||
1036 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1037 		    chain->bytes != focus->bytes ||
1038 		    ddflag != cluster->ddflag) {
1039 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1040 			if (hammer2_debug & 1)
1041 			kprintf("cluster_resolve: matching modify_tid failed "
1042 				"bref test: idx=%d type=%02x/%02x "
1043 				"key=%016jx/%d-%016jx/%d "
1044 				"mod=%016jx/%016jx bytes=%u/%u\n",
1045 				i,
1046 				chain->bref.type, focus->bref.type,
1047 				chain->bref.key, chain->bref.keybits,
1048 				focus->bref.key, focus->bref.keybits,
1049 				chain->bref.modify_tid, focus->bref.modify_tid,
1050 				chain->bytes, focus->bytes);
1051 			if (hammer2_debug & 0x4000)
1052 				panic("cluster_resolve");
1053 			/* flag issue and force resync? */
1054 		}
1055 	}
1056 skip4:
1057 
1058 	if (ttlslaves == 0)
1059 		nflags |= HAMMER2_CLUSTER_NOSOFT;
1060 	if (ttlmasters == 0)
1061 		nflags |= HAMMER2_CLUSTER_NOHARD;
1062 
1063 	/*
1064 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1065 	 * all available nodes (even if 0 are available) are fully
1066 	 * synchronized.  This is used by the synchronization thread to
1067 	 * determine if there is work it could potentially accomplish.
1068 	 */
1069 	if (nslaves == ttlslaves)
1070 		nflags |= HAMMER2_CLUSTER_SSYNCED;
1071 	if (nmasters == ttlmasters)
1072 		nflags |= HAMMER2_CLUSTER_MSYNCED;
1073 
1074 	/*
1075 	 * Determine if the cluster was successfully locked for the
1076 	 * requested operation and generate an error code.  The cluster
1077 	 * will not be locked (or ref'd) if an error is returned.
1078 	 */
1079 	atomic_set_int(&cluster->flags, nflags);
1080 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1081 
1082 	return cluster->error;
1083 }
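
/*
 * Usage sketch (hypothetical collector fragment, not compiled): a
 * caller such as hammer2_xop_collect() interprets the return value as
 * documented above.
 */
#if 0
static int
hammer2_cluster_check_sketch(hammer2_xop_head_t *xop, hammer2_key_t key)
{
	int error;

	error = hammer2_cluster_check(&xop->cluster, key, 0);
	switch (error) {
	case 0:				/* quorum agreement, key is valid */
		break;
	case HAMMER2_ERROR_EINPROGRESS:	/* wait for more node responses */
		break;
	case HAMMER2_ERROR_ESRCH:	/* quorum says skip this key */
		break;
	case HAMMER2_ERROR_EDEADLK:	/* no agreement; repair or EIO */
		break;
	case HAMMER2_ERROR_ENOENT:	/* quorum-agreed end of scan */
		break;
	default:			/* e.g. HAMMER2_ERROR_EIO */
		break;
	}
	return (error);
}
#endif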
1084 
1085 /*
1086  * This is used by the sync thread to force non-NULL elements of a copy
1087  * of the pmp->iroot cluster to be good, which is required to prime the
1088  * sync.
1089  */
1090 void
1091 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1092 {
1093 	int i;
1094 
1095 	for (i = 0; i < cluster->nchains; ++i) {
1096 		if (cluster->array[i].chain)
1097 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1098 	}
1099 }
1100 
1101 /*
1102  * Unlock a cluster.  Refcount and focus are maintained.
1103  */
1104 void
1105 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1106 {
1107 	hammer2_chain_t *chain;
1108 	int i;
1109 
1110 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1111 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1112 			cluster);
1113 	}
1114 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1115 	KKASSERT(cluster->refs > 0);
1116 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1117 
1118 	for (i = 0; i < cluster->nchains; ++i) {
1119 		chain = cluster->array[i].chain;
1120 		if (chain)
1121 			hammer2_chain_unlock(chain);
1122 	}
1123 }
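
/*
 * Usage sketch (hypothetical caller, not compiled): unlocking leaves the
 * refcount intact, so a final teardown pairs the unlock with a drop of
 * the caller's reference.
 */
#if 0
static void
hammer2_cluster_put_sketch(hammer2_cluster_t *cluster)
{
	hammer2_cluster_unlock(cluster);
	hammer2_cluster_drop(cluster);
}
#endif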
1124