xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision 1448a966161a9420da0adf26a910473e9202cbbc)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54 
55 #include "hammer2.h"
56 
57 /*
58  * Returns TRUE if any chain in the cluster needs to be resized.
59  */
60 int
61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
62 {
63 	hammer2_chain_t *chain;
64 	int i;
65 
66 	for (i = 0; i < cluster->nchains; ++i) {
67 		chain = cluster->array[i];
68 		if (chain && chain->bytes != bytes)
69 			return 1;
70 	}
71 	return 0;
72 }
73 
74 uint8_t
75 hammer2_cluster_type(hammer2_cluster_t *cluster)
76 {
77 	return(cluster->focus->bref.type);
78 }
79 
80 int
81 hammer2_cluster_modified(hammer2_cluster_t *cluster)
82 {
83 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
84 }
85 
86 /*
87  * Return a bref representative of the cluster.  Any data offset is removed
88  * (since it would only be applicable to a particular chain in the cluster).
89  *
90  * However, the radix portion of data_off is used for many purposes and will
91  * be retained.
92  */
93 void
94 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
95 {
96 	*bref = cluster->focus->bref;
97 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
98 }
99 
100 /*
101  * Return non-zero if the chain representing an inode has been flagged
102  * as having been unlinked.  Allows the vnode reclaim to avoid loading
103  * the inode data from disk e.g. when unmount or recycling old, clean
104  * vnodes.
105  */
106 int
107 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
108 {
109 	hammer2_chain_t *chain;
110 	int flags;
111 	int i;
112 
113 	flags = 0;
114 	for (i = 0; i < cluster->nchains; ++i) {
115 		if ((chain = cluster->array[i]) != NULL)
116 			flags |= chain->flags;
117 	}
118 	return (flags & HAMMER2_CHAIN_UNLINKED);
119 }
120 
121 void
122 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
123 {
124 	hammer2_chain_t *chain;
125 	int i;
126 
127 	for (i = 0; i < cluster->nchains; ++i) {
128 		chain = cluster->array[i];
129 		if (chain)
130 			atomic_set_int(&chain->flags, flags);
131 	}
132 }
133 
134 void
135 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
136 {
137 	hammer2_chain_t *chain;
138 	int i;
139 
140 	for (i = 0; i < cluster->nchains; ++i) {
141 		chain = cluster->array[i];
142 		if (chain)
143 			atomic_clear_int(&chain->flags, flags);
144 	}
145 }
146 
147 void
148 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
149 {
150 	hammer2_chain_t *chain;
151 	int i;
152 
153 	for (i = 0; i < cluster->nchains; ++i) {
154 		chain = cluster->array[i];
155 		if (chain)
156 			hammer2_chain_setflush(trans, chain);
157 	}
158 }
159 
160 void
161 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
162 				hammer2_cluster_t *cluster,
163 				int check_algo)
164 {
165 	hammer2_chain_t *chain;
166 	int i;
167 
168 	for (i = 0; i < cluster->nchains; ++i) {
169 		chain = cluster->array[i];
170 		if (chain) {
171 			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
172 			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
173 			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
174 		}
175 	}
176 }
177 
178 /*
179  * Create a cluster with one ref from the specified chain.  The chain
180  * is not further referenced.  The caller typically supplies a locked
181  * chain and transfers ownership to the cluster.
182  */
183 hammer2_cluster_t *
184 hammer2_cluster_from_chain(hammer2_chain_t *chain)
185 {
186 	hammer2_cluster_t *cluster;
187 
188 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
189 	cluster->array[0] = chain;
190 	cluster->nchains = 1;
191 	cluster->focus = chain;
192 	cluster->pmp = chain->pmp;
193 	cluster->refs = 1;
194 
195 	return cluster;
196 }
197 
/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref.
 *
 * One chain is allocated per chain present in pmp->iroot's cluster, so
 * pmp must be non-NULL and its iroot cluster populated.  Only leaf-ish
 * bref types are legal here; volume/freemap roots panic.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
#if 0
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.  The switch only
	 * validates bref->type; the panics do not return.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chains are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	/*
	 * Mirror the topology of the PFS root cluster: allocate one chain
	 * on each hmp backing the mount.
	 */
	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
					    pmp, trans, bref);
#if 0
		chain->hmp = rcluster->array[i]->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

		/*
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		cluster->array[i] = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0];

	return (cluster);
}
270 
271 /*
272  * Add a reference to a cluster.
273  *
274  * We must also ref the underlying chains in order to allow ref/unlock
275  * sequences to later re-lock.
276  */
277 void
278 hammer2_cluster_ref(hammer2_cluster_t *cluster)
279 {
280 	hammer2_chain_t *chain;
281 	int i;
282 
283 	atomic_add_int(&cluster->refs, 1);
284 	for (i = 0; i < cluster->nchains; ++i) {
285 		chain = cluster->array[i];
286 		if (chain)
287 			hammer2_chain_ref(chain);
288 	}
289 }
290 
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * If we hold the last ref the cluster is about to
			 * be freed; NULL out the slot as a safety against
			 * stale access.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;
		}
	}
	/* atomic decrement; last ref frees the cluster structure */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
320 
/*
 * Sleep briefly (one tick) on the cluster's focus chain.  Used by callers
 * to back off before retrying an operation on the cluster.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
326 
327 /*
328  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
329  * and then locks them.
330  */
331 int
332 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
333 {
334 	hammer2_chain_t *chain;
335 	int i;
336 	int error;
337 
338 	error = 0;
339 	atomic_add_int(&cluster->refs, 1);
340 	for (i = 0; i < cluster->nchains; ++i) {
341 		chain = cluster->array[i];
342 		if (chain) {
343 			error = hammer2_chain_lock(chain, how);
344 			if (error) {
345 				while (--i >= 0)
346 					hammer2_chain_unlock(cluster->array[i]);
347 				atomic_add_int(&cluster->refs, -1);
348 				break;
349 			}
350 		}
351 	}
352 	return error;
353 }
354 
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Transfer src chains into dst, unlocking any dst chain being
	 * displaced.  Focus is set to the first non-NULL chain copied.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the leftover slots.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
393 
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Lock each src chain on behalf of dst, then unlock and displace
	 * any dst chain occupying the slot.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * Unlock and clear any leftover dst slots beyond src's width.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
432 
433 /*
434  * Copy a cluster, returned a ref'd cluster.  All underlying chains
435  * are also ref'd, but not locked.
436  *
437  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
438  * to the new cluster and a reference is nominally added to them and to
439  * the cluster.  The cluster will have 1 ref.
440  *
441  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
442  * are copied but no additional references are made and the cluster will have
443  * 0 refs.  Callers must ref the cluster and the chains before dropping it
444  * (typically by locking it).
445  *
446  * If flags are passed as 0 the copy is setup as if it contained the chains
447  * but the chains will not be copied over, and the cluster will have 0 refs.
448  * Callers must ref the cluster before dropping it (typically by locking it).
449  */
450 hammer2_cluster_t *
451 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
452 {
453 	hammer2_pfsmount_t *pmp = ocluster->pmp;
454 	hammer2_cluster_t *ncluster;
455 	hammer2_chain_t *chain;
456 	int i;
457 
458 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
459 	ncluster->pmp = pmp;
460 	ncluster->nchains = ocluster->nchains;
461 	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
462 	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
463 		ncluster->focus = ocluster->focus;
464 		for (i = 0; i < ocluster->nchains; ++i) {
465 			chain = ocluster->array[i];
466 			ncluster->array[i] = chain;
467 			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
468 			    chain) {
469 				hammer2_chain_ref(chain);
470 			}
471 		}
472 	}
473 	return (ncluster);
474 }
475 
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * On the final ref, clear the slot so nothing can
			 * see a stale chain pointer after the free below.
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	/* atomic decrement; last ref frees the cluster structure */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
501 
502 /*
503  * Resize the cluster's physical storage allocation in-place.  This may
504  * replace the cluster's chains.
505  */
506 void
507 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
508 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
509 		       int nradix, int flags)
510 {
511 	int i;
512 
513 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
514 	KKASSERT(cparent->nchains == cluster->nchains);
515 
516 	cluster->focus = NULL;
517 	for (i = 0; i < cluster->nchains; ++i) {
518 		if (cluster->array[i]) {
519 			KKASSERT(cparent->array[i]);
520 			hammer2_chain_resize(trans, ip,
521 					     cparent->array[i],
522 					     cluster->array[i],
523 					     nradix, flags);
524 			if (cluster->focus == NULL)
525 				cluster->focus = cluster->array[i];
526 		}
527 	}
528 }
529 
/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in cluster is a localized copy of the cluster previously
 * acquired when the inode was locked (and possibly replaced in the mean
 * time), and must also be updated.  In fact, we update it first and then
 * synchronize the inode's cluster cache.
 *
 * Returns a pointer to the now-writable inode data of the cluster focus.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/* resynchronize the inode's cached cluster with the modified one */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}
551 
552 /*
553  * Adjust the cluster's chains to allow modification and adjust the
554  * focus.  Data will be accessible on return.
555  */
556 void
557 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
558 		       int flags)
559 {
560 	int i;
561 
562 	cluster->focus = NULL;
563 	for (i = 0; i < cluster->nchains; ++i) {
564 		if (cluster->array[i]) {
565 			hammer2_chain_modify(trans, cluster->array[i], flags);
566 			if (cluster->focus == NULL)
567 				cluster->focus = cluster->array[i];
568 		}
569 	}
570 }
571 
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i];
		if (scan == NULL || scan == focus)
			continue;
		/* all chains must already be modified and shape-identical */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Inode with a blockset in u: copy only the
				 * meta-data portion, the block table is
				 * managed per-chain.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through - direct-data inode copies whole */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
629 
630 /*
631  * Lookup initialization/completion API
632  */
633 hammer2_cluster_t *
634 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
635 {
636 	hammer2_cluster_t *cluster;
637 	int i;
638 
639 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
640 	cluster->pmp = cparent->pmp;			/* can be NULL */
641 	/* cluster->focus = NULL; already null */
642 
643 	for (i = 0; i < cparent->nchains; ++i) {
644 		cluster->array[i] = cparent->array[i];
645 		if (cluster->focus == NULL)
646 			cluster->focus = cluster->array[i];
647 	}
648 	cluster->nchains = cparent->nchains;
649 
650 	/*
651 	 * Independently lock (this will also give cluster 1 ref)
652 	 */
653 	if (flags & HAMMER2_LOOKUP_SHARED) {
654 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
655 					      HAMMER2_RESOLVE_SHARED);
656 	} else {
657 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
658 	}
659 	return (cluster);
660 }
661 
662 void
663 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
664 {
665 	if (cparent)
666 		hammer2_cluster_unlock(cparent);
667 }
668 
/*
 * Locate first match or overlap under parent, return a new cluster.
 *
 * The per-chain lookups are accumulated; *key_nextp is set to the lowest
 * key_next reported by any chain.  All matching chains must agree on the
 * bref type/key/keybits/bytes and the direct-data flag (asserted).
 * Returns NULL if no chain produced a match.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i] == NULL) {
			++null_count;
			continue;
		}
		/*
		 * Per-chain lookup; may replace cparent->array[i] as the
		 * chain iterates downward.
		 */
		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
					     key_beg, key_end,
					     &cparent->cache_index[i],
					     flags, &ddflag);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit establishes the expected bref identity;
			 * all later hits must match it exactly.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* track the minimum next-key over all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* nothing found on any chain: release the empty cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
748 
/*
 * Locate next match or overlap under parent, replace cluster.
 *
 * Iterates each chain in (cluster) forward under the corresponding parent
 * chain.  *key_nextp is set to the lowest key_next over all chains.
 * Returns NULL (after dropping the cluster) if no chain has a next
 * element.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i];
		if (chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			++null_count;
			continue;
		}
		/*
		 * No parent to iterate under: dispose of the chain per the
		 * lock/ref mode selected by the lookup flags.
		 */
		if (cparent->array[i] == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i], chain,
					   &key_next, key_beg, key_end,
					   &cparent->cache_index[i], flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* track the minimum next-key over all chains */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* iteration exhausted on every chain: drop and signal termination */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
806 
807 #if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 *
 * NOTE: This function is currently disabled (#if 0) pending the rework
 *	 noted above.
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i];
		if (chain == NULL) {
			++null_count;
			continue;
		}
		/*
		 * No parent chain to scan under: dispose of the chain per
		 * the lock/ref mode selected by the flags.
		 */
		if (cparent->array[i] == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i], chain,
					   &cparent->cache_index[i], flags);
		cluster->array[i] = chain;
		if (chain == NULL)
			++null_count;
	}

	/* scan exhausted on every chain: drop and signal termination */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
858 
859 #endif
860 
/*
 * Create a new cluster using the specified key.
 *
 * If *clusterp is NULL a new cluster with one ref is allocated and a chain
 * is created for every chain in cparent.  If *clusterp is supplied, NULL
 * entries in its array are skipped and only existing chains are (re)created.
 * Returns 0 on success (chain creation failures assert).
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		     hammer2_cluster_t **clusterp,
		     hammer2_key_t key, int keybits,
		     int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfsmount_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i] == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i],
					     &cluster->array[i], pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}
912 
913 /*
914  * Rename a cluster to a new parent.
915  *
916  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
917  *	    are used from a passed-in non-NULL bref pointer.  All other fields
918  *	    are extracted from the original chain for each chain in the
919  *	    iteration.
920  */
921 void
922 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
923 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
924 		       int flags)
925 {
926 	hammer2_chain_t *chain;
927 	hammer2_blockref_t xbref;
928 	int i;
929 
930 	cluster->focus = NULL;
931 	cparent->focus = NULL;
932 
933 	for (i = 0; i < cluster->nchains; ++i) {
934 		chain = cluster->array[i];
935 		if (chain) {
936 			if (bref) {
937 				xbref = chain->bref;
938 				xbref.key = bref->key;
939 				xbref.keybits = bref->keybits;
940 				hammer2_chain_rename(trans, &xbref,
941 						     &cparent->array[i],
942 						     chain, flags);
943 			} else {
944 				hammer2_chain_rename(trans, NULL,
945 						     &cparent->array[i],
946 						     chain, flags);
947 			}
948 			cluster->array[i] = chain;
949 			if (cluster->focus == NULL)
950 				cluster->focus = chain;
951 			if (cparent->focus == NULL)
952 				cparent->focus = cparent->array[i];
953 		} else {
954 			if (cparent->focus == NULL)
955 				cparent->focus = cparent->array[i];
956 		}
957 	}
958 }
959 
960 /*
961  * Mark a cluster deleted
962  */
963 void
964 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
965 		       hammer2_cluster_t *cluster, int flags)
966 {
967 	hammer2_chain_t *chain;
968 	hammer2_chain_t *parent;
969 	int i;
970 
971 	if (cparent == NULL) {
972 		kprintf("cparent is NULL\n");
973 		return;
974 	}
975 
976 	for (i = 0; i < cluster->nchains; ++i) {
977 		parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
978 		chain = cluster->array[i];
979 		if (chain == NULL)
980 			continue;
981 		if (chain->parent != parent) {
982 			kprintf("hammer2_cluster_delete: parent "
983 				"mismatch chain=%p parent=%p against=%p\n",
984 				chain, chain->parent, parent);
985 		} else {
986 			hammer2_chain_delete(trans, parent, chain, flags);
987 		}
988 	}
989 }
990 
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 *
 * Returns 0 on success or the error from hammer2_inode_create().
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
	opfs_clid = ripdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		/* reuse clid only when snapshotting a PFS root */
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		/* mark each new chain as a PFS root in its bref */
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i]) {
				ncluster->array[i]->bref.flags |=
				    HAMMER2_BREF_FLAG_PFSROOT;
			}
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i])
				hammer2_flush(trans, ncluster->array[i]);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}
1083 
1084 /*
1085  * Return locked parent cluster given a locked child.  The child remains
1086  * locked on return.
1087  */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	/*
	 * Start with an empty copy of the cluster (no chains); each slot
	 * is filled in below with the locked parent of the corresponding
	 * child chain.
	 */
	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
	for (i = 0; i < cluster->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		/* hold the child across the temporary unlock below */
		hammer2_chain_ref(chain);
		while ((rchain = chain->parent) != NULL) {
			/*
			 * Lock-order dance: parents must be locked before
			 * children, so ref the parent, drop the child lock,
			 * lock parent then child, then drop our temporary
			 * parent ref (the lock itself retains a ref).
			 */
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			/*
			 * The child's parent may have changed while the
			 * child was unlocked; if so, unlock and retry with
			 * the new parent.
			 */
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		hammer2_chain_drop(chain);
		/* rchain is NULL if the child had no parent (topology root) */
		cparent->array[i] = rchain;
	}
	return cparent;
}
1118 
1119 /************************************************************************
1120  *			        CLUSTER I/O 				*
1121  ************************************************************************
1122  *
1123  *
1124  * WARNING! blockref[] array data is not universal.  These functions should
1125  *	    only be used to access universal data.
1126  *
1127  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1128  *	    complete if necessary.  The I/O's should have already been
1129  *	    initiated by the cluster_lock/chain_lock operation.
1130  *
1131  *	    The cluster must already be in a modified state before wdata
1132  *	    is called.  The data will already be available for this case.
1133  */
1134 const hammer2_media_data_t *
1135 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1136 {
1137 	return(cluster->focus->data);
1138 }
1139 
1140 hammer2_media_data_t *
1141 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1142 {
1143 	KKASSERT(hammer2_cluster_modified(cluster));
1144 	return(cluster->focus->data);
1145 }
1146 
1147 /*
1148  * Load async into independent buffer - used to load logical buffers from
1149  * underlying device data.  The callback is made for the first validated
1150  * data found, or NULL if no valid data is available.
1151  *
1152  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1153  *	 in the inode with an exclusive lock held), the chain structure may be
1154  *	 shared.
1155  */
1156 void
1157 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1158 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1159 {
1160 	hammer2_chain_t *chain;
1161 	hammer2_iocb_t *iocb;
1162 	hammer2_mount_t *hmp;
1163 	hammer2_blockref_t *bref;
1164 	int i;
1165 
1166 	/*
1167 	 * Try to find a chain whos data is already resolved.  If none can
1168 	 * be found, start with the first chain.
1169 	 */
1170 	chain = NULL;
1171 	for (i = 0; i < cluster->nchains; ++i) {
1172 		chain = cluster->array[i];
1173 		if (chain && chain->data)
1174 			break;
1175 	}
1176 	if (i == cluster->nchains) {
1177 		chain = cluster->array[0];
1178 		i = 0;
1179 	}
1180 
1181 	iocb = &cluster->iocb;
1182 	iocb->callback = callback;
1183 	iocb->dio = NULL;		/* for already-validated case */
1184 	iocb->cluster = cluster;
1185 	iocb->chain = chain;
1186 	iocb->ptr = ptr;
1187 	iocb->lbase = (off_t)i;
1188 	iocb->flags = 0;
1189 	iocb->error = 0;
1190 
1191 	/*
1192 	 * Data already validated
1193 	 */
1194 	if (chain->data) {
1195 		callback(iocb);
1196 		return;
1197 	}
1198 
1199 	/*
1200 	 * We must resolve to a device buffer, either by issuing I/O or
1201 	 * by creating a zero-fill element.  We do not mark the buffer
1202 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1203 	 * API must still be used to do that).
1204 	 *
1205 	 * The device buffer is variable-sized in powers of 2 down
1206 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1207 	 * chunk always contains buffers of the same size. (XXX)
1208 	 *
1209 	 * The minimum physical IO size may be larger than the variable
1210 	 * block size.
1211 	 */
1212 	bref = &chain->bref;
1213 	hmp = chain->hmp;
1214 
1215 #if 0
1216 	/* handled by callback? <- TODO XXX even needed for loads? */
1217 	/*
1218 	 * The getblk() optimization for a 100% overwrite can only be used
1219 	 * if the physical block size matches the request.
1220 	 */
1221 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1222 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1223 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1224 		KKASSERT(error == 0);
1225 		iocb->dio = dio;
1226 		callback(iocb);
1227 		return;
1228 	}
1229 #endif
1230 
1231 	/*
1232 	 * Otherwise issue a read
1233 	 */
1234 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1235 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1236 }
1237 
1238 /************************************************************************
1239  *			    NODE FAILURES 				*
1240  ************************************************************************
1241  *
1242  * A node failure can occur for numerous reasons.
1243  *
1244  *	- A read I/O may fail
1245  *	- A write I/O may fail
1246  *	- An unexpected chain might be found (or be missing)
1247  *	- A node might disconnect temporarily and reconnect later
1248  *	  (for example, a USB stick could get pulled, or a node might
1249  *	  be programmatically disconnected).
1250  *	- A node might run out of space during a modifying operation.
1251  *
1252  * When a read failure or an unexpected chain state is found, the chain and
1253  * parent chain at the failure point for the nodes involved (the nodes
1254  * which we determine to be in error) are flagged as failed and removed
1255  * from the cluster.  The node itself is allowed to remain active.  The
1256  * highest common point (usually a parent chain) is queued to the
1257  * resynchronization thread for action.
1258  *
1259  * When a write I/O fails or a node runs out of space, we first adjust
1260  * as if a read failure occurs but we further disable flushes on the
1261  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1262  * but any new modifying transactions will automatically remove the node
1263  * from consideration in all related cluster structures and not generate
1264  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1265  * to the resynchronization thread for action.
1266  *
1267  * A temporary disconnect is handled as if a write failure occurred.
1268  *
1269  * Any of these failures might or might not stall related high level VNOPS,
1270  * depending on what has failed, what nodes remain, the type of cluster,
1271  * and the operating state of the cluster.
1272  *
1273  *			    FLUSH ON WRITE-DISABLED NODES
1274  *
1275  * A flush on a write-disabled node is not allowed to write anything because
1276  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1277  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1278  * Dirty meta-data related to the failed node is thrown away.
1279  *
1280  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1281  * retired... that is, if the filesystem still has enough nodes to complete
1282  * the operation.
1283  */
1284 
1285 /************************************************************************
1286  *			SYNCHRONIZATION THREAD				*
1287  ************************************************************************
1288  *
1289  * This thread is responsible for [re]synchronizing the cluster representing
1290  * a PFS.  Any out-of-sync or failed node starts this thread on a
1291  * node-by-node basis when the failure is detected.
1292  *
1293  * Clusters needing resynchronization are queued at the highest point
1294  * where the parent on the failed node is still valid, or a special
1295  * incremental scan from the ROOT is queued if no parent exists.  This
1296  * thread is also responsible for waiting for reconnections of the failed
1297  * node if the cause was due to a disconnect, and waiting for space to be
1298  * freed up if the cause was due to running out of space.
1299  *
1300  * If the cause is due to a node running out of space, this thread will also
1301  * remove older (unlocked) snapshots to make new space, recover space, and
1302  * then start resynchronization.
1303  *
1304  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1305  * and synchronizes using that snapshot against the target node.  This
1306  * ensures a consistent chain topology and also avoids interference between
1307  * the resynchronization thread and frontend operations.
1308  *
1309  * Since these are per-node threads it is possible to resynchronize several
1310  * nodes at once.
1311  */
1312