xref: /dflybsd-src/sys/vfs/hammer2/hammer2_cluster.c (revision 79eafdd799bd34e8b1ec1dc221be9389c7f2b162)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54 
55 #include "hammer2.h"
56 
57 /*
58  * Returns TRUE if any chain in the cluster needs to be resized.
59  */
60 int
61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
62 {
63 	hammer2_chain_t *chain;
64 	int i;
65 
66 	for (i = 0; i < cluster->nchains; ++i) {
67 		chain = cluster->array[i];
68 		if (chain && chain->bytes != bytes)
69 			return 1;
70 	}
71 	return 0;
72 }
73 
74 uint8_t
75 hammer2_cluster_type(hammer2_cluster_t *cluster)
76 {
77 	return(cluster->focus->bref.type);
78 }
79 
80 /*
81  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
82  *	 and hammer2_cluster_modsync(), remember that block array
83  *	 entries are not copied to the elements of the cluster.
84  */
85 const hammer2_media_data_t *
86 hammer2_cluster_data(hammer2_cluster_t *cluster)
87 {
88 	return(cluster->focus->data);
89 }
90 
91 hammer2_media_data_t *
92 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
93 {
94 	return(cluster->focus->data);
95 }
96 
97 int
98 hammer2_cluster_modified(hammer2_cluster_t *cluster)
99 {
100 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
101 }
102 
103 /*
104  * Return a bref representative of the cluster.  Any data offset is removed
105  * (since it would only be applicable to a particular chain in the cluster).
106  *
107  * However, the radix portion of data_off is used for many purposes and will
108  * be retained.
109  */
110 void
111 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
112 {
113 	*bref = cluster->focus->bref;
114 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
115 }
116 
117 void
118 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
119 {
120 	hammer2_chain_t *chain;
121 	int i;
122 
123 	for (i = 0; i < cluster->nchains; ++i) {
124 		chain = cluster->array[i];
125 		if (chain)
126 			atomic_set_int(&chain->flags, flags);
127 	}
128 }
129 
130 void
131 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
132 {
133 	hammer2_chain_t *chain;
134 	int i;
135 
136 	for (i = 0; i < cluster->nchains; ++i) {
137 		chain = cluster->array[i];
138 		if (chain)
139 			hammer2_chain_setflush(trans, chain);
140 	}
141 }
142 
143 void
144 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
145 				hammer2_cluster_t *cluster,
146 				int check_algo)
147 {
148 	hammer2_chain_t *chain;
149 	int i;
150 
151 	for (i = 0; i < cluster->nchains; ++i) {
152 		chain = cluster->array[i];
153 		if (chain) {
154 			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
155 			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
156 			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
157 		}
158 	}
159 }
160 
161 /*
162  * Create a cluster with one ref from the specified chain.  The chain
163  * is not further referenced.  The caller typically supplies a locked
164  * chain and transfers ownership to the cluster.
165  */
166 hammer2_cluster_t *
167 hammer2_cluster_from_chain(hammer2_chain_t *chain)
168 {
169 	hammer2_cluster_t *cluster;
170 
171 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
172 	cluster->array[0] = chain;
173 	cluster->nchains = 1;
174 	cluster->focus = chain;
175 	cluster->pmp = chain->pmp;
176 	cluster->refs = 1;
177 
178 	return cluster;
179 }
180 
/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref.
 *
 * One chain is allocated per element of the PFS root cluster (iroot), so
 * the new cluster mirrors the mount's node topology.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
#if 0
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
		/* NOT REACHED (panic does not return) */
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	/* one new chain per node in the PFS root cluster */
	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
					    pmp, trans, bref);
#if 0
		chain->hmp = rcluster->array[i]->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

		/*
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		cluster->array[i] = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0];

	return (cluster);
}
253 
254 /*
255  * Add a reference to a cluster.
256  *
257  * We must also ref the underlying chains in order to allow ref/unlock
258  * sequences to later re-lock.
259  */
260 void
261 hammer2_cluster_ref(hammer2_cluster_t *cluster)
262 {
263 	hammer2_chain_t *chain;
264 	int i;
265 
266 	atomic_add_int(&cluster->refs, 1);
267 	for (i = 0; i < cluster->nchains; ++i) {
268 		chain = cluster->array[i];
269 		if (chain)
270 			hammer2_chain_ref(chain);
271 	}
272 }
273 
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * If we hold the last ref no other thread can be
			 * accessing the array; NULL the slot so a stale
			 * chain pointer can never be revisited (safety).
			 */
			if (cluster->refs == 1)
				cluster->array[i] = NULL;
		}
	}
	/* free the cluster structure itself on last drop */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
299 
300 void
301 hammer2_cluster_wait(hammer2_cluster_t *cluster)
302 {
303 	tsleep(cluster->focus, 0, "h2clcw", 1);
304 }
305 
306 /*
307  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
308  * and then locks them.
309  */
310 int
311 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
312 {
313 	hammer2_chain_t *chain;
314 	int i;
315 	int error;
316 
317 	error = 0;
318 	atomic_add_int(&cluster->refs, 1);
319 	for (i = 0; i < cluster->nchains; ++i) {
320 		chain = cluster->array[i];
321 		if (chain) {
322 			error = hammer2_chain_lock(chain, how);
323 			if (error) {
324 				while (--i >= 0)
325 					hammer2_chain_unlock(cluster->array[i]);
326 				atomic_add_int(&cluster->refs, -1);
327 				break;
328 			}
329 		}
330 	}
331 	return error;
332 }
333 
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/* copy src's chains into dst, unlocking whatever dst slots held */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = chain;
			/* first non-NULL chain becomes the new focus */
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/* unlock and clear any trailing dst slots src does not cover */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
372 
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 *
 * NOTE(review): unlike hammer2_cluster_replace() this variant adds a lock
 * (not just a ref) to each copied chain.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			/* extra lock represents dst's hold on the chain */
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/* unlock and clear any trailing dst slots src does not cover */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
411 
412 /*
413  * Copy a cluster, returned a ref'd cluster.  All underlying chains
414  * are also ref'd, but not locked.
415  *
416  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
417  * to the new cluster and a reference is nominally added to them and to
418  * the cluster.  The cluster will have 1 ref.
419  *
420  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
421  * are copied but no additional references are made and the cluster will have
422  * 0 refs.  Callers must ref the cluster and the chains before dropping it
423  * (typically by locking it).
424  *
425  * If flags are passed as 0 the copy is setup as if it contained the chains
426  * but the chains will not be copied over, and the cluster will have 0 refs.
427  * Callers must ref the cluster before dropping it (typically by locking it).
428  */
429 hammer2_cluster_t *
430 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
431 {
432 	hammer2_pfsmount_t *pmp = ocluster->pmp;
433 	hammer2_cluster_t *ncluster;
434 	hammer2_chain_t *chain;
435 	int i;
436 
437 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
438 	ncluster->pmp = pmp;
439 	ncluster->nchains = ocluster->nchains;
440 	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
441 	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
442 		ncluster->focus = ocluster->focus;
443 		for (i = 0; i < ocluster->nchains; ++i) {
444 			chain = ocluster->array[i];
445 			ncluster->array[i] = chain;
446 			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
447 			    chain) {
448 				hammer2_chain_ref(chain);
449 			}
450 		}
451 	}
452 	return (ncluster);
453 }
454 
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			/* unlock also drops the chain's ref */
			hammer2_chain_unlock(chain);
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	/* free the cluster structure itself on last deref */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
480 
481 /*
482  * Resize the cluster's physical storage allocation in-place.  This may
483  * replace the cluster's chains.
484  */
485 void
486 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
487 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
488 		       int nradix, int flags)
489 {
490 	int i;
491 
492 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
493 	KKASSERT(cparent->nchains == cluster->nchains);
494 
495 	cluster->focus = NULL;
496 	for (i = 0; i < cluster->nchains; ++i) {
497 		if (cluster->array[i]) {
498 			KKASSERT(cparent->array[i]);
499 			hammer2_chain_resize(trans, ip,
500 					     cparent->array[i],
501 					     cluster->array[i],
502 					     nradix, flags);
503 			if (cluster->focus == NULL)
504 				cluster->focus = cluster->array[i];
505 		}
506 	}
507 }
508 
509 /*
510  * Set an inode's cluster modified, marking the related chains RW and
511  * duplicating them if necessary.
512  *
513  * The passed-in chain is a localized copy of the chain previously acquired
514  * when the inode was locked (and possilby replaced in the mean time), and
515  * must also be updated.  In fact, we update it first and then synchronize
516  * the inode's cluster cache.
517  */
518 hammer2_inode_data_t *
519 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
520 			  hammer2_cluster_t *cluster, int flags)
521 {
522 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
523 	hammer2_cluster_modify(trans, cluster, flags);
524 
525 	hammer2_inode_repoint(ip, NULL, cluster);
526 	if (ip->vp)
527 		vsetisdirty(ip->vp);
528 	return (&hammer2_cluster_wdata(cluster)->ipdata);
529 }
530 
531 /*
532  * Adjust the cluster's chains to allow modification.
533  */
534 void
535 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
536 		       int flags)
537 {
538 	int i;
539 
540 	cluster->focus = NULL;
541 	for (i = 0; i < cluster->nchains; ++i) {
542 		if (cluster->array[i]) {
543 			hammer2_chain_modify(trans, cluster->array[i], flags);
544 			if (cluster->focus == NULL)
545 				cluster->focus = cluster->array[i];
546 		}
547 	}
548 }
549 
/*
 * Synchronize modifications with other chains in a cluster.
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i];
		if (scan == NULL || scan == focus)
			continue;
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Copy only the fixed portion of the inode,
				 * excluding the u union which may hold a
				 * live blockset (see function comment).
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/*
			 * DIRECTDATA inodes embed file data in u, so the
			 * whole buffer can be copied like a DATA chain.
			 */
			/* fall through */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
605 
606 /*
607  * Lookup initialization/completion API
608  */
609 hammer2_cluster_t *
610 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
611 {
612 	hammer2_cluster_t *cluster;
613 	int i;
614 
615 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
616 	cluster->pmp = cparent->pmp;			/* can be NULL */
617 	/* cluster->focus = NULL; already null */
618 
619 	for (i = 0; i < cparent->nchains; ++i) {
620 		cluster->array[i] = cparent->array[i];
621 		if (cluster->focus == NULL)
622 			cluster->focus = cluster->array[i];
623 	}
624 	cluster->nchains = cparent->nchains;
625 
626 	/*
627 	 * Independently lock (this will also give cluster 1 ref)
628 	 */
629 	if (flags & HAMMER2_LOOKUP_SHARED) {
630 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
631 					      HAMMER2_RESOLVE_SHARED);
632 	} else {
633 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
634 	}
635 	return (cluster);
636 }
637 
638 void
639 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
640 {
641 	if (cparent)
642 		hammer2_cluster_unlock(cparent);
643 }
644 
/*
 * Locate first match or overlap under parent, return a new cluster
 * with one ref, or NULL if no element in any node matched.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	/* run the per-chain lookup on every node of the parent */
	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i] == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
					     key_beg, key_end,
					     &cparent->cache_index[i],
					     flags, &ddflag);
		/* the lookup may have replaced the parent chain */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * The first hit establishes the expected bref and
			 * size; all other nodes must agree or the cluster
			 * is inconsistent.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* the iteration restart key is the minimum over all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* all nodes missed: dispose of the empty cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
724 
/*
 * Locate next match or overlap under parent, replace cluster.
 * Returns NULL (after dropping the cluster) if no node produced a match.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i];
		if (chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/*
			 * No parent node to iterate under; release our hold
			 * on the previous chain (NOLOCK holds a ref only,
			 * otherwise a lock+ref).
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		/* consumes the old chain, returns the next one (or NULL) */
		chain = hammer2_chain_next(&cparent->array[i], chain,
					   &key_next, key_beg, key_end,
					   &cparent->cache_index[i], flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* the iteration restart key is the minimum over all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* all nodes exhausted: dispose of the cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
782 
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * NOTE(review): this function is currently disabled (#if 0) pending the
 * rework described above.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i];
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i], chain,
					   &cparent->cache_index[i], flags);
		cluster->array[i] = chain;
		if (chain == NULL)
			++null_count;
	}

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#endif
836 
/*
 * Create a new cluster using the specified key.
 *
 * If *clusterp is NULL a fresh cluster is allocated and a chain is created
 * for every node of the parent.  If *clusterp is supplied, only its
 * existing non-NULL slots are (re)created.  The resulting cluster is
 * returned via *clusterp.  Returns 0 (chain creation is asserted to
 * succeed).
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		     hammer2_cluster_t **clusterp,
		     hammer2_key_t key, int keybits,
		     int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfsmount_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i] == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i],
					     &cluster->array[i], pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		/* first non-NULL entries become the foci */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}
888 
/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
 *	    are used from a passed-in non-NULL bref pointer.  All other fields
 *	    are extracted from the original chain for each chain in the
 *	    iteration.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			if (bref) {
				/*
				 * Build a per-chain bref using the chain's
				 * own fields, overriding only key/keybits
				 * from the caller's bref.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_rename(trans, &xbref,
						     &cparent->array[i],
						     chain, flags);
			} else {
				hammer2_chain_rename(trans, NULL,
						     &cparent->array[i],
						     chain, flags);
			}
			cluster->array[i] = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		} else {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		}
	}
}
935 
/*
 * Mark a cluster deleted.
 *
 * Each chain is deleted against its corresponding parent chain in cparent.
 * Chains whose recorded parent does not match are skipped with a console
 * warning rather than deleted.
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		/* cparent may have fewer chains than cluster */
		parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		if (chain->parent != parent) {
			/* sanity check failed; warn instead of deleting */
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}
966 
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ipdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);

	/* remember the original PFS cluster id before creating the copy */
	ipdata = &hammer2_cluster_data(ocluster)->ipdata;
	opfs_clid = ipdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		/* share the clid when snapshotting a PFS root (see above) */
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		/* mark every node of the new cluster as a PFS root */
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i]) {
				ncluster->array[i]->bref.flags |=
				    HAMMER2_BREF_FLAG_PFSROOT;
			}
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i])
				hammer2_flush(trans, ncluster->array[i]);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}
1056 
1057 /*
1058  * Return locked parent cluster given a locked child.  The child remains
1059  * locked on return.
1060  */
1061 hammer2_cluster_t *
1062 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1063 {
1064 	hammer2_cluster_t *cparent;
1065 	int i;
1066 
1067 	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
1068 	for (i = 0; i < cluster->nchains; ++i) {
1069 		hammer2_chain_t *chain;
1070 		hammer2_chain_t *rchain;
1071 
1072 		chain = cluster->array[i];
1073 		if (chain == NULL)
1074 			continue;
1075 		hammer2_chain_ref(chain);
1076 		while ((rchain = chain->parent) != NULL) {
1077 			hammer2_chain_ref(rchain);
1078 			hammer2_chain_unlock(chain);
1079 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1080 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1081 			hammer2_chain_drop(rchain);
1082 			if (chain->parent == rchain)
1083 				break;
1084 			hammer2_chain_unlock(rchain);
1085 		}
1086 		hammer2_chain_drop(chain);
1087 		cparent->array[i] = rchain;
1088 	}
1089 	return cparent;
1090 }
1091 
1092 /************************************************************************
1093  *			    NODE FAILURES 				*
1094  ************************************************************************
1095  *
1096  * A node failure can occur for numerous reasons.
1097  *
1098  *	- A read I/O may fail
1099  *	- A write I/O may fail
1100  *	- An unexpected chain might be found (or be missing)
1101  *	- A node might disconnect temporarily and reconnect later
1102  *	  (for example, a USB stick could get pulled, or a node might
1103  *	  be programmatically disconnected).
1104  *	- A node might run out of space during a modifying operation.
1105  *
1106  * When a read failure or an unexpected chain state is found, the chain and
1107  * parent chain at the failure point for the nodes involved (the nodes
1108  * which we determine to be in error) are flagged as failed and removed
1109  * from the cluster.  The node itself is allowed to remain active.  The
1110  * highest common point (usually a parent chain) is queued to the
1111  * resynchronization thread for action.
1112  *
1113  * When a write I/O fails or a node runs out of space, we first adjust
1114  * as if a read failure occurs but we further disable flushes on the
1115  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1116  * but any new modifying transactions will automatically remove the node
1117  * from consideration in all related cluster structures and not generate
1118  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1119  * to the resynchronization thread for action.
1120  *
1121  * A temporary disconnect is handled as if a write failure occurred.
1122  *
1123  * Any of these failures might or might not stall related high level VNOPS,
1124  * depending on what has failed, what nodes remain, the type of cluster,
1125  * and the operating state of the cluster.
1126  *
1127  *			    FLUSH ON WRITE-DISABLED NODES
1128  *
1129  * A flush on a write-disabled node is not allowed to write anything because
1130  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1131  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1132  * Dirty meta-data related to the failed node is thrown away.
1133  *
1134  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1135  * retired... that is, if the filesystem still has enough nodes to complete
1136  * the operation.
1137  */
1138 
1139 /************************************************************************
1140  *			SYNCHRONIZATION THREAD				*
1141  ************************************************************************
1142  *
1143  * This thread is responsible for [re]synchronizing the cluster representing
1144  * a PFS.  Any out-of-sync or failed node starts this thread on a
1145  * node-by-node basis when the failure is detected.
1146  *
1147  * Clusters needing resynchronization are queued at the highest point
1148  * where the parent on the failed node is still valid, or a special
1149  * incremental scan from the ROOT is queued if no parent exists.  This
1150  * thread is also responsible for waiting for reconnections of the failed
1151  * node if the cause was due to a disconnect, and waiting for space to be
1152  * freed up if the cause was due to running out of space.
1153  *
1154  * If the cause is due to a node running out of space, this thread will also
1155  * remove older (unlocked) snapshots to make new space, recover space, and
1156  * then start resynchronization.
1157  *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
1162  *
1163  * Since these are per-node threads it is possible to resynchronize several
1164  * nodes at once.
1165  */
1166