xref: /dflybsd-src/sys/vfs/hammer2/hammer2_flush.c (revision ca86d83e7d8d6bfef814ef3683c37d99ad62f11c)
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include <sys/cdefs.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/lock.h>
41 #include <sys/uuid.h>
42 
43 #include "hammer2.h"
44 
45 /*
46  * Recursively flush the specified chain.  The chain is locked and
47  * referenced by the caller and will remain so on return.  The chain
48  * will remain referenced throughout but can temporarily lose its
49  * lock during the recursion to avoid unnecessarily stalling user
50  * processes.
51  */
/*
 * Per-flush state shared by hammer2_chain_flush() and its scan1/scan2
 * helpers (passed as the RB_SCAN data argument).  One instance lives on
 * the stack of the top-level flush call.
 */
struct hammer2_flush_info {
	hammer2_mount_t	*hmp;		/* mount being flushed */
	hammer2_chain_t *parent;	/* current parent in the recursion */
	hammer2_trans_t	*trans;		/* governing transaction */
	int		depth;		/* recursion depth, capped at
					   HAMMER2_FLUSH_DEPTH_LIMIT */
	int		diddeferral;	/* non-zero if a sub-chain was deferred */
	struct flush_deferral_list flush_list; /* chains deferred at depth limit */
	hammer2_tid_t	sync_tid;	/* flush synchronization point */
	hammer2_tid_t	mirror_tid;	/* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;
64 
65 static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
66 				hammer2_chain_t *chain);
67 static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
68 static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
69 
70 /*
71  * Transaction support functions for writing to the filesystem.
72  *
73  * Initializing a new transaction allocates a transaction ID.  We
74  * don't bother marking the volume header MODIFIED.  Instead, the volume
75  * header will be updated only if the operation actually makes modifications
76  * (which then propagate to the root).
77  *
78  * WARNING! Modifications to the root volume cannot dup the root volume
79  *	    header to handle synchronization points, so alloc_tid can
80  *	    wind up (harmlessly) more advanced on flush.
81  */
82 void
83 hammer2_trans_init(hammer2_trans_t *trans, hammer2_mount_t *hmp)
84 {
85 	bzero(trans, sizeof(*trans));
86 	trans->hmp = hmp;
87 	hammer2_voldata_lock(hmp);
88 	trans->sync_tid = hmp->voldata.alloc_tid++;
89 	hammer2_voldata_unlock(hmp, 0);	/* don't immediately mark modified */
90 }
91 
/*
 * Terminate a transaction started by hammer2_trans_init().  No further
 * teardown is required; simply disassociate the mount.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	trans->hmp = NULL;	/* guard against accidental reuse */
}
97 
98 /*
99  * Flush the chain and all modified sub-chains through the specified
100  * synchronization point (sync_tid), propagating parent chain modifications
101  * and mirror_tid updates back up as needed.  Since we are recursing downward
102  * we do not have to deal with the complexities of multi-homed chains (chains
103  * with multiple parents).
104  *
105  * Caller must have interlocked against any non-flush-related modifying
106  * operations in progress whose modify_tid values are less than or equal
107  * to the passed sync_tid.
108  *
109  * Caller must have already vetted synchronization points to ensure they
110  * are properly flushed.  Only snapshots and cluster flushes can create
111  * these sorts of synchronization points.
112  *
113  * SUBMODIFIED is not cleared if modified elements with higher modify_tid
114  * values (thus not flushed) are still present after the flush.
115  *
116  * If a chain is unable to completely flush we have to be sure that
117  * SUBMODIFIED remains set up the parent chain, and that MOVED is not
118  * cleared or our desynchronized bref will not properly update in the
119  * parent.  The parent's indirect block is copied-on-write and adjusted
120  * as needed so it no longer needs to be placemarked by the subchains,
121  * allowing the sub-chains to be cleaned out.
122  *
123  * This routine can be called from several places but the most important
124  * is from the hammer2_vop_reclaim() function.  We want to try to completely
125  * clean out the inode structure to prevent disconnected inodes from
126  * building up and blowing out the kmalloc pool.  However, it is not actually
127  * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
128  * capability.
129  *
130  * chain is locked on call and will remain locked on return.  If a flush
131  * occurred, the chain's MOVED bit will be set indicating that its parent
132  * (which is not part of the flush) should be updated.
133  */
/*
 * Top-level flush entry point.  (chain) is locked and referenced by the
 * caller and remains so on return.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.hmp = trans->hmp;
	info.trans = trans;
	info.sync_tid = trans->sync_tid;
	info.mirror_tid = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave MOVED set for these chains, which will be
		 * handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("defered flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_flush(trans, scan);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * Flush pass1 on root.  SUBMODIFIED can remain set after
		 * this call for numerous reasons, including write failures,
		 * but most likely due to only a partial flush being
		 * requested.
		 */
		info.diddeferral = 0;
		hammer2_chain_flush_core(&info, chain);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;
	}

	/*
	 * SUBMODIFIED can be temporarily cleared and then re-set, which
	 * can prevent concurrent setsubmods from reaching all the way to
	 * the root.  If after the flush we find the node is still in need
	 * of flushing (though possibly due to modifications made outside
	 * the requested synchronization zone), we must call setsubmod again
	 * to cover the race.
	 */
	if (chain->flags & (HAMMER2_CHAIN_MOVED |
			    HAMMER2_CHAIN_DELETED |
			    HAMMER2_CHAIN_MODIFIED |
			    HAMMER2_CHAIN_SUBMODIFIED)) {
		hammer2_chain_parent_setsubmod(chain);
	}
}
209 
210 /*
211  * (chain) is locked by the caller and remains locked on return.
212  * This function is keyed off of SUBMODIFIED but must make fine-grained
213  * choices based on the synchronization point we are flushing to.
214  *
215  * If the flush accomplished any work chain will be flagged MOVED
216  * indicating a copy-on-write propagation back up is required.
217  * Deep sub-nodes may also have been entered onto the deferral list.
218  * MOVED is never set on the volume root.
219  *
220  * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
221  *	 only when a chain is specifically modified, and not updated
222  *	 for copy-on-write propagations.  MODIFIED is set on any modification
223  *	 including copy-on-write propagations.
224  */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	size_t bbytes;		/* physical buffer size */
	size_t boff;		/* offset of chain data within buffer */
	char *bdata;
	struct buf *bp;
	int error;
	int wasmodified;
	int diddeferral = 0;

	hmp = info->hmp;

	/*
	 * If SUBMODIFIED is set we recurse the flush and adjust the
	 * blockrefs accordingly.
	 *
	 * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
	 *	 finishing in the face of filesystem activity.
	 */
	if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
		hammer2_chain_t *saved_parent;

		/*
		 * Clear SUBMODIFIED to catch races.  Note that any child
		 * with MODIFIED, DELETED, or MOVED set during Scan2, after
		 * it processes the child, will cause SUBMODIFIED to be
		 * re-set.  If a child still has to be flushed afterwards,
		 * SUBMODIFIED winds up set again (for next time), but this
		 * does not stop us from synchronizing the block updates
		 * which did occur.
		 *
		 * We don't want to set our chain to MODIFIED gratuitously.
		 *
		 * We need an extra ref on chain because we are going to
		 * release its lock temporarily in our child loop.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
		hammer2_chain_ref(chain);

		/*
		 * Run two passes.  The first pass handles MODIFIED and
		 * SUBMODIFIED chains and recurses while the second pass
		 * handles MOVED chains on the way back up.
		 *
		 * If the stack gets too deep we defer scan1, but must
		 * be sure to still run scan2 if on the next loop the
		 * deferred chain has been flushed and now needs MOVED
		 * handling on the way back up.
		 *
		 * Scan1 is recursive.
		 *
		 * NOTE: The act of handling a modified/submodified chain can
		 *	 cause the MOVED Flag to be set.  It can also be set
		 *	 via hammer2_chain_delete() and in other situations.
		 *
		 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
		 *	 because children can be physically removed during
		 *	 the scan.
		 */
		saved_parent = info->parent;
		info->parent = chain;

		if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
			/*
			 * Too deep: queue this chain for re-execution once
			 * the recursion unwinds.  The extra ref taken here
			 * is dropped by the deferral loop in
			 * hammer2_chain_flush().
			 */
			if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->flush_list,
						  chain, flush_node);
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_DEFERRED);
			}
			diddeferral = 1;
		} else {
			info->diddeferral = 0;
			spin_lock(&chain->core->cst.spin);
			RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
				NULL, hammer2_chain_flush_scan1, info);
			spin_unlock(&chain->core->cst.spin);
			diddeferral += info->diddeferral;
		}

		/*
		 * Handle successfully flushed children who are in the MOVED
		 * state on the way back up the recursion.  This can have
		 * the side-effect of clearing MOVED.
		 *
		 * We execute this even if there were deferrals to try to
		 * keep the chain topology cleaner.
		 *
		 * Scan2 is non-recursive.
		 */
		spin_lock(&chain->core->cst.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
			NULL, hammer2_chain_flush_scan2, info);
		spin_unlock(&chain->core->cst.spin);
		info->parent = saved_parent;
		hammer2_chain_drop(chain);
	}

	/*
	 * Rollup diddeferral for caller.  Note direct assignment, not +=.
	 */
	info->diddeferral = diddeferral;

	/*
	 * Do not flush chain if there were any deferrals.  It will be
	 * retried later after the deferrals are independently handled.
	 */
	if (diddeferral) {
		if (hammer2_debug & 0x0008) {
			/* NOTE(review): debug kprintf lacks trailing \n */
			kprintf("%*.*s} %p/%d %04x (deferred)",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * Chain objects flagged for complete destruction recurse down from
	 * their inode.  The inode will have already been removed from
	 * its parent.  We have no need to disconnect the children from
	 * their parents or the inode in this situation (it would just
	 * waste time and storage with copy-on-write operations), so
	 * we can clear both the MODIFIED bit and the MOVED bit.
	 *
	 * However, delete_tid must be within the synchronization zone
	 * for us to act on this bit.  Open-but-deleted files have to
	 * be managed by the cluster such that they are not subjected to
	 * reclamation.
	 *
	 * DESTROYED chains stop processing here.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (chain->delete_tid <= info->sync_tid)) {
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			if (chain->bp)
				chain->bp->b_flags |= B_INVAL|B_RELBUF;
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(chain);	/* ref held by MODIFIED */
		}
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(chain);	/* ref held by MOVED */
		}
		if (hammer2_debug & 0x0008) {
			kprintf("%*.*s} %p/%d %04x (destroyed)",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * If MODIFIED is not set or modify_tid is > sync_tid we have
	 * nothing to do.
	 *
	 * Note that MOVED can be set without MODIFIED being set due to
	 * a deletion, in which case it is handled by Scan2 later on.
	 *
	 * Both bits can be set along with DELETED due to a deletion if
	 * modified data within the synchronization zone and the chain
	 * was then deleted beyond the zone, in which case we still have
	 * to flush for synchronization point consistency.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		return;
	if (chain->bref.modify_tid > info->sync_tid) {
		if (hammer2_debug & 0x0008) {
			kprintf("%*.*s} %p/%d %04x (skip - beyond sync_tid)",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * Issue flush.
	 *
	 * A DESTROYED node that reaches this point must be flushed for
	 * synchronization point consistency.
	 */

	/*
	 * Update mirror_tid, clear MODIFIED, and set MOVED.
	 *
	 * The caller will update the parent's reference to this chain
	 * by testing MOVED as long as the modification was in-bounds.
	 *
	 * MOVED is never set on the volume root as there is no parent
	 * to adjust.
	 */
	if (chain->bref.mirror_tid < info->sync_tid)
		chain->bref.mirror_tid = info->sync_tid;
	wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	if (chain == &hmp->vchain)
		kprintf("(FLUSHED VOLUME HEADER)\n");

	if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
	    chain == &hmp->vchain) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared.
		 */
		if (wasmodified)
			hammer2_chain_drop(chain);
	} else {
		/*
		 * If we were MODIFIED we inherit the ref from clearing
		 * that bit, otherwise we need another ref.
		 */
		if (wasmodified == 0)
			hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain.
	 *
	 * This will never be a volume header.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.  All we do is recompute the three integrity CRCs
		 * and snapshot voldata into volsync for the syncer to write.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		kprintf("volume header mirror_tid %jd\n",
			hmp->voldata.mirror_tid);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		hmp->volsync = hmp->voldata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure the buffer(s) have been flushed out here.
		 */
		bbytes = chain->bytes;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
		if (bp) {
			if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
			    (B_CACHE | B_DIRTY)) {
				cluster_awrite(bp);
			} else {
				/* not dirty: just release the buffer */
				bp->b_flags |= B_RELBUF;
				brelse(bp);
			}
		}
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though it is already locked the buffer
		 * might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty, it is possible
		 * the operating system had already written out the buffer.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->bp != NULL);

		bp = chain->bp;
		if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
		    (bp->b_flags & B_DIRTY)) {
			bdwrite(chain->bp);
		} else {
			brelse(chain->bp);
		}
		chain->bp = NULL;
		chain->data = NULL;
		hammer2_chain_unlock(chain);
		break;
	default:
		/*
		 * Embedded elements have to be flushed out.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		bref = &chain->bref;

		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
		KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
			 HAMMER2_CHECK_ISCSI32);

		/*
		 * NOTE(review): the KKASSERT above guarantees
		 * chain->bp == NULL, so this test is always true and the
		 * else branch below is currently unreachable.
		 */
		if (chain->bp == NULL) {
			/*
			 * The data is embedded, we have to acquire the
			 * buffer cache buffer and copy the data into it.
			 */
			if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
				bbytes = HAMMER2_MINIOSIZE;
			pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
			boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);

			/*
			 * The getblk() optimization can only be used if the
			 * physical block size matches the request.
			 */
			if (chain->bytes == bbytes) {
				bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
				error = 0;
			} else {
				error = bread(hmp->devvp, pbase, bbytes, &bp);
				KKASSERT(error == 0);
			}
			bdata = (char *)bp->b_data + boff;

			/*
			 * Copy the data to the buffer, mark the buffer
			 * dirty, and convert the chain to unmodified.
			 */
			bcopy(chain->data, bdata, chain->bytes);
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
			bp = NULL;
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				++hammer2_iod_meta_write;
			else
				++hammer2_iod_indr_write;
		} else {
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
		}
	}
	if (hammer2_debug & 0x0008) {
		kprintf("%*.*s} %p/%d %04x (flushed)",
			info->depth, info->depth, "",
			chain, chain->refs, chain->flags);
	}
}
585 
586 /*
587  * Flush helper scan1 (recursive)
588  *
589  * Flushes the children of the caller's chain (parent) and updates
590  * the blockref.
591  *
592  * Ripouts during the loop should not cause any problems.  Because we are
593  * flushing to a synchronization point, modification races will occur after
594  * sync_tid and do not have to be flushed anyway.
595  */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	/*hammer2_mount_t *hmp = info->hmp;*/
	int diddeferral;

	/*
	 * We should only need to recurse if SUBMODIFIED is set, but as
	 * a safety also recurse if MODIFIED is also set.  Return early
	 * if neither bit is set.
	 */
	if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED)) == 0) {
		return (0);
	}
	/* entered with the parent's core spinlock held; release to block */
	spin_unlock(&parent->core->cst.spin);

	/*
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  Re-check the flags before
	 * continuing (they may have been cleared while we were unlocked).
	 */
	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED)) == 0) {
		/* nothing to do after all; restore locks and continue scan */
		hammer2_chain_unlock(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		spin_lock(&parent->core->cst.spin);
		return (0);
	}

	/*
	 * Propagate the DESTROYED flag if found set as well as deal with
	 * delete_tid.  This also causes SUBMODIFIED to be propagated
	 * downward to keep the recursion going.
	 *
	 * In the case of delete_tid, nothing need be done.  Destruction
	 * occurs after any deletions and destruction of internal chains
	 * where delete_tid may be 0 (since we don't bother to copy-on-write
	 * the propagation of a deletion) will pass the conditional just
	 * fine.
	 *
	 * This optimization allows the inode reclaim (destroy unlinked file
	 * on vnode reclamation after last close) to be flagged by just
	 * setting HAMMER2_CHAIN_DESTROYED at the top level.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
		atomic_set_int(&child->flags,
			       HAMMER2_CHAIN_DESTROYED |
			       HAMMER2_CHAIN_SUBMODIFIED);
	}

	/*
	 * Recurse and collect deferral data.  Save/restore diddeferral
	 * around the recursion because flush_core overwrites it with a
	 * direct assignment.
	 */
	diddeferral = info->diddeferral;
	++info->depth;
	hammer2_chain_flush_core(info, child);
	--info->depth;
	info->diddeferral += diddeferral;

	hammer2_chain_unlock(child);

	/*
	 * Always resolve when relocking the parent meta-data so Scan2
	 * has the indirect block data in-hand to handle the MOVED bit.
	 */
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);

	/* re-acquire the spinlock the RB_SCAN caller expects to hold */
	spin_lock(&parent->core->cst.spin);
	return (0);
}
673 
674 /*
675  * Flush helper scan2 (non-recursive)
676  *
677  * This pass on a chain's children propagates any MOVED or DELETED
678  * elements back up the chain towards the root.  The bref's modify_tid
679  * must be within the synchronization zone for MOVED to be recognized
680  * and delete_tid must be within the synchronization zone for DELETED
681  * to be recognized.
682  *
683  * We must re-set SUBMODIFIED if appropriate.
684  */
685 static int
686 hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
687 {
688 	enum { HC_NONE, HC_DELETE, HC_UPDATE } action = HC_NONE;
689 	hammer2_flush_info_t *info = data;
690 	hammer2_chain_t *parent = info->parent;
691 	hammer2_mount_t *hmp = info->hmp;
692 	hammer2_blockref_t *base;
693 	int count;
694 
695 	/*
696 	 * Check update conditions prior to locking child.
697 	 * We may not be able to safely test the 64-bit TIDs
698 	 * but we can certainly test the flags.
699 	 */
700 	if ((child->flags & (HAMMER2_CHAIN_DELETED |
701 			     HAMMER2_CHAIN_MOVED)) == 0) {
702 		goto finalize;
703 	}
704 	spin_unlock(&parent->core->cst.spin);
705 
706 	/*
707 	 * The MOVED bit implies an additional reference which prevents
708 	 * the child from being destroyed out from under our operation
709 	 * so we can lock the child safely without worrying about it
710 	 * getting ripped up (?).
711 	 */
712 	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
713 
714 	/*
715 	 * Full condition check.  We can only update and clear MOVED
716 	 * if the child is deleted or updated within our synchronization
717 	 * zone.
718 	 */
719 	if ((child->flags & HAMMER2_CHAIN_DELETED) &&
720 	    child->delete_tid <= info->sync_tid) {
721 		action = HC_DELETE;
722 	} else if ((child->flags & HAMMER2_CHAIN_MOVED) &&
723 		   child->bref.modify_tid <= info->sync_tid) {
724 		action = HC_UPDATE;
725 	} else {
726 		hammer2_chain_unlock(child);
727 		spin_lock(&parent->core->cst.spin);
728 		goto finalize;
729 	}
730 
731 	/*
732 	 * If the parent is to be deleted then we can clear MOVED
733 	 * in the child without updating the parent.  That is, it
734 	 * doesn't matter that the parent->child blockref is left intact
735 	 * because the parent is going to be deleted too.  This little
736 	 * bit of code will result in major optimizations of recursive
737 	 * file tree deletions and truncations.
738 	 */
739 	if ((parent->flags & HAMMER2_CHAIN_DELETED) &&
740 	    parent->delete_tid <= info->sync_tid) {
741 		goto cleanup;
742 	}
743 
744 	/*
745 	 * The parent's blockref to the child must be deleted or updated.
746 	 *
747 	 * This point is not reached on successful DESTROYED optimizations
748 	 * but can be reached on recursive deletions.  We can optimize
749 	 */
750 	hammer2_chain_modify(info->trans, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
751 
752 	switch(parent->bref.type) {
753 	case HAMMER2_BREF_TYPE_INODE:
754 		KKASSERT((parent->data->ipdata.op_flags &
755 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
756 		base = &parent->data->ipdata.u.blockset.blockref[0];
757 		count = HAMMER2_SET_COUNT;
758 		break;
759 	case HAMMER2_BREF_TYPE_INDIRECT:
760 		if (parent->data) {
761 			base = &parent->data->npdata.blockref[0];
762 		} else {
763 			base = NULL;
764 			KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
765 		}
766 		count = parent->bytes / sizeof(hammer2_blockref_t);
767 		break;
768 	case HAMMER2_BREF_TYPE_VOLUME:
769 		base = &hmp->voldata.sroot_blockset.blockref[0];
770 		count = HAMMER2_SET_COUNT;
771 		break;
772 	default:
773 		base = NULL;
774 		count = 0;
775 		panic("hammer2_chain_get: "
776 		      "unrecognized blockref type: %d",
777 		      parent->bref.type);
778 	}
779 
780 	/*
781 	 * Update the parent's blockref table and propagate mirror_tid.
782 	 * blockref updates do not touch modify_tid.  Instead, mirroring
783 	 * operations always reconcile the entire array during their
784 	 * mirror_tid based recursion.
785 	 *
786 	 * WARNING! Deleted chains may still be used by the filesystem
787 	 *	    in a later duplication, for example in a rename()
788 	 *	    operation.  Also any topological movement of the
789 	 *	    related blocks.
790 	 *
791 	 *	    We adjust the parent's bref pointer to the child but
792 	 *	    we do not modify the contents of the child.
793 	 */
794 	if (action == HC_DELETE) {
795 		if (base) {
796 			KKASSERT(child->index < count);
797 			bzero(&base[child->index], sizeof(child->bref));
798 		}
799 	} else {
800 		if (base) {
801 			KKASSERT(child->index < count);
802 			base[child->index] = child->bref;
803 		}
804 	}
805 	KKASSERT(child->index >= 0);
806 
807 	if (parent->bref.mirror_tid < child->bref.mirror_tid) {
808 		parent->bref.mirror_tid = child->bref.mirror_tid;
809 	}
810 	if (parent->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
811 	    hmp->voldata.mirror_tid < child->bref.mirror_tid) {
812 		hmp->voldata.mirror_tid = child->bref.mirror_tid;
813 	}
814 
815 cleanup:
816 
817 	/*
818 	 * Cleanup the child's MOVED flag and unlock the child.
819 	 */
820 	if (child->flags & HAMMER2_CHAIN_MOVED) {
821 		atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
822 		hammer2_chain_drop(child);	/* flag */
823 	}
824 
825 	/*
826 	 * Unlock the child.  This can wind up dropping the child's
827 	 * last ref, removing it from the parent's RB tree, and deallocating
828 	 * the structure.  The RB_SCAN() our caller is doing handles the
829 	 * situation.
830 	 */
831 	hammer2_chain_unlock(child);
832 	spin_lock(&parent->core->cst.spin);
833 
834 	/*
835 	 * The parent cleared SUBMODIFIED prior to the scan.  If the child
836 	 * still requires a flush (possibly due to being outside the current
837 	 * synchronization zone), we must re-set SUBMODIFIED on the way back
838 	 * up.
839 	 */
840 finalize:
841 	if (child->flags & (HAMMER2_CHAIN_MOVED |
842 			    HAMMER2_CHAIN_DELETED |
843 			    HAMMER2_CHAIN_MODIFIED |
844 			    HAMMER2_CHAIN_SUBMODIFIED)) {
845 		atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
846 	}
847 
848 	return (0);
849 }
850