xref: /dflybsd-src/sbin/hammer/cmd_dedup.c (revision e586f31ca9899b49a4fc156613d9ecd853defcec)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <libutil.h>
36 #include <crypto/sha2/sha2.h>
37 
38 #include "hammer.h"
39 
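/* Staging buffer size used for both mirror-read scans and data reads */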
40 #define DEDUP_BUF (64 * 1024)
41 
42 /* Sorted list of block CRCs - light version for dedup-simulate */
43 struct sim_dedup_entry_rb_tree;
44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
45 					RB_INITIALIZER(&sim_dedup_tree);
46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
47 		rb_sim_dedup_entry_compare, hammer_crc_t);
48 
49 struct sim_dedup_entry {
50 	hammer_crc_t	crc;
51 	uint64_t	ref_blks; /* number of blocks referenced */
52 	uint64_t	ref_size; /* size of data referenced */
53 	RB_ENTRY(sim_dedup_entry) rb_entry;
54 };
55 
56 struct dedup_entry {
57 	struct hammer_btree_leaf_elm leaf;
58 	union {
59 		struct {
60 			uint64_t ref_blks;
61 			uint64_t ref_size;
62 		} de;
63 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
64 	} u;
65 	uint8_t flags;
66 	RB_ENTRY(dedup_entry) rb_entry;
67 };
68 
69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
70 
71 struct sha_dedup_entry {
72 	struct hammer_btree_leaf_elm	leaf;
73 	uint64_t			ref_blks;
74 	uint64_t			ref_size;
75 	uint8_t				sha_hash[SHA256_DIGEST_LENGTH];
76 	RB_ENTRY(sha_dedup_entry)	fict_entry;
77 };
78 
79 /* Sorted list of HAMMER B-Tree keys */
80 struct dedup_entry_rb_tree;
81 struct sha_dedup_entry_rb_tree;
82 
83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
84 					RB_INITIALIZER(&dedup_tree);
85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
86 		rb_dedup_entry_compare, hammer_crc_t);
87 
88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
89 		rb_sha_dedup_entry_compare);
90 
91 /*
92  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
93  */
94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
95 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
96 
97 struct pass2_dedup_entry {
98 	struct hammer_btree_leaf_elm	leaf;
99 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
100 };
101 
102 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
103 
104 static int SigInfoFlag;
105 static int SigAlrmFlag;
106 static int64_t DedupDataReads;
107 static int64_t DedupCurrentRecords;
108 static int64_t DedupTotalRecords;
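/* CRC window for memory-limited passes; DedupCrcEnd == 0 means no upper bound */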
109 static uint32_t DedupCrcStart;
110 static uint32_t DedupCrcEnd;
111 static uint64_t MemoryUse;
112 
113 /* PFS global ids - we deal with just one PFS per run */
114 static int glob_fd;
115 static struct hammer_ioc_pseudofs_rw glob_pfs;
116 
117 /*
118  * Global accounting variables
119  *
120  * The last three don't have to be 64-bit; they are just to be safe.
121  */
122 static uint64_t dedup_alloc_size;
123 static uint64_t dedup_ref_size;
124 static uint64_t dedup_skipped_size;
125 static uint64_t dedup_crc_failures;
126 static uint64_t dedup_sha_failures;
127 static uint64_t dedup_underflows;
128 static uint64_t dedup_successes_count;
129 static uint64_t dedup_successes_bytes;
130 
131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
132 				struct sim_dedup_entry *sim_de2);
133 static int rb_dedup_entry_compare(struct dedup_entry *de1,
134 				struct dedup_entry *de2);
135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
136 				struct sha_dedup_entry *sha_de2);
137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
143 static void dump_simulated_dedup(void);
144 static void dump_real_dedup(void);
145 static void dedup_usage(int code);
146 
147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
148 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
150 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
152 		rb_sha_dedup_entry_compare);
153 
154 static
155 int
156 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
157 			struct sim_dedup_entry *sim_de2)
158 {
159 	if (sim_de1->crc < sim_de2->crc)
160 		return (-1);
161 	if (sim_de1->crc > sim_de2->crc)
162 		return (1);
163 
164 	return (0);
165 }
166 
167 static
168 int
169 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
170 {
171 	if (de1->leaf.data_crc < de2->leaf.data_crc)
172 		return (-1);
173 	if (de1->leaf.data_crc > de2->leaf.data_crc)
174 		return (1);
175 
176 	return (0);
177 }
178 
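/*
 * Compare SHA-256 digests word-by-word.  Any consistent total ordering
 * is fine for the RB-tree; it does not have to match byte order.
 */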
179 static
180 int
181 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
182 			struct sha_dedup_entry *sha_de2)
183 {
184 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
185 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
186 	int i;
187 
188 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
189 		if (h1[i] < h2[i])
190 			return (-1);
191 		if (h1[i] > h2[i])
192 			return (1);
193 	}
194 
195 	return (0);
196 }
197 
198 /*
199  * dedup-simulate <filesystem>
200  */
201 void
202 hammer_cmd_dedup_simulate(char **av, int ac)
203 {
204 	struct sim_dedup_entry *sim_de;
205 
206 	if (ac != 1) {
207 		dedup_usage(1);
208 		/* not reached */
209 	}
210 
211 	glob_fd = getpfs(&glob_pfs, av[0]);
212 
213 	/*
214 	 * Collection passes (memory limited)
215 	 */
216 	printf("Dedup-simulate running\n");
217 	do {
218 		DedupCrcStart = DedupCrcEnd;
219 		DedupCrcEnd = 0;
220 		MemoryUse = 0;
221 
222 		if (VerboseOpt) {
223 			printf("B-Tree pass  crc-range %08x-max\n",
224 				DedupCrcStart);
225 			fflush(stdout);
226 		}
227 		scan_pfs(av[0], collect_btree_elm, "simu-pass");
228 
229 		if (VerboseOpt >= 2)
230 			dump_simulated_dedup();
231 
232 		/*
233 		 * Calculate simulated dedup ratio and get rid of the tree
234 		 */
235 		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
236 			assert(sim_de->ref_blks != 0);
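			/*
			 * All blocks sharing this CRC would collapse to one
			 * copy, so roughly ref_size / ref_blks bytes would
			 * remain allocated for them.
			 */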
237 			dedup_ref_size += sim_de->ref_size;
238 			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
239 
240 			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
241 			free(sim_de);
242 		}
243 		if (DedupCrcEnd && VerboseOpt == 0)
244 			printf(".");
245 	} while (DedupCrcEnd);
246 
247 	printf("Dedup-simulate %s succeeded\n", av[0]);
248 	relpfs(glob_fd, &glob_pfs);
249 
250 	printf("Simulated dedup ratio = %.2f\n",
251 	    (dedup_alloc_size != 0) ?
252 		(double)dedup_ref_size / dedup_alloc_size : 0);
253 }
254 
255 /*
256  * dedup <filesystem>
257  */
258 void
259 hammer_cmd_dedup(char **av, int ac)
260 {
261 	struct dedup_entry *de;
262 	struct sha_dedup_entry *sha_de;
263 	struct pass2_dedup_entry *pass2_de;
264 	char *tmp;
265 	char buf[8];
266 	int needfree = 0;
267 
268 	if (TimeoutOpt > 0)
269 		alarm(TimeoutOpt);
270 
271 	if (ac != 1) {
272 		dedup_usage(1);
273 		/* not reached */
274 	}
275 
276 	STAILQ_INIT(&pass2_dedup_queue);
277 
278 	glob_fd = getpfs(&glob_pfs, av[0]);
279 
280 	/*
281 	 * A cycle file is _required_ for resuming dedup after the timeout
282 	 * specified with -t has expired. If no -c option, then place a
283 	 * .dedup.cycle file either in the PFS snapshots directory or in
284 	 * the default snapshots directory.
285 	 */
286 	if (!CyclePath) {
287 		if (glob_pfs.ondisk->snapshots[0] != '/') {
288 			asprintf(&tmp, "%s/%s/.dedup.cycle",
289 			    SNAPSHOTS_BASE, av[0]);
290 		} else {
291 			asprintf(&tmp, "%s/.dedup.cycle",
292 			    glob_pfs.ondisk->snapshots);
293 		}
294 		CyclePath = tmp;
295 		needfree = 1;
296 	}
297 
298 	/*
299 	 * Pre-pass to cache the btree
300 	 */
301 	scan_pfs(av[0], count_btree_elm, "pre-pass ");
302 	DedupTotalRecords = DedupCurrentRecords;
303 
304 	/*
305 	 * Collection passes (memory limited)
306 	 */
307 	printf("Dedup running\n");
308 	do {
309 		DedupCrcStart = DedupCrcEnd;
310 		DedupCrcEnd = 0;
311 		MemoryUse = 0;
312 
313 		if (VerboseOpt) {
314 			printf("B-Tree pass  crc-range %08x-max\n",
315 				DedupCrcStart);
316 			fflush(stdout);
317 		}
318 		scan_pfs(av[0], process_btree_elm, "main-pass");
319 
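		/*
		 * Retry candidates whose ioctl failed with a technical error
		 * during the main pass.
		 */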
320 		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
321 			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
322 				dedup_skipped_size -= pass2_de->leaf.data_len;
323 
324 			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
325 			free(pass2_de);
326 		}
327 		assert(STAILQ_EMPTY(&pass2_dedup_queue));
328 
329 		if (VerboseOpt >= 2)
330 			dump_real_dedup();
331 
332 		/*
333 		 * Calculate dedup ratio and get rid of the trees
334 		 */
335 		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
336 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
337 				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
338 					assert(sha_de->ref_blks != 0);
339 					dedup_ref_size += sha_de->ref_size;
340 					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
341 
342 					RB_REMOVE(sha_dedup_entry_rb_tree,
343 							&de->u.fict_root, sha_de);
344 					free(sha_de);
345 				}
346 				assert(RB_EMPTY(&de->u.fict_root));
347 			} else {
348 				assert(de->u.de.ref_blks != 0);
349 				dedup_ref_size += de->u.de.ref_size;
350 				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
351 			}
352 
353 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
354 			free(de);
355 		}
356 		assert(RB_EMPTY(&dedup_tree));
357 		if (DedupCrcEnd && VerboseOpt == 0)
358 			printf(".");
359 	} while (DedupCrcEnd);
360 
361 	printf("Dedup %s succeeded\n", av[0]);
362 	relpfs(glob_fd, &glob_pfs);
363 
364 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
365 	printf("Dedup ratio = %.2f (in this run)\n"
366 	       "    %8s referenced\n",
367 	       ((dedup_alloc_size != 0) ?
368 			(double)dedup_ref_size / dedup_alloc_size : 0),
369 	       buf
370 	);
371 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
372 	printf("    %8s allocated\n", buf);
373 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
374 	printf("    %8s skipped\n", buf);
375 	printf("    %8jd CRC collisions\n"
376 	       "    %8jd SHA collisions\n"
377 	       "    %8jd big-block underflows\n"
378 	       "    %8jd new dedup records\n"
379 	       "    %8jd new dedup bytes\n",
380 	       (intmax_t)dedup_crc_failures,
381 	       (intmax_t)dedup_sha_failures,
382 	       (intmax_t)dedup_underflows,
383 	       (intmax_t)dedup_successes_count,
384 	       (intmax_t)dedup_successes_bytes
385 	);
386 
387 	/* Once completed, remove the cycle file */
388 	hammer_reset_cycle();
389 
390 	/* We don't want to interfere with other directives */
391 	if (needfree) {
392 		free(tmp);
393 		CyclePath = NULL;
394 	}
395 }
396 
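/*
 * Pre-pass callback: the scan itself counts records and warms up the
 * B-Tree cache, so there is nothing to do per element.
 */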
397 static
398 int
399 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
400 {
401 	return(1);
402 }
403 
404 static
405 int
406 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
407 {
408 	struct sim_dedup_entry *sim_de;
409 
410 	/*
411 	 * If we are using too much memory we have to clean some out, which
412 	 * will cause the run to use multiple passes.  Be careful of integer
413 	 * overflows!
414 	 */
415 	if (MemoryUse > MemoryLimit) {
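		/*
		 * Halve the remaining CRC range for this pass.  DedupCrcEnd
		 * of 0 stands for "max", so the unsigned wrap of
		 * (0 - DedupCrcStart - 1) yields the size of the remaining
		 * range.
		 */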
416 		DedupCrcEnd = DedupCrcStart +
417 			      (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
418 		if (VerboseOpt) {
419 			printf("memory limit  crc-range %08x-%08x\n",
420 				DedupCrcStart, DedupCrcEnd);
421 			fflush(stdout);
422 		}
423 		for (;;) {
424 			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
425 					&sim_dedup_tree);
426 			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
427 				break;
428 			RB_REMOVE(sim_dedup_entry_rb_tree,
429 				  &sim_dedup_tree, sim_de);
430 			MemoryUse -= sizeof(*sim_de);
431 			free(sim_de);
432 		}
433 	}
434 
435 	/*
436 	 * Collect statistics based on the CRC only, do not try to read
437 	 * any data blocks or run SHA hashes.
438 	 */
439 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
440 			   scan_leaf->data_crc);
441 
442 	if (sim_de == NULL) {
443 		sim_de = calloc(1, sizeof(*sim_de));
444 		sim_de->crc = scan_leaf->data_crc;
445 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
446 		MemoryUse += sizeof(*sim_de);
447 	}
448 
449 	sim_de->ref_blks += 1;
450 	sim_de->ref_size += scan_leaf->data_len;
451 	return (1);
452 }
453 
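/*
 * Returns non-zero if the two leaf elements cannot form a dedup pair,
 * i.e. they live in different zones or have different data lengths.
 */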
454 static __inline
455 int
456 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
457 {
458 	if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
459 		return (1);
460 	if (p->data_len != q->data_len)
461 		return (1);
462 
463 	return (0);
464 }
465 
466 #define DEDUP_TECH_FAILURE	1
467 #define DEDUP_CMP_FAILURE	2
468 #define DEDUP_INVALID_ZONE	3
469 #define DEDUP_UNDERFLOW		4
470 #define DEDUP_VERS_FAILURE	5
471 
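/*
 * Ask the kernel to dedup two B-Tree leaf elements via HAMMERIOC_DEDUP
 * and map the result onto the DEDUP_* codes above (0 on success).
 */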
472 static __inline
473 int
474 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
475 {
476 	struct hammer_ioc_dedup dedup;
477 
478 	bzero(&dedup, sizeof(dedup));
479 
480 	/*
481 	 * If the data_offset fields are the same there is no need to run
482 	 * the ioctl; the candidate is already dedup'ed.
483 	 */
484 	if (p->data_offset == q->data_offset)
485 		return (0);
486 
487 	dedup.elm1 = p->base;
488 	dedup.elm2 = q->base;
489 	RunningIoctl = 1;
490 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
491 		if (errno == EOPNOTSUPP)
492 			return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
493 		/* Technical failure - locking or whatever */
494 		return (DEDUP_TECH_FAILURE);
495 	}
496 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
497 		return (DEDUP_CMP_FAILURE);
498 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
499 		return (DEDUP_INVALID_ZONE);
500 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
501 		return (DEDUP_UNDERFLOW);
502 	RunningIoctl = 0;
503 	++dedup_successes_count;
504 	dedup_successes_bytes += p->data_len;
505 	return (0);
506 }
507 
508 static
509 int
510 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
511 {
512 	struct dedup_entry *de;
513 	struct sha_dedup_entry *sha_de, temp;
514 	struct pass2_dedup_entry *pass2_de;
515 	int error;
516 
517 	/*
518 	 * If we are using too much memory we have to clean some out, which
519 	 * will cause the run to use multiple passes.  Be careful of integer
520 	 * overflows!
521 	 */
522 	while (MemoryUse > MemoryLimit) {
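		/* Shrink the CRC range for this pass, see collect_btree_elm() */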
523 		DedupCrcEnd = DedupCrcStart +
524 			      (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
525 		if (VerboseOpt) {
526 			printf("memory limit  crc-range %08x-%08x\n",
527 				DedupCrcStart, DedupCrcEnd);
528 			fflush(stdout);
529 		}
530 
531 		for (;;) {
532 			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
533 			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
534 				break;
535 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
536 				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
537 				       NULL) {
538 					RB_REMOVE(sha_dedup_entry_rb_tree,
539 						  &de->u.fict_root, sha_de);
540 					MemoryUse -= sizeof(*sha_de);
541 					free(sha_de);
542 				}
543 			}
544 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
545 			MemoryUse -= sizeof(*de);
546 			free(de);
547 		}
548 	}
549 
550 	/*
551 	 * Collect statistics based on the CRC.  Colliding CRCs usually
552 	 * cause a SHA sub-tree to be created under the de.
553 	 *
554 	 * Trivial case if de not found.
555 	 */
556 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
557 	if (de == NULL) {
558 		de = calloc(1, sizeof(*de));
559 		de->leaf = *scan_leaf;
560 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
561 		MemoryUse += sizeof(*de);
562 		goto upgrade_stats;
563 	}
564 
565 	/*
566 	 * Found entry in CRC tree
567 	 */
568 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
569 		/*
570 		 * Optimize the case where a CRC failure results in multiple
571 		 * SHA entries.  If we unconditionally issue a data-read, a
572 		 * degenerate situation where a colliding CRC's second SHA
573 		 * entry contains the lion's share of the deduplication
574 		 * candidates will result in excessive data block reads.
575 		 *
576 		 * Deal with the degenerate case by looking for a matching
577 		 * data_offset/data_len in the SHA elements we already have
578 		 * before reading the data block and generating a new SHA.
579 		 */
580 		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
581 			if (sha_de->leaf.data_offset ==
582 						scan_leaf->data_offset &&
583 			    sha_de->leaf.data_len == scan_leaf->data_len) {
584 				memcpy(temp.sha_hash, sha_de->sha_hash,
585 					SHA256_DIGEST_LENGTH);
586 				break;
587 			}
588 		}
589 
590 		/*
591 		 * The entry in the CRC tree is fictitious, so we already had
592 		 * problems with this CRC. Upgrade (compute SHA) the candidate
593 		 * and dive into the SHA subtree. If the upgrade fails, insert
594 		 * the candidate into the Pass2 list (processed later).
595 		 */
596 		if (sha_de == NULL) {
597 			if (upgrade_chksum(scan_leaf, temp.sha_hash))
598 				goto pass2_insert;
599 
600 			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
601 					 &de->u.fict_root, &temp);
602 		}
603 
604 		/*
605 		 * Nothing in SHA subtree so far, so this is a new
606 		 * 'dataset'. Insert new entry into SHA subtree.
607 		 */
608 		if (sha_de == NULL) {
609 			sha_de = calloc(1, sizeof(*sha_de));
610 			sha_de->leaf = *scan_leaf;
611 			memcpy(sha_de->sha_hash, temp.sha_hash,
612 			       SHA256_DIGEST_LENGTH);
613 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
614 				  sha_de);
615 			MemoryUse += sizeof(*sha_de);
616 			goto upgrade_stats_sha;
617 		}
618 
619 		/*
620 		 * Found an entry in the SHA subtree, which means we have a
621 		 * potential dedup pair. Validate it (zones have to match and
622 		 * the data_len fields have to be the same too). If validation
623 		 * fails, treat it as a SHA collision (jump to sha256_failure).
624 		 */
625 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
626 			goto sha256_failure;
627 
628 		/*
629 		 * We have a valid dedup pair (SHA match, validated).
630 		 *
631 		 * In case of a technical failure (the dedup pair was good,
632 		 * but the ioctl failed anyway) insert the candidate into the
633 		 * Pass2 list (we will try to dedup it after we are done with
634 		 * the rest of the tree).
635 		 *
636 		 * If the ioctl fails because either of the blocks is in a
637 		 * non-dedup zone (we can dedup only in LARGE_DATA and
638 		 * SMALL_DATA), skip the candidate and terminate early.
639 		 *
640 		 * If the ioctl fails because of a big-block underflow,
641 		 * replace the found dedup entry's leaf with scan_leaf.
642 		 */
643 		error = deduplicate(&sha_de->leaf, scan_leaf);
644 		switch(error) {
645 		case 0:
646 			goto upgrade_stats_sha;
647 		case DEDUP_TECH_FAILURE:
648 			goto pass2_insert;
649 		case DEDUP_CMP_FAILURE:
650 			goto sha256_failure;
651 		case DEDUP_INVALID_ZONE:
652 			goto terminate_early;
653 		case DEDUP_UNDERFLOW:
654 			++dedup_underflows;
655 			sha_de->leaf = *scan_leaf;
656 			memcpy(sha_de->sha_hash, temp.sha_hash,
657 				SHA256_DIGEST_LENGTH);
658 			goto upgrade_stats_sha;
659 		case DEDUP_VERS_FAILURE:
660 			errx(1, "HAMMER filesystem must be at least "
661 				"version 5 to dedup");
662 			/* not reached */
663 		default:
664 			fprintf(stderr, "Unknown error\n");
665 			goto terminate_early;
666 		}
667 
668 		/*
669 		 * Ooh la la.. SHA-256 collision. Terminate early; there's
670 		 * nothing we can do here.
671 		 */
672 sha256_failure:
673 		++dedup_sha_failures;
674 		goto terminate_early;
675 	} else {
676 		/*
677 		 * Candidate CRC is good for now (we found an entry in CRC
678 		 * tree and it's not fictitious). This means we have a
679 		 * potential dedup pair.
680 		 */
681 		if (validate_dedup_pair(&de->leaf, scan_leaf))
682 			goto crc_failure;
683 
684 		/*
685 		 * We have a valid dedup pair (CRC match, validated)
686 		 */
687 		error = deduplicate(&de->leaf, scan_leaf);
688 		switch(error) {
689 		case 0:
690 			goto upgrade_stats;
691 		case DEDUP_TECH_FAILURE:
692 			goto pass2_insert;
693 		case DEDUP_CMP_FAILURE:
694 			goto crc_failure;
695 		case DEDUP_INVALID_ZONE:
696 			goto terminate_early;
697 		case DEDUP_UNDERFLOW:
698 			++dedup_underflows;
699 			de->leaf = *scan_leaf;
700 			goto upgrade_stats;
701 		case DEDUP_VERS_FAILURE:
702 			errx(1, "HAMMER filesystem must be at least "
703 				"version 5 to dedup");
704 			/* not reached */
705 		default:
706 			fprintf(stderr, "Unknown error\n");
707 			goto terminate_early;
708 		}
709 
710 crc_failure:
711 		/*
712 		 * We got a CRC collision - either the ioctl failed due to
713 		 * a comparison failure or validation of the potential
714 		 * dedup pair went bad. In either case insert both blocks
715 		 * into the SHA subtree (this requires a checksum upgrade) and
716 		 * mark the entry that corresponds to this CRC in the CRC tree
717 		 * fictitious, so that all further operations with this CRC go
718 		 * through the SHA subtree.
719 		 */
720 		++dedup_crc_failures;
721 
722 		/*
723 		 * Insert the block that was represented by the now-fictitious
724 		 * dedup entry (create a new SHA entry and preserve the stats
725 		 * of the old CRC one). If the checksum upgrade fails, insert
726 		 * the candidate into the Pass2 list and return - keep both
727 		 * trees unmodified.
728 		 */
729 		sha_de = calloc(1, sizeof(*sha_de));
730 		sha_de->leaf = de->leaf;
731 		sha_de->ref_blks = de->u.de.ref_blks;
732 		sha_de->ref_size = de->u.de.ref_size;
733 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
734 			free(sha_de);
735 			goto pass2_insert;
736 		}
737 		MemoryUse += sizeof(*sha_de);
738 
739 		RB_INIT(&de->u.fict_root);
740 		/*
741 		 * Here we can insert without prior checking because the tree
742 		 * is empty at this point
743 		 */
744 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
745 
746 		/*
747 		 * Mark entry in CRC tree fictitious
748 		 */
749 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
750 
751 		/*
752 		 * Upgrade checksum of the candidate and insert it into
753 		 * SHA subtree. If upgrade fails insert the candidate into
754 		 * Pass2 list.
755 		 */
756 		if (upgrade_chksum(scan_leaf, temp.sha_hash))
757 			goto pass2_insert;
758 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
759 				 &temp);
760 		if (sha_de != NULL) {
761 			/* There is an entry with this SHA already, but the only
762 			 * RB-tree element at this point is the entry we just
763 			 * added. We know for sure these blocks are different
764 			 * (this is the crc_failure branch) so treat it as a SHA
765 			 * collision.
766 			 */
767 			goto sha256_failure;
768 		}
769 
770 		sha_de = calloc(1, sizeof(*sha_de));
771 		sha_de->leaf = *scan_leaf;
772 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
773 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
774 		MemoryUse += sizeof(*sha_de);
775 		goto upgrade_stats_sha;
776 	}
777 
778 upgrade_stats:
779 	de->u.de.ref_blks += 1;
780 	de->u.de.ref_size += scan_leaf->data_len;
781 	return (1);
782 
783 upgrade_stats_sha:
784 	sha_de->ref_blks += 1;
785 	sha_de->ref_size += scan_leaf->data_len;
786 	return (1);
787 
788 pass2_insert:
789 	/*
790 	 * If in pass2 mode don't insert anything; fall through to
791 	 * terminate_early.
792 	 */
793 	if ((flags & DEDUP_PASS2) == 0) {
794 		pass2_de = calloc(1, sizeof(*pass2_de));
795 		pass2_de->leaf = *scan_leaf;
796 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
797 		dedup_skipped_size += scan_leaf->data_len;
798 		return (1);
799 	}
800 
801 terminate_early:
802 	/*
803 	 * Early termination path. Fixup stats.
804 	 */
805 	dedup_alloc_size += scan_leaf->data_len;
806 	dedup_ref_size += scan_leaf->data_len;
807 	return (0);
808 }
809 
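/*
 * Read the data block behind a leaf element with HAMMERIOC_GET_DATA and
 * compute its SHA-256 into sha_hash.  Returns non-zero if the read fails
 * or the recorded data length no longer matches.
 */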
810 static
811 int
812 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
813 {
814 	struct hammer_ioc_data data;
815 	char *buf = malloc(DEDUP_BUF);
816 	SHA256_CTX ctx;
817 	int error;
818 
819 	bzero(&data, sizeof(data));
820 	data.elm = leaf->base;
821 	data.ubuf = buf;
822 	data.size = DEDUP_BUF;
823 
824 	error = 0;
825 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
826 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
827 		error = 1;
828 		goto done;
829 	}
830 	DedupDataReads += leaf->data_len;
831 
832 	if (data.leaf.data_len != leaf->data_len) {
833 		error = 1;
834 		goto done;
835 	}
836 
837 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
838 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
839 		SHA256_Init(&ctx);
840 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
841 		SHA256_Final(sha_hash, &ctx);
842 	}
843 
844 done:
845 	free(buf);
846 	return (error);
847 }
848 
849 static
850 void
851 sigAlrm(int signo __unused)
852 {
853 	SigAlrmFlag = 1;
854 }
855 
856 static
857 void
858 sigInfo(int signo __unused)
859 {
860 	SigInfoFlag = 1;
861 }
862 
863 static
864 void
865 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
866 {
867 	struct hammer_ioc_mirror_rw mirror;
868 	hammer_ioc_mrecord_any_t mrec;
869 	struct hammer_btree_leaf_elm elm;
870 	char *buf = malloc(DEDUP_BUF);
871 	char buf1[8];
872 	char buf2[8];
873 	int offset, bytes;
874 
875 	SigInfoFlag = 0;
876 	DedupDataReads = 0;
877 	DedupCurrentRecords = 0;
878 	signal(SIGINFO, sigInfo);
879 	signal(SIGALRM, sigAlrm);
880 
881 	/*
882 	 * Deduplication happens per element, so hammer(8) is in full
883 	 * control of the ioctl()s that actually perform it. SIGALRM
884 	 * needs to be handled within hammer(8), but a checkpoint
885 	 * is needed for resuming. Use the cycle file for that.
886 	 *
887 	 * Try to obtain the previous obj_id from the cycle file; if it
888 	 * is not available just start from the beginning.
889 	 */
890 	bzero(&mirror, sizeof(mirror));
891 	hammer_key_beg_init(&mirror.key_beg);
892 	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
893 
894 	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
895 		if (VerboseOpt) {
896 			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
897 			    id, (uintmax_t)mirror.key_beg.obj_id);
898 		}
899 	}
900 
901 	hammer_key_end_init(&mirror.key_end);
902 
903 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
904 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
905 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
906 	mirror.ubuf = buf;
907 	mirror.size = DEDUP_BUF;
908 	mirror.pfs_id = glob_pfs.pfs_id;
909 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
910 
911 	if (VerboseOpt && DedupCrcStart == 0) {
912 		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
913 			id, filesystem,
914 			(uintmax_t)mirror.key_beg.obj_id,
915 			mirror.key_beg.localization,
916 			(uintmax_t)mirror.key_end.obj_id,
917 			mirror.key_end.localization);
918 		printf("%s %s: pfs_id %d\n",
919 			id, filesystem, glob_pfs.pfs_id);
920 	}
921 	fflush(stdout);
922 	fflush(stderr);
923 
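	/*
	 * Pull record headers in DEDUP_BUF sized chunks and hand every DATA
	 * record's leaf element to the callback, restricted to the current
	 * CRC window when one is in effect.
	 */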
924 	do {
925 		mirror.count = 0;
926 		mirror.pfs_id = glob_pfs.pfs_id;
927 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
928 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
929 			err(1, "Mirror-read %s failed", filesystem);
930 			/* not reached */
931 		}
932 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
933 			errx(1, "Mirror-read %s fatal error %d",
934 				filesystem, mirror.head.error);
935 			/* not reached */
936 		}
937 		if (mirror.count) {
938 			offset = 0;
939 			while (offset < mirror.count) {
940 				mrec = (void *)((char *)buf + offset);
941 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
942 				if (offset + bytes > mirror.count) {
943 					errx(1, "Misaligned record");
944 					/* not reached */
945 				}
946 				assert((mrec->head.type &
947 				       HAMMER_MRECF_TYPE_MASK) ==
948 				       HAMMER_MREC_TYPE_REC);
949 				offset += bytes;
950 				elm = mrec->rec.leaf;
951 				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
952 					continue;
953 				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
954 					continue;
955 				++DedupCurrentRecords;
956 				if (DedupCrcStart != DedupCrcEnd) {
957 					if (elm.data_crc < DedupCrcStart)
958 						continue;
959 					if (DedupCrcEnd &&
960 					    elm.data_crc >= DedupCrcEnd) {
961 						continue;
962 					}
963 				}
964 				func(&elm, 0);
965 			}
966 		}
967 		mirror.key_beg = mirror.key_cur;
968 		if (DidInterrupt || SigAlrmFlag) {
969 			if (VerboseOpt) {
970 				fprintf(stderr, "%s\n",
971 				    (DidInterrupt ? "Interrupted" : "Timeout"));
972 			}
973 			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
974 			if (VerboseOpt) {
975 				fprintf(stderr, "Cyclefile %s updated for "
976 				    "continuation\n", CyclePath);
977 			}
978 			exit(1);
979 		}
980 		if (SigInfoFlag) {
981 			if (DedupTotalRecords) {
982 				humanize_unsigned(buf1, sizeof(buf1),
983 						  DedupDataReads,
984 						  "B", 1024);
985 				humanize_unsigned(buf2, sizeof(buf2),
986 						  dedup_successes_bytes,
987 						  "B", 1024);
988 				fprintf(stderr, "%s count %7jd/%jd "
989 						"(%02d.%02d%%) "
990 						"ioread %s newdedup %s\n",
991 					id,
992 					(intmax_t)DedupCurrentRecords,
993 					(intmax_t)DedupTotalRecords,
994 					(int)(DedupCurrentRecords * 100 /
995 						DedupTotalRecords),
996 					(int)(DedupCurrentRecords * 10000 /
997 						DedupTotalRecords % 100),
998 					buf1, buf2);
999 			} else {
1000 				fprintf(stderr, "%s count %-7jd\n",
1001 					id,
1002 					(intmax_t)DedupCurrentRecords);
1003 			}
1004 			SigInfoFlag = 0;
1005 		}
1006 	} while (mirror.count != 0);
1007 
1008 	signal(SIGINFO, SIG_IGN);
1009 	signal(SIGALRM, SIG_IGN);
1010 
1011 	free(buf);
1012 }
1013 
1014 static
1015 void
1016 dump_simulated_dedup(void)
1017 {
1018 	struct sim_dedup_entry *sim_de;
1019 
1020 	printf("=== Dumping simulated dedup entries:\n");
1021 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1022 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
1023 			sim_de->crc,
1024 			(uintmax_t)sim_de->ref_blks,
1025 			(uintmax_t)sim_de->ref_size);
1026 	}
1027 	printf("end of dump ===\n");
1028 }
1029 
1030 static
1031 void
1032 dump_real_dedup(void)
1033 {
1034 	struct dedup_entry *de;
1035 	struct sha_dedup_entry *sha_de;
1036 	int i;
1037 
1038 	printf("=== Dumping dedup entries:\n");
1039 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1040 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1041 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1042 
1043 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1044 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1045 				       "\t\tsha=",
1046 				       sha_de->leaf.data_crc,
1047 				       (uintmax_t)sha_de->ref_blks,
1048 				       (uintmax_t)sha_de->ref_size);
1049 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1050 					printf("%02x", sha_de->sha_hash[i]);
1051 				printf("\n");
1052 			}
1053 		} else {
1054 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
1055 			       de->leaf.data_crc,
1056 			       (uintmax_t)de->u.de.ref_blks,
1057 			       (uintmax_t)de->u.de.ref_size);
1058 		}
1059 	}
1060 	printf("end of dump ===\n");
1061 }
1062 
1063 static
1064 void
1065 dedup_usage(int code)
1066 {
1067 	fprintf(stderr,
1068 		"hammer dedup-simulate <filesystem>\n"
1069 		"hammer dedup <filesystem>\n"
1070 	);
1071 	exit(code);
1072 }
1073