1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#include <libutil.h>
#include <crypto/sha2/sha2.h>

#include "hammer.h"

/* Size of the buffer used for mirror-read and get-data ioctl transfers */
#define DEDUP_BUF (64 * 1024)

/* Sorted list of block CRCs - light version for dedup-simulate */
struct sim_dedup_entry_rb_tree;
RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
				RB_INITIALIZER(&sim_dedup_tree);
RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t);

/*
 * Per-CRC accounting node for the simulation pass.  No data blocks are
 * read and no SHA hashes are computed for these.
 */
struct sim_dedup_entry {
	hammer_crc_t	crc;		/* data CRC this node accounts for */
	uint64_t	ref_blks;	/* number of blocks referenced */
	uint64_t	ref_size;	/* size of data referenced */
	RB_ENTRY(sim_dedup_entry) rb_entry;
};

/*
 * Per-CRC node for the real dedup pass.  Starts out as a plain entry
 * using the u.de counters; after a CRC collision it is converted to a
 * "fictitious" entry (HAMMER_DEDUP_ENTRY_FICTITIOUS) and u.fict_root
 * then holds a SHA-256 keyed subtree of sha_dedup_entry nodes.
 */
struct dedup_entry {
	struct hammer_btree_leaf_elm leaf;
	union {
		struct {
			uint64_t ref_blks;
			uint64_t ref_size;
		} de;
		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
	} u;
	uint8_t flags;			/* HAMMER_DEDUP_ENTRY_FICTITIOUS */
	RB_ENTRY(dedup_entry) rb_entry;
};

#define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001

/* SHA-256 keyed node living under a fictitious dedup_entry */
struct sha_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	uint64_t			ref_blks;
	uint64_t			ref_size;
	uint8_t				sha_hash[SHA256_DIGEST_LENGTH];
	RB_ENTRY(sha_dedup_entry)	fict_entry;
};

/* Sorted list of HAMMER B-Tree keys */
struct dedup_entry_rb_tree;
struct sha_dedup_entry_rb_tree;

RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
				RB_INITIALIZER(&dedup_tree);
RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t);

RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

/*
 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
 */
STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
	STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);

struct pass2_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
};

#define DEDUP_PASS2	0x0001	/* process_btree_elm() mode */

/* Signal flags set by the SIGINFO/SIGALRM handlers, polled in scan_pfs() */
static int SigInfoFlag;
static int SigAlrmFlag;
/* Progress counters for the SIGINFO status line */
static int64_t DedupDataReads;
static int64_t DedupCurrentRecords;
static int64_t DedupTotalRecords;
/*
 * Current CRC window [DedupCrcStart, DedupCrcEnd).  When memory pressure
 * forces multiple passes the window is narrowed; DedupCrcEnd == 0 means
 * "to the end of the CRC space" and also terminates the pass loop.
 */
static uint32_t DedupCrcStart;
static uint32_t DedupCrcEnd;
static uint64_t MemoryUse;	/* bytes of tree nodes currently allocated */

/* PFS global ids - we deal with just one PFS at a run */
static int glob_fd;
static struct hammer_ioc_pseudofs_rw glob_pfs;

/*
 * Global accounting variables
 *
 * Last three don't have to be 64-bit, just to be safe..
 */
static uint64_t dedup_alloc_size;
static uint64_t dedup_ref_size;
static uint64_t dedup_skipped_size;
static uint64_t dedup_crc_failures;
static uint64_t dedup_sha_failures;
static uint64_t dedup_underflows;
static uint64_t dedup_successes_count;
static uint64_t dedup_successes_bytes;

static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
				struct sim_dedup_entry *sim_de2);
static int rb_dedup_entry_compare(struct dedup_entry *de1,
				struct dedup_entry *de2);
static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
				struct sha_dedup_entry *sha_de2);
typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
static void dump_simulated_dedup(void);
static void dump_real_dedup(void);
static void dedup_usage(int code);

RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

/*
 * RB comparator: order sim_dedup_entry nodes by CRC.
 */
static
int
rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
			struct sim_dedup_entry *sim_de2)
{
	if (sim_de1->crc < sim_de2->crc)
		return (-1);
	if (sim_de1->crc > sim_de2->crc)
		return (1);

	return (0);
}

/*
 * RB comparator: order dedup_entry nodes by leaf data CRC.
 */
static
int
rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
{
	if (de1->leaf.data_crc < de2->leaf.data_crc)
		return (-1);
	if (de1->leaf.data_crc > de2->leaf.data_crc)
		return (1);

	return (0);
}

/*
 * RB comparator: order sha_dedup_entry nodes by their SHA-256 digest,
 * compared word-at-a-time.  Ordering only needs to be consistent, not
 * lexicographic, so native-endian word comparison is fine.
 */
static
int
rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
			struct sha_dedup_entry *sha_de2)
{
	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
	int i;

	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
		if (h1[i] < h2[i])
			return (-1);
		if (h1[i] > h2[i])
			return (1);
	}

	return (0);
}

/*
 * dedup-simulate <filesystem>
 *
 * Estimate the attainable dedup ratio from CRCs alone: scan the B-Tree,
 * bucket data records by CRC, and report ref_size / alloc_size.  No data
 * is read and nothing is modified.
 */
void
hammer_cmd_dedup_simulate(char **av, int ac)
{
	struct sim_dedup_entry *sim_de;

	if (ac != 1) {
		dedup_usage(1);
		/* not reached */
	}

	glob_fd = getpfs(&glob_pfs, av[0]);

	/*
	 * Collection passes (memory limited)
	 */
	printf("Dedup-simulate running\n");
	do {
		DedupCrcStart = DedupCrcEnd;
		DedupCrcEnd = 0;
		MemoryUse = 0;

		if (VerboseOpt) {
			printf("B-Tree pass  crc-range %08x-max\n",
				DedupCrcStart);
			fflush(stdout);
		}
		scan_pfs(av[0], collect_btree_elm, "simu-pass");

		if (VerboseOpt >= 2)
			dump_simulated_dedup();

		/*
		 * Calculate simulated dedup ratio and get rid of the tree
		 */
		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
			assert(sim_de->ref_blks != 0);
			dedup_ref_size += sim_de->ref_size;
			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;

			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
			free(sim_de);
		}
		if (DedupCrcEnd && VerboseOpt == 0)
			printf(".");
	} while (DedupCrcEnd);

	printf("Dedup-simulate %s succeeded\n", av[0]);
	relpfs(glob_fd, &glob_pfs);

	printf("Simulated dedup ratio = %.2f\n",
	    (dedup_alloc_size != 0) ?
		(double)dedup_ref_size / dedup_alloc_size : 0);
}

/*
 * dedup <filesystem>
 *
 * Perform actual deduplication: scan the B-Tree, collect candidate pairs
 * by CRC (falling back to SHA-256 subtrees on CRC collision), and issue
 * HAMMERIOC_DEDUP ioctls.  Entries whose ioctl failed transiently are
 * retried once from the pass2 queue.  Prints accounting on completion.
 */
void
hammer_cmd_dedup(char **av, int ac)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de;
	struct pass2_dedup_entry *pass2_de;
	char *tmp;
	char buf[8];
	int needfree = 0;

	if (TimeoutOpt > 0)
		alarm(TimeoutOpt);

	if (ac != 1) {
		dedup_usage(1);
		/* not reached */
	}

	STAILQ_INIT(&pass2_dedup_queue);

	glob_fd = getpfs(&glob_pfs, av[0]);

	/*
	 * A cycle file is _required_ for resuming dedup after the timeout
	 * specified with -t has expired. If no -c option, then place a
	 * .dedup.cycle file either in the PFS snapshots directory or in
	 * the default snapshots directory.
	 */
	if (!CyclePath) {
		if (glob_pfs.ondisk->snapshots[0] != '/') {
			asprintf(&tmp, "%s/%s/.dedup.cycle",
			    SNAPSHOTS_BASE, av[0]);
		} else {
			asprintf(&tmp, "%s/.dedup.cycle",
			    glob_pfs.ondisk->snapshots);
		}
		CyclePath = tmp;
		needfree = 1;
	}

	/*
	 * Pre-pass to cache the btree
	 */
	scan_pfs(av[0], count_btree_elm, "pre-pass ");
	DedupTotalRecords = DedupCurrentRecords;

	/*
	 * Collection passes (memory limited)
	 */
	printf("Dedup running\n");
	do {
		DedupCrcStart = DedupCrcEnd;
		DedupCrcEnd = 0;
		MemoryUse = 0;

		if (VerboseOpt) {
			printf("B-Tree pass  crc-range %08x-max\n",
				DedupCrcStart);
			fflush(stdout);
		}
		scan_pfs(av[0], process_btree_elm, "main-pass");

		/* Retry entries that failed with a transient ioctl error */
		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
				dedup_skipped_size -= pass2_de->leaf.data_len;

			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
			free(pass2_de);
		}
		assert(STAILQ_EMPTY(&pass2_dedup_queue));

		if (VerboseOpt >= 2)
			dump_real_dedup();

		/*
		 * Calculate dedup ratio and get rid of the trees
		 */
		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
					assert(sha_de->ref_blks != 0);
					dedup_ref_size += sha_de->ref_size;
					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;

					RB_REMOVE(sha_dedup_entry_rb_tree,
					    &de->u.fict_root, sha_de);
					free(sha_de);
				}
				assert(RB_EMPTY(&de->u.fict_root));
			} else {
				assert(de->u.de.ref_blks != 0);
				dedup_ref_size += de->u.de.ref_size;
				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
			}

			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
			free(de);
		}
		assert(RB_EMPTY(&dedup_tree));
		if (DedupCrcEnd && VerboseOpt == 0)
			printf(".");
	} while (DedupCrcEnd);

	printf("Dedup %s succeeded\n", av[0]);
	relpfs(glob_fd, &glob_pfs);

	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
	printf("Dedup ratio = %.2f (in this run)\n"
	       "    %8s referenced\n",
	       ((dedup_alloc_size != 0) ?
			(double)dedup_ref_size / dedup_alloc_size : 0),
	       buf
	);
	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
	printf("    %8s allocated\n", buf);
	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
	printf("    %8s skipped\n", buf);
	printf("    %8jd CRC collisions\n"
	       "    %8jd SHA collisions\n"
	       "    %8jd big-block underflows\n"
	       "    %8jd new dedup records\n"
	       "    %8jd new dedup bytes\n",
	       (intmax_t)dedup_crc_failures,
	       (intmax_t)dedup_sha_failures,
	       (intmax_t)dedup_underflows,
	       (intmax_t)dedup_successes_count,
	       (intmax_t)dedup_successes_bytes
	);

	/* Once completed remove cycle file */
	hammer_reset_cycle();

	/* We don't want to mess up with other directives */
	if (needfree) {
		free(tmp);
		CyclePath = NULL;
	}
}

/*
 * scan_pfs() callback for the pre-pass: records are only counted
 * (DedupCurrentRecords is bumped by scan_pfs itself), the element
 * is otherwise ignored.
 */
static
int
count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
{
	return(1);
}

/*
 * scan_pfs() callback for dedup-simulate: accumulate per-CRC block and
 * byte counts in the sim_dedup_tree.
 */
static
int
collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
{
	struct sim_dedup_entry *sim_de;

	/*
	 * If we are using too much memory we have to clean some out, which
	 * will cause the run to use multiple passes.  Be careful of integer
	 * overflows!
414 */ 415 if (MemoryUse > MemoryLimit) { 416 DedupCrcEnd = DedupCrcStart + 417 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 418 if (VerboseOpt) { 419 printf("memory limit crc-range %08x-%08x\n", 420 DedupCrcStart, DedupCrcEnd); 421 fflush(stdout); 422 } 423 for (;;) { 424 sim_de = RB_MAX(sim_dedup_entry_rb_tree, 425 &sim_dedup_tree); 426 if (sim_de == NULL || sim_de->crc < DedupCrcEnd) 427 break; 428 RB_REMOVE(sim_dedup_entry_rb_tree, 429 &sim_dedup_tree, sim_de); 430 MemoryUse -= sizeof(*sim_de); 431 free(sim_de); 432 } 433 } 434 435 /* 436 * Collect statistics based on the CRC only, do not try to read 437 * any data blocks or run SHA hashes. 438 */ 439 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, 440 scan_leaf->data_crc); 441 442 if (sim_de == NULL) { 443 sim_de = calloc(1, sizeof(*sim_de)); 444 sim_de->crc = scan_leaf->data_crc; 445 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 446 MemoryUse += sizeof(*sim_de); 447 } 448 449 sim_de->ref_blks += 1; 450 sim_de->ref_size += scan_leaf->data_len; 451 return (1); 452 } 453 454 static __inline 455 int 456 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 457 { 458 if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset)) 459 return (1); 460 if (p->data_len != q->data_len) 461 return (1); 462 463 return (0); 464 } 465 466 #define DEDUP_TECH_FAILURE 1 467 #define DEDUP_CMP_FAILURE 2 468 #define DEDUP_INVALID_ZONE 3 469 #define DEDUP_UNDERFLOW 4 470 #define DEDUP_VERS_FAILURE 5 471 472 static __inline 473 int 474 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 475 { 476 struct hammer_ioc_dedup dedup; 477 478 bzero(&dedup, sizeof(dedup)); 479 480 /* 481 * If data_offset fields are the same there is no need to run ioctl, 482 * candidate is already dedup'ed. 
483 */ 484 if (p->data_offset == q->data_offset) 485 return (0); 486 487 dedup.elm1 = p->base; 488 dedup.elm2 = q->base; 489 RunningIoctl = 1; 490 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) { 491 if (errno == EOPNOTSUPP) 492 return (DEDUP_VERS_FAILURE); /* must be at least version 5 */ 493 /* Technical failure - locking or w/e */ 494 return (DEDUP_TECH_FAILURE); 495 } 496 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) 497 return (DEDUP_CMP_FAILURE); 498 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) 499 return (DEDUP_INVALID_ZONE); 500 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) 501 return (DEDUP_UNDERFLOW); 502 RunningIoctl = 0; 503 ++dedup_successes_count; 504 dedup_successes_bytes += p->data_len; 505 return (0); 506 } 507 508 static 509 int 510 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags) 511 { 512 struct dedup_entry *de; 513 struct sha_dedup_entry *sha_de, temp; 514 struct pass2_dedup_entry *pass2_de; 515 int error; 516 517 /* 518 * If we are using too much memory we have to clean some out, which 519 * will cause the run to use multiple passes. Be careful of integer 520 * overflows! 521 */ 522 while (MemoryUse > MemoryLimit) { 523 DedupCrcEnd = DedupCrcStart + 524 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 525 if (VerboseOpt) { 526 printf("memory limit crc-range %08x-%08x\n", 527 DedupCrcStart, DedupCrcEnd); 528 fflush(stdout); 529 } 530 531 for (;;) { 532 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree); 533 if (de == NULL || de->leaf.data_crc < DedupCrcEnd) 534 break; 535 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 536 while ((sha_de = RB_ROOT(&de->u.fict_root)) != 537 NULL) { 538 RB_REMOVE(sha_dedup_entry_rb_tree, 539 &de->u.fict_root, sha_de); 540 MemoryUse -= sizeof(*sha_de); 541 free(sha_de); 542 } 543 } 544 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 545 MemoryUse -= sizeof(*de); 546 free(de); 547 } 548 } 549 550 /* 551 * Collect statistics based on the CRC. 
Colliding CRCs usually 552 * cause a SHA sub-tree to be created under the de. 553 * 554 * Trivial case if de not found. 555 */ 556 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc); 557 if (de == NULL) { 558 de = calloc(1, sizeof(*de)); 559 de->leaf = *scan_leaf; 560 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de); 561 MemoryUse += sizeof(*de); 562 goto upgrade_stats; 563 } 564 565 /* 566 * Found entry in CRC tree 567 */ 568 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 569 /* 570 * Optimize the case where a CRC failure results in multiple 571 * SHA entries. If we unconditionally issue a data-read a 572 * degenerate situation where a colliding CRC's second SHA 573 * entry contains the lion's share of the deduplication 574 * candidates will result in excessive data block reads. 575 * 576 * Deal with the degenerate case by looking for a matching 577 * data_offset/data_len in the SHA elements we already have 578 * before reading the data block and generating a new SHA. 579 */ 580 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 581 if (sha_de->leaf.data_offset == 582 scan_leaf->data_offset && 583 sha_de->leaf.data_len == scan_leaf->data_len) { 584 memcpy(temp.sha_hash, sha_de->sha_hash, 585 SHA256_DIGEST_LENGTH); 586 break; 587 } 588 } 589 590 /* 591 * Entry in CRC tree is fictitious, so we already had problems 592 * with this CRC. Upgrade (compute SHA) the candidate and 593 * dive into SHA subtree. If upgrade fails insert the candidate 594 * into Pass2 list (it will be processed later). 595 */ 596 if (sha_de == NULL) { 597 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 598 goto pass2_insert; 599 600 sha_de = RB_FIND(sha_dedup_entry_rb_tree, 601 &de->u.fict_root, &temp); 602 } 603 604 /* 605 * Nothing in SHA subtree so far, so this is a new 606 * 'dataset'. Insert new entry into SHA subtree. 
607 */ 608 if (sha_de == NULL) { 609 sha_de = calloc(1, sizeof(*sha_de)); 610 sha_de->leaf = *scan_leaf; 611 memcpy(sha_de->sha_hash, temp.sha_hash, 612 SHA256_DIGEST_LENGTH); 613 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, 614 sha_de); 615 MemoryUse += sizeof(*sha_de); 616 goto upgrade_stats_sha; 617 } 618 619 /* 620 * Found entry in SHA subtree, it means we have a potential 621 * dedup pair. Validate it (zones have to match and data_len 622 * field have to be the same too. If validation fails, treat 623 * it as a SHA collision (jump to sha256_failure). 624 */ 625 if (validate_dedup_pair(&sha_de->leaf, scan_leaf)) 626 goto sha256_failure; 627 628 /* 629 * We have a valid dedup pair (SHA match, validated). 630 * 631 * In case of technical failure (dedup pair was good, but 632 * ioctl failed anyways) insert the candidate into Pass2 list 633 * (we will try to dedup it after we are done with the rest of 634 * the tree). 635 * 636 * If ioctl fails because either of blocks is in the non-dedup 637 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't 638 * bother with the candidate and terminate early. 639 * 640 * If ioctl fails because of big-block underflow replace the 641 * leaf node that found dedup entry represents with scan_leaf. 642 */ 643 error = deduplicate(&sha_de->leaf, scan_leaf); 644 switch(error) { 645 case 0: 646 goto upgrade_stats_sha; 647 case DEDUP_TECH_FAILURE: 648 goto pass2_insert; 649 case DEDUP_CMP_FAILURE: 650 goto sha256_failure; 651 case DEDUP_INVALID_ZONE: 652 goto terminate_early; 653 case DEDUP_UNDERFLOW: 654 ++dedup_underflows; 655 sha_de->leaf = *scan_leaf; 656 memcpy(sha_de->sha_hash, temp.sha_hash, 657 SHA256_DIGEST_LENGTH); 658 goto upgrade_stats_sha; 659 case DEDUP_VERS_FAILURE: 660 errx(1, "HAMMER filesystem must be at least " 661 "version 5 to dedup"); 662 /* not reached */ 663 default: 664 fprintf(stderr, "Unknown error\n"); 665 goto terminate_early; 666 } 667 668 /* 669 * Ooh la la.. SHA-256 collision. 
Terminate early, there's 670 * nothing we can do here. 671 */ 672 sha256_failure: 673 ++dedup_sha_failures; 674 goto terminate_early; 675 } else { 676 /* 677 * Candidate CRC is good for now (we found an entry in CRC 678 * tree and it's not fictitious). This means we have a 679 * potential dedup pair. 680 */ 681 if (validate_dedup_pair(&de->leaf, scan_leaf)) 682 goto crc_failure; 683 684 /* 685 * We have a valid dedup pair (CRC match, validated) 686 */ 687 error = deduplicate(&de->leaf, scan_leaf); 688 switch(error) { 689 case 0: 690 goto upgrade_stats; 691 case DEDUP_TECH_FAILURE: 692 goto pass2_insert; 693 case DEDUP_CMP_FAILURE: 694 goto crc_failure; 695 case DEDUP_INVALID_ZONE: 696 goto terminate_early; 697 case DEDUP_UNDERFLOW: 698 ++dedup_underflows; 699 de->leaf = *scan_leaf; 700 goto upgrade_stats; 701 case DEDUP_VERS_FAILURE: 702 errx(1, "HAMMER filesystem must be at least " 703 "version 5 to dedup"); 704 /* not reached */ 705 default: 706 fprintf(stderr, "Unknown error\n"); 707 goto terminate_early; 708 } 709 710 crc_failure: 711 /* 712 * We got a CRC collision - either ioctl failed because of 713 * the comparison failure or validation of the potential 714 * dedup pair went bad. In all cases insert both blocks 715 * into SHA subtree (this requires checksum upgrade) and mark 716 * entry that corresponds to this CRC in the CRC tree 717 * fictitious, so that all futher operations with this CRC go 718 * through SHA subtree. 719 */ 720 ++dedup_crc_failures; 721 722 /* 723 * Insert block that was represented by now fictitious dedup 724 * entry (create a new SHA entry and preserve stats of the 725 * old CRC one). If checksum upgrade fails insert the 726 * candidate into Pass2 list and return - keep both trees 727 * unmodified. 
728 */ 729 sha_de = calloc(1, sizeof(*sha_de)); 730 sha_de->leaf = de->leaf; 731 sha_de->ref_blks = de->u.de.ref_blks; 732 sha_de->ref_size = de->u.de.ref_size; 733 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) { 734 free(sha_de); 735 goto pass2_insert; 736 } 737 MemoryUse += sizeof(*sha_de); 738 739 RB_INIT(&de->u.fict_root); 740 /* 741 * Here we can insert without prior checking because the tree 742 * is empty at this point 743 */ 744 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 745 746 /* 747 * Mark entry in CRC tree fictitious 748 */ 749 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS; 750 751 /* 752 * Upgrade checksum of the candidate and insert it into 753 * SHA subtree. If upgrade fails insert the candidate into 754 * Pass2 list. 755 */ 756 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 757 goto pass2_insert; 758 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, 759 &temp); 760 if (sha_de != NULL) { 761 /* There is an entry with this SHA already, but the only 762 * RB-tree element at this point is that entry we just 763 * added. We know for sure these blocks are different 764 * (this is crc_failure branch) so treat it as SHA 765 * collision. 
766 */ 767 goto sha256_failure; 768 } 769 770 sha_de = calloc(1, sizeof(*sha_de)); 771 sha_de->leaf = *scan_leaf; 772 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH); 773 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 774 MemoryUse += sizeof(*sha_de); 775 goto upgrade_stats_sha; 776 } 777 778 upgrade_stats: 779 de->u.de.ref_blks += 1; 780 de->u.de.ref_size += scan_leaf->data_len; 781 return (1); 782 783 upgrade_stats_sha: 784 sha_de->ref_blks += 1; 785 sha_de->ref_size += scan_leaf->data_len; 786 return (1); 787 788 pass2_insert: 789 /* 790 * If in pass2 mode don't insert anything, fall through to 791 * terminate_early 792 */ 793 if ((flags & DEDUP_PASS2) == 0) { 794 pass2_de = calloc(1, sizeof(*pass2_de)); 795 pass2_de->leaf = *scan_leaf; 796 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry); 797 dedup_skipped_size += scan_leaf->data_len; 798 return (1); 799 } 800 801 terminate_early: 802 /* 803 * Early termination path. Fixup stats. 804 */ 805 dedup_alloc_size += scan_leaf->data_len; 806 dedup_ref_size += scan_leaf->data_len; 807 return (0); 808 } 809 810 static 811 int 812 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash) 813 { 814 struct hammer_ioc_data data; 815 char *buf = malloc(DEDUP_BUF); 816 SHA256_CTX ctx; 817 int error; 818 819 bzero(&data, sizeof(data)); 820 data.elm = leaf->base; 821 data.ubuf = buf; 822 data.size = DEDUP_BUF; 823 824 error = 0; 825 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) { 826 fprintf(stderr, "Get-data failed: %s\n", strerror(errno)); 827 error = 1; 828 goto done; 829 } 830 DedupDataReads += leaf->data_len; 831 832 if (data.leaf.data_len != leaf->data_len) { 833 error = 1; 834 goto done; 835 } 836 837 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD && 838 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) { 839 SHA256_Init(&ctx); 840 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len); 841 SHA256_Final(sha_hash, &ctx); 842 } 843 844 done: 845 free(buf); 846 return 
(error); 847 } 848 849 static 850 void 851 sigAlrm(int signo __unused) 852 { 853 SigAlrmFlag = 1; 854 } 855 856 static 857 void 858 sigInfo(int signo __unused) 859 { 860 SigInfoFlag = 1; 861 } 862 863 static 864 void 865 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id) 866 { 867 struct hammer_ioc_mirror_rw mirror; 868 hammer_ioc_mrecord_any_t mrec; 869 struct hammer_btree_leaf_elm elm; 870 char *buf = malloc(DEDUP_BUF); 871 char buf1[8]; 872 char buf2[8]; 873 int offset, bytes; 874 875 SigInfoFlag = 0; 876 DedupDataReads = 0; 877 DedupCurrentRecords = 0; 878 signal(SIGINFO, sigInfo); 879 signal(SIGALRM, sigAlrm); 880 881 /* 882 * Deduplication happens per element so hammer(8) is in full 883 * control of the ioctl()s to actually perform it. SIGALRM 884 * needs to be handled within hammer(8) but a checkpoint 885 * is needed for resuming. Use cycle file for that. 886 * 887 * Try to obtain the previous obj_id from the cycle file and 888 * if not available just start from the beginning. 
889 */ 890 bzero(&mirror, sizeof(mirror)); 891 hammer_key_beg_init(&mirror.key_beg); 892 hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg); 893 894 if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) { 895 if (VerboseOpt) { 896 fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n", 897 id, (uintmax_t)mirror.key_beg.obj_id); 898 } 899 } 900 901 hammer_key_end_init(&mirror.key_end); 902 903 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid; 904 mirror.tid_end = glob_pfs.ondisk->sync_end_tid; 905 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */ 906 mirror.ubuf = buf; 907 mirror.size = DEDUP_BUF; 908 mirror.pfs_id = glob_pfs.pfs_id; 909 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 910 911 if (VerboseOpt && DedupCrcStart == 0) { 912 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n", 913 id, filesystem, 914 (uintmax_t)mirror.key_beg.obj_id, 915 mirror.key_beg.localization, 916 (uintmax_t)mirror.key_end.obj_id, 917 mirror.key_end.localization); 918 printf("%s %s: pfs_id %d\n", 919 id, filesystem, glob_pfs.pfs_id); 920 } 921 fflush(stdout); 922 fflush(stderr); 923 924 do { 925 mirror.count = 0; 926 mirror.pfs_id = glob_pfs.pfs_id; 927 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 928 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) { 929 err(1, "Mirror-read %s failed", filesystem); 930 /* not reached */ 931 } 932 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) { 933 errx(1, "Mirror-read %s fatal error %d", 934 filesystem, mirror.head.error); 935 /* not reached */ 936 } 937 if (mirror.count) { 938 offset = 0; 939 while (offset < mirror.count) { 940 mrec = (void *)((char *)buf + offset); 941 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size); 942 if (offset + bytes > mirror.count) { 943 errx(1, "Misaligned record"); 944 /* not reached */ 945 } 946 assert((mrec->head.type & 947 HAMMER_MRECF_TYPE_MASK) == 948 HAMMER_MREC_TYPE_REC); 949 offset += bytes; 950 elm = mrec->rec.leaf; 951 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD) 
952 continue; 953 if (elm.base.rec_type != HAMMER_RECTYPE_DATA) 954 continue; 955 ++DedupCurrentRecords; 956 if (DedupCrcStart != DedupCrcEnd) { 957 if (elm.data_crc < DedupCrcStart) 958 continue; 959 if (DedupCrcEnd && 960 elm.data_crc >= DedupCrcEnd) { 961 continue; 962 } 963 } 964 func(&elm, 0); 965 } 966 } 967 mirror.key_beg = mirror.key_cur; 968 if (DidInterrupt || SigAlrmFlag) { 969 if (VerboseOpt) { 970 fprintf(stderr, "%s\n", 971 (DidInterrupt ? "Interrupted" : "Timeout")); 972 } 973 hammer_set_cycle(&mirror.key_cur, mirror.tid_beg); 974 if (VerboseOpt) { 975 fprintf(stderr, "Cyclefile %s updated for " 976 "continuation\n", CyclePath); 977 } 978 exit(1); 979 } 980 if (SigInfoFlag) { 981 if (DedupTotalRecords) { 982 humanize_unsigned(buf1, sizeof(buf1), 983 DedupDataReads, 984 "B", 1024); 985 humanize_unsigned(buf2, sizeof(buf2), 986 dedup_successes_bytes, 987 "B", 1024); 988 fprintf(stderr, "%s count %7jd/%jd " 989 "(%02d.%02d%%) " 990 "ioread %s newddup %s\n", 991 id, 992 (intmax_t)DedupCurrentRecords, 993 (intmax_t)DedupTotalRecords, 994 (int)(DedupCurrentRecords * 100 / 995 DedupTotalRecords), 996 (int)(DedupCurrentRecords * 10000 / 997 DedupTotalRecords % 100), 998 buf1, buf2); 999 } else { 1000 fprintf(stderr, "%s count %-7jd\n", 1001 id, 1002 (intmax_t)DedupCurrentRecords); 1003 } 1004 SigInfoFlag = 0; 1005 } 1006 } while (mirror.count != 0); 1007 1008 signal(SIGINFO, SIG_IGN); 1009 signal(SIGALRM, SIG_IGN); 1010 1011 free(buf); 1012 } 1013 1014 static 1015 void 1016 dump_simulated_dedup(void) 1017 { 1018 struct sim_dedup_entry *sim_de; 1019 1020 printf("=== Dumping simulated dedup entries:\n"); 1021 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) { 1022 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1023 sim_de->crc, 1024 (intmax_t)sim_de->ref_blks, 1025 (intmax_t)sim_de->ref_size); 1026 } 1027 printf("end of dump ===\n"); 1028 } 1029 1030 static 1031 void 1032 dump_real_dedup(void) 1033 { 1034 struct dedup_entry *de; 1035 struct 
sha_dedup_entry *sha_de; 1036 int i; 1037 1038 printf("=== Dumping dedup entries:\n"); 1039 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) { 1040 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 1041 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc); 1042 1043 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 1044 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t" 1045 "\t\tsha=", 1046 sha_de->leaf.data_crc, 1047 (intmax_t)sha_de->ref_blks, 1048 (intmax_t)sha_de->ref_size); 1049 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i) 1050 printf("%02x", sha_de->sha_hash[i]); 1051 printf("\n"); 1052 } 1053 } else { 1054 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1055 de->leaf.data_crc, 1056 (intmax_t)de->u.de.ref_blks, 1057 (intmax_t)de->u.de.ref_size); 1058 } 1059 } 1060 printf("end of dump ===\n"); 1061 } 1062 1063 static 1064 void 1065 dedup_usage(int code) 1066 { 1067 fprintf(stderr, 1068 "hammer dedup-simulate <filesystem>\n" 1069 "hammer dedup <filesystem>\n" 1070 ); 1071 exit(code); 1072 } 1073