1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy * CDDL HEADER START
3eda14cbcSMatt Macy *
4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy *
8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy * See the License for the specific language governing permissions
11eda14cbcSMatt Macy * and limitations under the License.
12eda14cbcSMatt Macy *
13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy *
19eda14cbcSMatt Macy * CDDL HEADER END
20eda14cbcSMatt Macy */
21eda14cbcSMatt Macy
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24eda14cbcSMatt Macy * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
252a58b312SMartin Matuska * Copyright (c) 2022 by Pawel Jakub Dawidek
26ce4dcb97SMartin Matuska * Copyright (c) 2019, 2023, Klara Inc.
27eda14cbcSMatt Macy */
28eda14cbcSMatt Macy
29eda14cbcSMatt Macy #include <sys/zfs_context.h>
30eda14cbcSMatt Macy #include <sys/spa.h>
31eda14cbcSMatt Macy #include <sys/spa_impl.h>
32eda14cbcSMatt Macy #include <sys/zio.h>
33eda14cbcSMatt Macy #include <sys/ddt.h>
344fefe1b7SMartin Matuska #include <sys/ddt_impl.h>
35eda14cbcSMatt Macy #include <sys/zap.h>
36eda14cbcSMatt Macy #include <sys/dmu_tx.h>
37eda14cbcSMatt Macy #include <sys/arc.h>
38eda14cbcSMatt Macy #include <sys/dsl_pool.h>
39eda14cbcSMatt Macy #include <sys/zio_checksum.h>
40eda14cbcSMatt Macy #include <sys/dsl_scan.h>
41eda14cbcSMatt Macy #include <sys/abd.h>
42*e2df9bb4SMartin Matuska #include <sys/zfeature.h>
43eda14cbcSMatt Macy
444fefe1b7SMartin Matuska /*
454fefe1b7SMartin Matuska * # DDT: Deduplication tables
464fefe1b7SMartin Matuska *
474fefe1b7SMartin Matuska * The dedup subsystem provides block-level deduplication. When enabled, blocks
484fefe1b7SMartin Matuska * to be written will have the dedup (D) bit set, which causes them to be
494fefe1b7SMartin Matuska * tracked in a "dedup table", or DDT. If a block has been seen before (exists
504fefe1b7SMartin Matuska * in the DDT), it will not be written again; instead, it will be made to
514fefe1b7SMartin Matuska * reference the existing on-disk data, and its refcount in the DDT is bumped.
524fefe1b7SMartin Matuska *
534fefe1b7SMartin Matuska * ## Dedup tables and entries
544fefe1b7SMartin Matuska *
554fefe1b7SMartin Matuska * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
564fefe1b7SMartin Matuska * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
574fefe1b7SMartin Matuska * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
584fefe1b7SMartin Matuska * time and refcount. Together these are enough to track references to a
594fefe1b7SMartin Matuska * specific block, to build a valid block pointer to reference that block (for
604fefe1b7SMartin Matuska * freeing, scrubbing, etc), and to fill a new block pointer with the missing
614fefe1b7SMartin Matuska * pieces to make it seem like it was written.
624fefe1b7SMartin Matuska *
634fefe1b7SMartin Matuska * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
644fefe1b7SMartin Matuska * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
654fefe1b7SMartin Matuska * object data formats, each with their own implementations) and "classes"
664fefe1b7SMartin Matuska * (ddt_class_t, an instance of a storage type object, for entries with a specific
674fefe1b7SMartin Matuska * characteristic). An entry (key) will only ever exist on one of these objects
684fefe1b7SMartin Matuska * at any given time, but may be moved from one to another if its type or
694fefe1b7SMartin Matuska * class changes.
704fefe1b7SMartin Matuska *
714fefe1b7SMartin Matuska * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
724fefe1b7SMartin Matuska * is to be written, before DVAs have been allocated, ddt_lookup() is called to
734fefe1b7SMartin Matuska * see if the block has been seen before. If it's not found, the write proceeds
744fefe1b7SMartin Matuska * as normal, and after it succeeds, a new entry is created. If it is found, we
754fefe1b7SMartin Matuska * fill the BP with the DVAs from the entry, increment the refcount and cause
764fefe1b7SMartin Matuska * the write IO to return immediately.
774fefe1b7SMartin Matuska *
78*e2df9bb4SMartin Matuska * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
79*e2df9bb4SMartin Matuska * block for the same content/checksum. The slot is selected based on the
80*e2df9bb4SMartin Matuska * zp_copies parameter the block is written with, that is, the number of DVAs
81*e2df9bb4SMartin Matuska * in the block. The "ditto" slot (DDT_PHYS_DITTO) was used for the
82*e2df9bb4SMartin Matuska * now-removed "dedupditto" feature. These are no longer written, and will be
83*e2df9bb4SMartin Matuska * freed if encountered on old pools.
84*e2df9bb4SMartin Matuska *
85*e2df9bb4SMartin Matuska * If the "fast_dedup" feature is enabled, new dedup tables will be created
86*e2df9bb4SMartin Matuska * with the "flat phys" option. In this mode, there is only one ddt_phys_t
87*e2df9bb4SMartin Matuska * slot. If a write is issued for an entry that exists, but has fewer DVAs,
88*e2df9bb4SMartin Matuska * then only enough new DVAs are allocated and written to make up the
89*e2df9bb4SMartin Matuska * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
90*e2df9bb4SMartin Matuska * new DVAs.
914fefe1b7SMartin Matuska *
924fefe1b7SMartin Matuska * ## Lifetime of an entry
934fefe1b7SMartin Matuska *
944fefe1b7SMartin Matuska * A DDT can be enormous, and typically is not held in memory all at once.
954fefe1b7SMartin Matuska * Instead, the changes to an entry are tracked in memory, and written down to
964fefe1b7SMartin Matuska * disk at the end of each txg.
974fefe1b7SMartin Matuska *
984fefe1b7SMartin Matuska * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
994fefe1b7SMartin Matuska * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
1004fefe1b7SMartin Matuska * required for IO, ddt_lookup() is called. If an entry already exists on
1014fefe1b7SMartin Matuska * ddt_tree, it is returned. Otherwise, a new one is created, and the
1024fefe1b7SMartin Matuska * type/class objects for the DDT are searched for that key. If it's found, its
1034fefe1b7SMartin Matuska * value is copied into the live entry. If not, an empty entry is created.
1044fefe1b7SMartin Matuska *
1054fefe1b7SMartin Matuska * The live entry will be modified during the txg, usually by modifying the
1064fefe1b7SMartin Matuska * refcount, but sometimes by adding or updating DVAs. At the end of the txg
1074fefe1b7SMartin Matuska * (during spa_sync()), type and class are recalculated for the entry (see
1084fefe1b7SMartin Matuska * ddt_sync_entry()), and the entry is written to the appropriate storage
1094fefe1b7SMartin Matuska * object and (if necessary) removed from an old one. ddt_tree is cleared and
1104fefe1b7SMartin Matuska * the next txg can start.
1114fefe1b7SMartin Matuska *
112ce4dcb97SMartin Matuska * ## Dedup quota
113ce4dcb97SMartin Matuska *
114ce4dcb97SMartin Matuska * A maximum size for all DDTs on the pool can be set with the
115ce4dcb97SMartin Matuska * dedup_table_quota property. This is determined in ddt_over_quota() and
116ce4dcb97SMartin Matuska * enforced during ddt_lookup(). If the pool is at or over its quota limit,
117ce4dcb97SMartin Matuska * ddt_lookup() will only return entries for existing blocks, as updates are
118ce4dcb97SMartin Matuska * still possible. New entries will not be created; instead, ddt_lookup() will
119ce4dcb97SMartin Matuska * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
120ce4dcb97SMartin Matuska * the D bit on the block and reissue the IO as a regular write. The block will
121ce4dcb97SMartin Matuska * not be deduplicated.
122ce4dcb97SMartin Matuska *
123ce4dcb97SMartin Matuska * Note that this is based on the on-disk size of the dedup store. Reclaiming
124ce4dcb97SMartin Matuska * this space after deleting entries relies on the ZAP "shrinking" behaviour,
125ce4dcb97SMartin Matuska * without which no space would be recovered and the DDT would continue to be
126ce4dcb97SMartin Matuska * considered "over quota". See zap_shrink_enabled.
127ce4dcb97SMartin Matuska *
128*e2df9bb4SMartin Matuska * ## Dedup table pruning
129*e2df9bb4SMartin Matuska *
130*e2df9bb4SMartin Matuska * As a complement to the dedup quota feature, ddtprune allows removal of older
131*e2df9bb4SMartin Matuska * non-duplicate entries to make room for newer duplicate entries. The amount
132*e2df9bb4SMartin Matuska * to prune can be based on a target percentage of the unique entries or based
133*e2df9bb4SMartin Matuska * on the age (i.e., prune unique entries older than N days).
134*e2df9bb4SMartin Matuska *
135*e2df9bb4SMartin Matuska * ## Dedup log
136*e2df9bb4SMartin Matuska *
137*e2df9bb4SMartin Matuska * Historically, all entries modified in a txg were written back to dedup
138*e2df9bb4SMartin Matuska * storage objects at the end of every txg. This could cause significant
139*e2df9bb4SMartin Matuska * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
140*e2df9bb4SMartin Matuska * and so required reading the whole node, updating the entry, and writing it
141*e2df9bb4SMartin Matuska * back. On busy pools, this could add serious IO and memory overheads.
142*e2df9bb4SMartin Matuska *
143*e2df9bb4SMartin Matuska * To address this, the dedup log was added. If the "fast_dedup" feature is
144*e2df9bb4SMartin Matuska * enabled, at the end of each txg, modified entries will be copied to an
145*e2df9bb4SMartin Matuska * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
146*e2df9bb4SMartin Matuska * same block is requested again, the in-memory object will be checked first,
147*e2df9bb4SMartin Matuska * and if it's there, the entry is inflated back onto the live tree without
148*e2df9bb4SMartin Matuska * going to storage. The on-disk log is only read at pool import time, to
149*e2df9bb4SMartin Matuska * reload the in-memory log.
150*e2df9bb4SMartin Matuska *
151*e2df9bb4SMartin Matuska * Each txg, some amount of the in-memory log will be flushed out to a DDT
152*e2df9bb4SMartin Matuska * storage object (i.e. a ZAP) as normal. OpenZFS will try hard to flush
153*e2df9bb4SMartin Matuska * enough to keep up with the rate of change on dedup entries, but not so much
154*e2df9bb4SMartin Matuska * that it would impact overall throughput or use too much memory. See the
155*e2df9bb4SMartin Matuska * zfs_dedup_log_* tuneables in zfs(4) for more details.
156*e2df9bb4SMartin Matuska *
1574fefe1b7SMartin Matuska * ## Repair IO
1584fefe1b7SMartin Matuska *
1594fefe1b7SMartin Matuska * If a read on a dedup block fails, but there are other copies of the block in
1604fefe1b7SMartin Matuska * the other ddt_phys_t slots, reads will be issued for those instead
1614fefe1b7SMartin Matuska * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
1624fefe1b7SMartin Matuska * the caller, and a copy is stashed on the entry's dde_repair_abd.
1634fefe1b7SMartin Matuska *
1644fefe1b7SMartin Matuska * During the end-of-txg sync, any entries with a dde_repair_abd get a
1654fefe1b7SMartin Matuska * "rewrite" write issued for the original block pointer, with the data read
1664fefe1b7SMartin Matuska * from the alternate block. If the block is actually damaged, this will invoke
1674fefe1b7SMartin Matuska * the pool's "self-healing" mechanism, and repair the block.
1684fefe1b7SMartin Matuska *
169*e2df9bb4SMartin Matuska * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
170*e2df9bb4SMartin Matuska * use, so there is only ever one ddt_phys_t slot. The repair process will
171*e2df9bb4SMartin Matuska * still happen in this case, though it is unlikely to succeed as there will
172*e2df9bb4SMartin Matuska * usually be no other equivalent blocks to fall back on (though there might
173*e2df9bb4SMartin Matuska * be, if this was an early version of a dedup'd block that has since been
174*e2df9bb4SMartin Matuska * extended).
175*e2df9bb4SMartin Matuska *
176*e2df9bb4SMartin Matuska * Note that this repair mechanism is in addition to and separate from the
177*e2df9bb4SMartin Matuska * regular OpenZFS scrub and self-healing mechanisms.
178*e2df9bb4SMartin Matuska *
1794fefe1b7SMartin Matuska * ## Scanning (scrub/resilver)
1804fefe1b7SMartin Matuska *
1814fefe1b7SMartin Matuska * If dedup is active, the scrub machinery will walk the dedup table first,
1824fefe1b7SMartin Matuska * scrubbing all blocks with refcnt > 1. After that it will move on to the
1834fefe1b7SMartin Matuska * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
1844fefe1b7SMartin Matuska * In this way, heavily deduplicated blocks are only scrubbed once. See the
1854fefe1b7SMartin Matuska * commentary on dsl_scan_ddt() for more details.
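 *
 * As a rough sketch only (the real check lives in dsl_scan_scrub_cb(), and
 * the names here are simplified), the exclusion on the top-down side amounts
 * to:
 *
 *	if (BP_GET_DEDUP(bp) && ddt_class_contains(spa, max_class, bp))
 *		return;		<- already covered by the earlier DDT walk
 *
 * where max_class stands for the deepest DDT class that the dedup phase of
 * the scan covered.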
1864fefe1b7SMartin Matuska *
1874fefe1b7SMartin Matuska * Walking the DDT is done via ddt_walk(). The current position is stored in a
1884fefe1b7SMartin Matuska * ddt_bookmark_t, which represents a stable position in the storage object.
1894fefe1b7SMartin Matuska * This bookmark is stored by the scan machinery, and must reference the same
1904fefe1b7SMartin Matuska * position on the object even if the object changes, the pool is exported, or
1914fefe1b7SMartin Matuska * OpenZFS is upgraded.
1924fefe1b7SMartin Matuska *
193*e2df9bb4SMartin Matuska * If the "fast_dedup" feature is enabled and the table has a log, the scan
194*e2df9bb4SMartin Matuska * cannot begin until entries on the log are flushed, as the on-disk log has no
195*e2df9bb4SMartin Matuska * concept of a "stable position". Instead, the log flushing process will enter
196*e2df9bb4SMartin Matuska * a more aggressive mode, to flush out as much as is necessary as soon as
197*e2df9bb4SMartin Matuska * possible, so that the scan can begin promptly.
198*e2df9bb4SMartin Matuska *
1994fefe1b7SMartin Matuska * ## Interaction with block cloning
2004fefe1b7SMartin Matuska *
2014fefe1b7SMartin Matuska * If block cloning and dedup are both enabled on a pool, BRT will look for the
2024fefe1b7SMartin Matuska * dedup bit on an incoming block pointer. If set, it will call into the DDT
2034fefe1b7SMartin Matuska * (ddt_addref()) to add a reference to the block, instead of adding a
2044fefe1b7SMartin Matuska * reference to the BRT. See brt_pending_apply().
2054fefe1b7SMartin Matuska */
2064fefe1b7SMartin Matuska
2074fefe1b7SMartin Matuska /*
2084fefe1b7SMartin Matuska * These are the only checksums valid for dedup. They must match the list
2094fefe1b7SMartin Matuska * from dedup_table in zfs_prop.c
2104fefe1b7SMartin Matuska */
2114fefe1b7SMartin Matuska #define DDT_CHECKSUM_VALID(c) \
2124fefe1b7SMartin Matuska (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
2134fefe1b7SMartin Matuska c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
2144fefe1b7SMartin Matuska c == ZIO_CHECKSUM_BLAKE3)
2154fefe1b7SMartin Matuska
216eda14cbcSMatt Macy static kmem_cache_t *ddt_cache;
217*e2df9bb4SMartin Matuska
218*e2df9bb4SMartin Matuska static kmem_cache_t *ddt_entry_flat_cache;
219*e2df9bb4SMartin Matuska static kmem_cache_t *ddt_entry_trad_cache;
220*e2df9bb4SMartin Matuska
221*e2df9bb4SMartin Matuska #define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
222*e2df9bb4SMartin Matuska #define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
223*e2df9bb4SMartin Matuska
224*e2df9bb4SMartin Matuska #define DDT_ENTRY_SIZE(ddt) \
225*e2df9bb4SMartin Matuska _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
226eda14cbcSMatt Macy
227eda14cbcSMatt Macy /*
228eda14cbcSMatt Macy * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
229eda14cbcSMatt Macy */
230eda14cbcSMatt Macy int zfs_dedup_prefetch = 0;
231eda14cbcSMatt Macy
232ce4dcb97SMartin Matuska /*
233ce4dcb97SMartin Matuska * If the dedup class cannot satisfy a DDT allocation, treat as over quota
234ce4dcb97SMartin Matuska * for this many TXGs.
235ce4dcb97SMartin Matuska */
236ce4dcb97SMartin Matuska uint_t dedup_class_wait_txgs = 5;
237ce4dcb97SMartin Matuska
238*e2df9bb4SMartin Matuska /*
239*e2df9bb4SMartin Matuska * How many DDT prune entries to add to the DDT sync AVL tree.
240*e2df9bb4SMartin Matuska * Note that these additional entries have a memory footprint of a
241*e2df9bb4SMartin Matuska * ddt_entry_t (216 bytes).
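 * For example, at the default zfs_ddt_prunes_per_txg of 50000 entries, that
 * works out to roughly 50000 * 216 bytes, i.e. about 10 MiB of additional
 * memory for a txg in which pruning runs.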
242*e2df9bb4SMartin Matuska */ 243*e2df9bb4SMartin Matuska static uint32_t zfs_ddt_prunes_per_txg = 50000; 244*e2df9bb4SMartin Matuska 245*e2df9bb4SMartin Matuska /* 246*e2df9bb4SMartin Matuska * For testing, synthesize aged DDT entries 247*e2df9bb4SMartin Matuska * (in global scope for ztest) 248*e2df9bb4SMartin Matuska */ 249*e2df9bb4SMartin Matuska boolean_t ddt_prune_artificial_age = B_FALSE; 250*e2df9bb4SMartin Matuska boolean_t ddt_dump_prune_histogram = B_FALSE; 251*e2df9bb4SMartin Matuska 252*e2df9bb4SMartin Matuska /* 253*e2df9bb4SMartin Matuska * Don't do more than this many incremental flush passes per txg. 254*e2df9bb4SMartin Matuska */ 255*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_flush_passes_max = 8; 256*e2df9bb4SMartin Matuska 257*e2df9bb4SMartin Matuska /* 258*e2df9bb4SMartin Matuska * Minimum time to flush per txg. 259*e2df9bb4SMartin Matuska */ 260*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_flush_min_time_ms = 1000; 261*e2df9bb4SMartin Matuska 262*e2df9bb4SMartin Matuska /* 263*e2df9bb4SMartin Matuska * Minimum entries to flush per txg. 264*e2df9bb4SMartin Matuska */ 265*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_flush_entries_min = 1000; 266*e2df9bb4SMartin Matuska 267*e2df9bb4SMartin Matuska /* 268*e2df9bb4SMartin Matuska * Number of txgs to average flow rates across. 269*e2df9bb4SMartin Matuska */ 270*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; 271ce4dcb97SMartin Matuska 272da5137abSMartin Matuska static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { 273eda14cbcSMatt Macy &ddt_zap_ops, 274eda14cbcSMatt Macy }; 275eda14cbcSMatt Macy 276da5137abSMartin Matuska static const char *const ddt_class_name[DDT_CLASSES] = { 277eda14cbcSMatt Macy "ditto", 278eda14cbcSMatt Macy "duplicate", 279eda14cbcSMatt Macy "unique", 280eda14cbcSMatt Macy }; 281eda14cbcSMatt Macy 282*e2df9bb4SMartin Matuska /* 283*e2df9bb4SMartin Matuska * DDT feature flags automatically enabled for each on-disk version. Note that 284*e2df9bb4SMartin Matuska * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. 
285*e2df9bb4SMartin Matuska */ 286*e2df9bb4SMartin Matuska static const uint64_t ddt_version_flags[] = { 287*e2df9bb4SMartin Matuska [DDT_VERSION_LEGACY] = 0, 288*e2df9bb4SMartin Matuska [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, 289*e2df9bb4SMartin Matuska }; 290*e2df9bb4SMartin Matuska 291*e2df9bb4SMartin Matuska /* per-DDT kstats */ 292*e2df9bb4SMartin Matuska typedef struct { 293*e2df9bb4SMartin Matuska /* total lookups and whether they returned new or existing entries */ 294*e2df9bb4SMartin Matuska kstat_named_t dds_lookup; 295*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_new; 296*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_existing; 297*e2df9bb4SMartin Matuska 298*e2df9bb4SMartin Matuska /* entries found on live tree, and if we had to wait for load */ 299*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_live_hit; 300*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_live_wait; 301*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_live_miss; 302*e2df9bb4SMartin Matuska 303*e2df9bb4SMartin Matuska /* entries found on log trees */ 304*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_log_hit; 305*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_log_active_hit; 306*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_log_flushing_hit; 307*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_log_miss; 308*e2df9bb4SMartin Matuska 309*e2df9bb4SMartin Matuska /* entries found on store objects */ 310*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_stored_hit; 311*e2df9bb4SMartin Matuska kstat_named_t dds_lookup_stored_miss; 312*e2df9bb4SMartin Matuska 313*e2df9bb4SMartin Matuska /* number of entries on log trees */ 314*e2df9bb4SMartin Matuska kstat_named_t dds_log_active_entries; 315*e2df9bb4SMartin Matuska kstat_named_t dds_log_flushing_entries; 316*e2df9bb4SMartin Matuska 317*e2df9bb4SMartin Matuska /* avg updated/flushed entries per txg */ 318*e2df9bb4SMartin Matuska kstat_named_t dds_log_ingest_rate; 319*e2df9bb4SMartin Matuska kstat_named_t dds_log_flush_rate; 320*e2df9bb4SMartin Matuska kstat_named_t dds_log_flush_time_rate; 321*e2df9bb4SMartin Matuska } ddt_kstats_t; 322*e2df9bb4SMartin Matuska 323*e2df9bb4SMartin Matuska static const ddt_kstats_t ddt_kstats_template = { 324*e2df9bb4SMartin Matuska { "lookup", KSTAT_DATA_UINT64 }, 325*e2df9bb4SMartin Matuska { "lookup_new", KSTAT_DATA_UINT64 }, 326*e2df9bb4SMartin Matuska { "lookup_existing", KSTAT_DATA_UINT64 }, 327*e2df9bb4SMartin Matuska { "lookup_live_hit", KSTAT_DATA_UINT64 }, 328*e2df9bb4SMartin Matuska { "lookup_live_wait", KSTAT_DATA_UINT64 }, 329*e2df9bb4SMartin Matuska { "lookup_live_miss", KSTAT_DATA_UINT64 }, 330*e2df9bb4SMartin Matuska { "lookup_log_hit", KSTAT_DATA_UINT64 }, 331*e2df9bb4SMartin Matuska { "lookup_log_active_hit", KSTAT_DATA_UINT64 }, 332*e2df9bb4SMartin Matuska { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 }, 333*e2df9bb4SMartin Matuska { "lookup_log_miss", KSTAT_DATA_UINT64 }, 334*e2df9bb4SMartin Matuska { "lookup_stored_hit", KSTAT_DATA_UINT64 }, 335*e2df9bb4SMartin Matuska { "lookup_stored_miss", KSTAT_DATA_UINT64 }, 336*e2df9bb4SMartin Matuska { "log_active_entries", KSTAT_DATA_UINT64 }, 337*e2df9bb4SMartin Matuska { "log_flushing_entries", KSTAT_DATA_UINT64 }, 338*e2df9bb4SMartin Matuska { "log_ingest_rate", KSTAT_DATA_UINT32 }, 339*e2df9bb4SMartin Matuska { "log_flush_rate", KSTAT_DATA_UINT32 }, 340*e2df9bb4SMartin Matuska { "log_flush_time_rate", KSTAT_DATA_UINT32 }, 341*e2df9bb4SMartin Matuska }; 342*e2df9bb4SMartin Matuska 343*e2df9bb4SMartin Matuska #ifdef _KERNEL 344*e2df9bb4SMartin 
Matuska #define _DDT_KSTAT_STAT(ddt, stat) \ 345*e2df9bb4SMartin Matuska &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 346*e2df9bb4SMartin Matuska #define DDT_KSTAT_BUMP(ddt, stat) \ 347*e2df9bb4SMartin Matuska do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0) 348*e2df9bb4SMartin Matuska #define DDT_KSTAT_ADD(ddt, stat, val) \ 349*e2df9bb4SMartin Matuska do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) 350*e2df9bb4SMartin Matuska #define DDT_KSTAT_SUB(ddt, stat, val) \ 351*e2df9bb4SMartin Matuska do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) 352*e2df9bb4SMartin Matuska #define DDT_KSTAT_SET(ddt, stat, val) \ 353*e2df9bb4SMartin Matuska do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) 354*e2df9bb4SMartin Matuska #define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0) 355*e2df9bb4SMartin Matuska #else 356*e2df9bb4SMartin Matuska #define DDT_KSTAT_BUMP(ddt, stat) do {} while (0) 357*e2df9bb4SMartin Matuska #define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0) 358*e2df9bb4SMartin Matuska #define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0) 359*e2df9bb4SMartin Matuska #define DDT_KSTAT_SET(ddt, stat, val) do {} while (0) 360*e2df9bb4SMartin Matuska #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) 361*e2df9bb4SMartin Matuska #endif /* _KERNEL */ 362*e2df9bb4SMartin Matuska 363*e2df9bb4SMartin Matuska 364eda14cbcSMatt Macy static void 3654fefe1b7SMartin Matuska ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 366eda14cbcSMatt Macy dmu_tx_t *tx) 367eda14cbcSMatt Macy { 368eda14cbcSMatt Macy spa_t *spa = ddt->ddt_spa; 369eda14cbcSMatt Macy objset_t *os = ddt->ddt_os; 370eda14cbcSMatt Macy uint64_t *objectp = &ddt->ddt_object[type][class]; 371eda14cbcSMatt Macy boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & 372eda14cbcSMatt Macy ZCHECKSUM_FLAG_DEDUP; 373eda14cbcSMatt Macy char name[DDT_NAMELEN]; 374eda14cbcSMatt Macy 375*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_dir_object, >, 0); 376*e2df9bb4SMartin Matuska 377eda14cbcSMatt Macy ddt_object_name(ddt, type, class, name); 378eda14cbcSMatt Macy 3794fefe1b7SMartin Matuska ASSERT3U(*objectp, ==, 0); 3804fefe1b7SMartin Matuska VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); 3814fefe1b7SMartin Matuska ASSERT3U(*objectp, !=, 0); 382eda14cbcSMatt Macy 383*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); 384*e2df9bb4SMartin Matuska 385*e2df9bb4SMartin Matuska VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, 386*e2df9bb4SMartin Matuska objectp, tx)); 387eda14cbcSMatt Macy 3884fefe1b7SMartin Matuska VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, 389eda14cbcSMatt Macy sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 3904fefe1b7SMartin Matuska &ddt->ddt_histogram[type][class], tx)); 391eda14cbcSMatt Macy } 392eda14cbcSMatt Macy 393eda14cbcSMatt Macy static void 3944fefe1b7SMartin Matuska ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 395eda14cbcSMatt Macy dmu_tx_t *tx) 396eda14cbcSMatt Macy { 397eda14cbcSMatt Macy spa_t *spa = ddt->ddt_spa; 398eda14cbcSMatt Macy objset_t *os = ddt->ddt_os; 399eda14cbcSMatt Macy uint64_t *objectp = &ddt->ddt_object[type][class]; 400eda14cbcSMatt Macy uint64_t count; 401eda14cbcSMatt Macy char name[DDT_NAMELEN]; 402eda14cbcSMatt Macy 403*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_dir_object, >, 0); 404*e2df9bb4SMartin Matuska 405eda14cbcSMatt Macy ddt_object_name(ddt, type, class, name); 
406eda14cbcSMatt Macy 4074fefe1b7SMartin Matuska ASSERT3U(*objectp, !=, 0); 408eda14cbcSMatt Macy ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 4094fefe1b7SMartin Matuska VERIFY0(ddt_object_count(ddt, type, class, &count)); 4104fefe1b7SMartin Matuska VERIFY0(count); 411*e2df9bb4SMartin Matuska VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); 4124fefe1b7SMartin Matuska VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); 4134fefe1b7SMartin Matuska VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); 414da5137abSMartin Matuska memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); 415eda14cbcSMatt Macy 416eda14cbcSMatt Macy *objectp = 0; 417eda14cbcSMatt Macy } 418eda14cbcSMatt Macy 419eda14cbcSMatt Macy static int 4204fefe1b7SMartin Matuska ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 421eda14cbcSMatt Macy { 422eda14cbcSMatt Macy ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 423eda14cbcSMatt Macy dmu_object_info_t doi; 424eda14cbcSMatt Macy uint64_t count; 425eda14cbcSMatt Macy char name[DDT_NAMELEN]; 426eda14cbcSMatt Macy int error; 427eda14cbcSMatt Macy 428*e2df9bb4SMartin Matuska if (ddt->ddt_dir_object == 0) { 429*e2df9bb4SMartin Matuska /* 430*e2df9bb4SMartin Matuska * If we're configured but the containing dir doesn't exist 431*e2df9bb4SMartin Matuska * yet, then this object can't possibly exist either. 432*e2df9bb4SMartin Matuska */ 433*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); 434*e2df9bb4SMartin Matuska return (SET_ERROR(ENOENT)); 435*e2df9bb4SMartin Matuska } 436*e2df9bb4SMartin Matuska 437eda14cbcSMatt Macy ddt_object_name(ddt, type, class, name); 438eda14cbcSMatt Macy 439*e2df9bb4SMartin Matuska error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, 440eda14cbcSMatt Macy sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 441eda14cbcSMatt Macy if (error != 0) 442eda14cbcSMatt Macy return (error); 443eda14cbcSMatt Macy 444eda14cbcSMatt Macy error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 445eda14cbcSMatt Macy sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 446eda14cbcSMatt Macy &ddt->ddt_histogram[type][class]); 447eda14cbcSMatt Macy if (error != 0) 448eda14cbcSMatt Macy return (error); 449eda14cbcSMatt Macy 450eda14cbcSMatt Macy /* 451eda14cbcSMatt Macy * Seed the cached statistics. 
452eda14cbcSMatt Macy */ 453eda14cbcSMatt Macy error = ddt_object_info(ddt, type, class, &doi); 454eda14cbcSMatt Macy if (error) 455eda14cbcSMatt Macy return (error); 456eda14cbcSMatt Macy 457eda14cbcSMatt Macy error = ddt_object_count(ddt, type, class, &count); 458eda14cbcSMatt Macy if (error) 459eda14cbcSMatt Macy return (error); 460eda14cbcSMatt Macy 461eda14cbcSMatt Macy ddo->ddo_count = count; 462eda14cbcSMatt Macy ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 463eda14cbcSMatt Macy ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 464eda14cbcSMatt Macy 465eda14cbcSMatt Macy return (0); 466eda14cbcSMatt Macy } 467eda14cbcSMatt Macy 468eda14cbcSMatt Macy static void 4694fefe1b7SMartin Matuska ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 470eda14cbcSMatt Macy dmu_tx_t *tx) 471eda14cbcSMatt Macy { 472eda14cbcSMatt Macy ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 473eda14cbcSMatt Macy dmu_object_info_t doi; 474eda14cbcSMatt Macy uint64_t count; 475eda14cbcSMatt Macy char name[DDT_NAMELEN]; 476eda14cbcSMatt Macy 477eda14cbcSMatt Macy ddt_object_name(ddt, type, class, name); 478eda14cbcSMatt Macy 4794fefe1b7SMartin Matuska VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 480eda14cbcSMatt Macy sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 4814fefe1b7SMartin Matuska &ddt->ddt_histogram[type][class], tx)); 482eda14cbcSMatt Macy 483eda14cbcSMatt Macy /* 484eda14cbcSMatt Macy * Cache DDT statistics; this is the only time they'll change. 485eda14cbcSMatt Macy */ 4864fefe1b7SMartin Matuska VERIFY0(ddt_object_info(ddt, type, class, &doi)); 4874fefe1b7SMartin Matuska VERIFY0(ddt_object_count(ddt, type, class, &count)); 488eda14cbcSMatt Macy 489eda14cbcSMatt Macy ddo->ddo_count = count; 490eda14cbcSMatt Macy ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 491eda14cbcSMatt Macy ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 492eda14cbcSMatt Macy } 493eda14cbcSMatt Macy 4944fefe1b7SMartin Matuska static boolean_t 4954fefe1b7SMartin Matuska ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 4964fefe1b7SMartin Matuska { 4974fefe1b7SMartin Matuska return (!!ddt->ddt_object[type][class]); 4984fefe1b7SMartin Matuska } 4994fefe1b7SMartin Matuska 500eda14cbcSMatt Macy static int 5014fefe1b7SMartin Matuska ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 502eda14cbcSMatt Macy ddt_entry_t *dde) 503eda14cbcSMatt Macy { 504eda14cbcSMatt Macy if (!ddt_object_exists(ddt, type, class)) 505eda14cbcSMatt Macy return (SET_ERROR(ENOENT)); 506eda14cbcSMatt Macy 507eda14cbcSMatt Macy return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 5084fefe1b7SMartin Matuska ddt->ddt_object[type][class], &dde->dde_key, 509*e2df9bb4SMartin Matuska dde->dde_phys, DDT_PHYS_SIZE(ddt))); 5104fefe1b7SMartin Matuska } 5114fefe1b7SMartin Matuska 5124fefe1b7SMartin Matuska static int 5134fefe1b7SMartin Matuska ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 5144fefe1b7SMartin Matuska const ddt_key_t *ddk) 5154fefe1b7SMartin Matuska { 5164fefe1b7SMartin Matuska if (!ddt_object_exists(ddt, type, class)) 5174fefe1b7SMartin Matuska return (SET_ERROR(ENOENT)); 5184fefe1b7SMartin Matuska 5194fefe1b7SMartin Matuska return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, 5204fefe1b7SMartin Matuska ddt->ddt_object[type][class], ddk)); 521eda14cbcSMatt Macy } 522eda14cbcSMatt Macy 523eda14cbcSMatt Macy static void 5244fefe1b7SMartin Matuska ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, 
ddt_class_t class, 5254fefe1b7SMartin Matuska const ddt_key_t *ddk) 526eda14cbcSMatt Macy { 527eda14cbcSMatt Macy if (!ddt_object_exists(ddt, type, class)) 528eda14cbcSMatt Macy return; 529eda14cbcSMatt Macy 530eda14cbcSMatt Macy ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, 5314fefe1b7SMartin Matuska ddt->ddt_object[type][class], ddk); 532eda14cbcSMatt Macy } 533eda14cbcSMatt Macy 534ce4dcb97SMartin Matuska static void 535ce4dcb97SMartin Matuska ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 536ce4dcb97SMartin Matuska { 537ce4dcb97SMartin Matuska if (!ddt_object_exists(ddt, type, class)) 538ce4dcb97SMartin Matuska return; 539ce4dcb97SMartin Matuska 540ce4dcb97SMartin Matuska ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os, 541ce4dcb97SMartin Matuska ddt->ddt_object[type][class]); 542ce4dcb97SMartin Matuska } 543ce4dcb97SMartin Matuska 5444fefe1b7SMartin Matuska static int 5454fefe1b7SMartin Matuska ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 546*e2df9bb4SMartin Matuska const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) 547eda14cbcSMatt Macy { 548eda14cbcSMatt Macy ASSERT(ddt_object_exists(ddt, type, class)); 549eda14cbcSMatt Macy 550eda14cbcSMatt Macy return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 551*e2df9bb4SMartin Matuska ddt->ddt_object[type][class], &ddlwe->ddlwe_key, 552*e2df9bb4SMartin Matuska &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); 553eda14cbcSMatt Macy } 554eda14cbcSMatt Macy 555eda14cbcSMatt Macy static int 5564fefe1b7SMartin Matuska ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 5574fefe1b7SMartin Matuska const ddt_key_t *ddk, dmu_tx_t *tx) 558eda14cbcSMatt Macy { 559eda14cbcSMatt Macy ASSERT(ddt_object_exists(ddt, type, class)); 560eda14cbcSMatt Macy 561eda14cbcSMatt Macy return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 5624fefe1b7SMartin Matuska ddt->ddt_object[type][class], ddk, tx)); 563eda14cbcSMatt Macy } 564eda14cbcSMatt Macy 565eda14cbcSMatt Macy int 5664fefe1b7SMartin Matuska ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 567*e2df9bb4SMartin Matuska uint64_t *walk, ddt_lightweight_entry_t *ddlwe) 568eda14cbcSMatt Macy { 569eda14cbcSMatt Macy ASSERT(ddt_object_exists(ddt, type, class)); 570eda14cbcSMatt Macy 571*e2df9bb4SMartin Matuska int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 572*e2df9bb4SMartin Matuska ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, 573*e2df9bb4SMartin Matuska &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); 574*e2df9bb4SMartin Matuska if (error == 0) { 575*e2df9bb4SMartin Matuska ddlwe->ddlwe_type = type; 576*e2df9bb4SMartin Matuska ddlwe->ddlwe_class = class; 577*e2df9bb4SMartin Matuska return (0); 578*e2df9bb4SMartin Matuska } 579*e2df9bb4SMartin Matuska return (error); 580eda14cbcSMatt Macy } 581eda14cbcSMatt Macy 582eda14cbcSMatt Macy int 5834fefe1b7SMartin Matuska ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 584eda14cbcSMatt Macy uint64_t *count) 585eda14cbcSMatt Macy { 586eda14cbcSMatt Macy ASSERT(ddt_object_exists(ddt, type, class)); 587eda14cbcSMatt Macy 588eda14cbcSMatt Macy return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 589eda14cbcSMatt Macy ddt->ddt_object[type][class], count)); 590eda14cbcSMatt Macy } 591eda14cbcSMatt Macy 592eda14cbcSMatt Macy int 5934fefe1b7SMartin Matuska ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 594eda14cbcSMatt Macy dmu_object_info_t *doi) 595eda14cbcSMatt Macy { 596eda14cbcSMatt Macy if (!ddt_object_exists(ddt, type, class)) 597eda14cbcSMatt Macy return (SET_ERROR(ENOENT)); 
598eda14cbcSMatt Macy 599eda14cbcSMatt Macy return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 600eda14cbcSMatt Macy doi)); 601eda14cbcSMatt Macy } 602eda14cbcSMatt Macy 603eda14cbcSMatt Macy void 6044fefe1b7SMartin Matuska ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 605eda14cbcSMatt Macy char *name) 606eda14cbcSMatt Macy { 607eda14cbcSMatt Macy (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, 608eda14cbcSMatt Macy zio_checksum_table[ddt->ddt_checksum].ci_name, 609eda14cbcSMatt Macy ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 610eda14cbcSMatt Macy } 611eda14cbcSMatt Macy 612eda14cbcSMatt Macy void 613*e2df9bb4SMartin Matuska ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 614*e2df9bb4SMartin Matuska blkptr_t *bp, uint64_t txg) 615eda14cbcSMatt Macy { 6164fefe1b7SMartin Matuska ASSERT3U(txg, !=, 0); 617*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 618*e2df9bb4SMartin Matuska uint64_t phys_birth; 619*e2df9bb4SMartin Matuska const dva_t *dvap; 620*e2df9bb4SMartin Matuska 621*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) { 622*e2df9bb4SMartin Matuska phys_birth = ddp->ddp_flat.ddp_phys_birth; 623*e2df9bb4SMartin Matuska dvap = ddp->ddp_flat.ddp_dva; 624*e2df9bb4SMartin Matuska } else { 625*e2df9bb4SMartin Matuska phys_birth = ddp->ddp_trad[v].ddp_phys_birth; 626*e2df9bb4SMartin Matuska dvap = ddp->ddp_trad[v].ddp_dva; 627*e2df9bb4SMartin Matuska } 628eda14cbcSMatt Macy 629eda14cbcSMatt Macy for (int d = 0; d < SPA_DVAS_PER_BP; d++) 630*e2df9bb4SMartin Matuska bp->blk_dva[d] = dvap[d]; 631*e2df9bb4SMartin Matuska BP_SET_BIRTH(bp, txg, phys_birth); 632eda14cbcSMatt Macy } 633eda14cbcSMatt Macy 634eda14cbcSMatt Macy /* 635eda14cbcSMatt Macy * The bp created via this function may be used for repairs and scrub, but it 636eda14cbcSMatt Macy * will be missing the salt / IV required to do a full decrypting read. 
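 * For example, ddt_phys_free() below builds such a BP from a stored phys via
 * ddt_bp_create(), clears the dedup bit, and passes it to zio_free() to
 * release the on-disk data.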
637eda14cbcSMatt Macy */ 638eda14cbcSMatt Macy void 639*e2df9bb4SMartin Matuska ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, 640*e2df9bb4SMartin Matuska const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) 641eda14cbcSMatt Macy { 642eda14cbcSMatt Macy BP_ZERO(bp); 643eda14cbcSMatt Macy 644eda14cbcSMatt Macy if (ddp != NULL) 645*e2df9bb4SMartin Matuska ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); 646eda14cbcSMatt Macy 647eda14cbcSMatt Macy bp->blk_cksum = ddk->ddk_cksum; 648eda14cbcSMatt Macy 649eda14cbcSMatt Macy BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 650eda14cbcSMatt Macy BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 651eda14cbcSMatt Macy BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 652eda14cbcSMatt Macy BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); 653eda14cbcSMatt Macy BP_SET_FILL(bp, 1); 654eda14cbcSMatt Macy BP_SET_CHECKSUM(bp, checksum); 655eda14cbcSMatt Macy BP_SET_TYPE(bp, DMU_OT_DEDUP); 656eda14cbcSMatt Macy BP_SET_LEVEL(bp, 0); 657eda14cbcSMatt Macy BP_SET_DEDUP(bp, 1); 658eda14cbcSMatt Macy BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 659eda14cbcSMatt Macy } 660eda14cbcSMatt Macy 661eda14cbcSMatt Macy void 662eda14cbcSMatt Macy ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 663eda14cbcSMatt Macy { 664eda14cbcSMatt Macy ddk->ddk_cksum = bp->blk_cksum; 665eda14cbcSMatt Macy ddk->ddk_prop = 0; 666eda14cbcSMatt Macy 667eda14cbcSMatt Macy ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); 668eda14cbcSMatt Macy 669eda14cbcSMatt Macy DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 670eda14cbcSMatt Macy DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 671eda14cbcSMatt Macy DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 672eda14cbcSMatt Macy DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); 673eda14cbcSMatt Macy } 674eda14cbcSMatt Macy 675eda14cbcSMatt Macy void 676*e2df9bb4SMartin Matuska ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) 677eda14cbcSMatt Macy { 678*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 679*e2df9bb4SMartin Matuska int bp_ndvas = BP_GET_NDVAS(bp); 680*e2df9bb4SMartin Matuska int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? 681*e2df9bb4SMartin Matuska SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; 682*e2df9bb4SMartin Matuska dva_t *dvas = (v == DDT_PHYS_FLAT) ? 683*e2df9bb4SMartin Matuska ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; 684eda14cbcSMatt Macy 685*e2df9bb4SMartin Matuska int s = 0, d = 0; 686*e2df9bb4SMartin Matuska while (s < bp_ndvas && d < ddp_max_dvas) { 687*e2df9bb4SMartin Matuska if (DVA_IS_VALID(&dvas[d])) { 688*e2df9bb4SMartin Matuska d++; 689*e2df9bb4SMartin Matuska continue; 690*e2df9bb4SMartin Matuska } 691*e2df9bb4SMartin Matuska dvas[d] = bp->blk_dva[s]; 692*e2df9bb4SMartin Matuska s++; d++; 693*e2df9bb4SMartin Matuska } 694*e2df9bb4SMartin Matuska 695*e2df9bb4SMartin Matuska /* 696*e2df9bb4SMartin Matuska * If the caller offered us more DVAs than we can fit, something has 697*e2df9bb4SMartin Matuska * gone wrong in their accounting. zio_ddt_write() should never ask for 698*e2df9bb4SMartin Matuska * more than we need. 
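 * For example, if the entry already holds a single valid DVA in slot 0 and
 * the BP carries one newly allocated DVA, the loop above skips slot 0 and
 * copies the new DVA into slot 1, leaving s == bp_ndvas on exit.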
699*e2df9bb4SMartin Matuska */ 700*e2df9bb4SMartin Matuska ASSERT3U(s, ==, bp_ndvas); 701*e2df9bb4SMartin Matuska 702*e2df9bb4SMartin Matuska if (BP_IS_ENCRYPTED(bp)) 703*e2df9bb4SMartin Matuska dvas[2] = bp->blk_dva[2]; 704*e2df9bb4SMartin Matuska 705*e2df9bb4SMartin Matuska if (ddt_phys_birth(ddp, v) == 0) { 706*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 707*e2df9bb4SMartin Matuska ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); 708*e2df9bb4SMartin Matuska else 709*e2df9bb4SMartin Matuska ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); 710*e2df9bb4SMartin Matuska } 711eda14cbcSMatt Macy } 712eda14cbcSMatt Macy 713eda14cbcSMatt Macy void 714*e2df9bb4SMartin Matuska ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, 715*e2df9bb4SMartin Matuska ddt_phys_variant_t v) 716eda14cbcSMatt Macy { 717*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 718*e2df9bb4SMartin Matuska 719*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 720*e2df9bb4SMartin Matuska dst->ddp_flat = src->ddp_flat; 721*e2df9bb4SMartin Matuska else 722*e2df9bb4SMartin Matuska dst->ddp_trad[v] = src->ddp_trad[v]; 723eda14cbcSMatt Macy } 724eda14cbcSMatt Macy 725eda14cbcSMatt Macy void 726*e2df9bb4SMartin Matuska ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) 727eda14cbcSMatt Macy { 728*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 729*e2df9bb4SMartin Matuska 730*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 731*e2df9bb4SMartin Matuska memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); 732*e2df9bb4SMartin Matuska else 733*e2df9bb4SMartin Matuska memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); 734*e2df9bb4SMartin Matuska } 735*e2df9bb4SMartin Matuska 736*e2df9bb4SMartin Matuska static uint64_t 737*e2df9bb4SMartin Matuska ddt_class_start(void) 738*e2df9bb4SMartin Matuska { 739*e2df9bb4SMartin Matuska uint64_t start = gethrestime_sec(); 740*e2df9bb4SMartin Matuska 741*e2df9bb4SMartin Matuska if (ddt_prune_artificial_age) { 742*e2df9bb4SMartin Matuska /* 743*e2df9bb4SMartin Matuska * debug aide -- simulate a wider distribution 744*e2df9bb4SMartin Matuska * so we don't have to wait for an aged DDT 745*e2df9bb4SMartin Matuska * to test prune. 
746*e2df9bb4SMartin Matuska */ 747*e2df9bb4SMartin Matuska int range = 1 << 21; 748*e2df9bb4SMartin Matuska int percent = random_in_range(100); 749*e2df9bb4SMartin Matuska if (percent < 50) { 750*e2df9bb4SMartin Matuska range = range >> 4; 751*e2df9bb4SMartin Matuska } else if (percent > 75) { 752*e2df9bb4SMartin Matuska range /= 2; 753*e2df9bb4SMartin Matuska } 754*e2df9bb4SMartin Matuska start -= random_in_range(range); 755*e2df9bb4SMartin Matuska } 756*e2df9bb4SMartin Matuska 757*e2df9bb4SMartin Matuska return (start); 758eda14cbcSMatt Macy } 759eda14cbcSMatt Macy 760eda14cbcSMatt Macy void 761*e2df9bb4SMartin Matuska ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) 762eda14cbcSMatt Macy { 763*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 764*e2df9bb4SMartin Matuska 765*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 766*e2df9bb4SMartin Matuska ddp->ddp_flat.ddp_refcnt++; 767*e2df9bb4SMartin Matuska else 768*e2df9bb4SMartin Matuska ddp->ddp_trad[v].ddp_refcnt++; 769eda14cbcSMatt Macy } 770*e2df9bb4SMartin Matuska 771*e2df9bb4SMartin Matuska uint64_t 772*e2df9bb4SMartin Matuska ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) 773*e2df9bb4SMartin Matuska { 774*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 775*e2df9bb4SMartin Matuska 776*e2df9bb4SMartin Matuska uint64_t *refcntp; 777*e2df9bb4SMartin Matuska 778*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 779*e2df9bb4SMartin Matuska refcntp = &ddp->ddp_flat.ddp_refcnt; 780*e2df9bb4SMartin Matuska else 781*e2df9bb4SMartin Matuska refcntp = &ddp->ddp_trad[v].ddp_refcnt; 782*e2df9bb4SMartin Matuska 783*e2df9bb4SMartin Matuska ASSERT3U(*refcntp, >, 0); 784*e2df9bb4SMartin Matuska (*refcntp)--; 785*e2df9bb4SMartin Matuska return (*refcntp); 786eda14cbcSMatt Macy } 787eda14cbcSMatt Macy 7884fefe1b7SMartin Matuska static void 789*e2df9bb4SMartin Matuska ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, 790*e2df9bb4SMartin Matuska ddt_phys_variant_t v, uint64_t txg) 791eda14cbcSMatt Macy { 792eda14cbcSMatt Macy blkptr_t blk; 793eda14cbcSMatt Macy 794*e2df9bb4SMartin Matuska ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); 795eda14cbcSMatt Macy 796eda14cbcSMatt Macy /* 797eda14cbcSMatt Macy * We clear the dedup bit so that zio_free() will actually free the 798eda14cbcSMatt Macy * space, rather than just decrementing the refcount in the DDT. 799eda14cbcSMatt Macy */ 800eda14cbcSMatt Macy BP_SET_DEDUP(&blk, 0); 801eda14cbcSMatt Macy 802*e2df9bb4SMartin Matuska ddt_phys_clear(ddp, v); 803eda14cbcSMatt Macy zio_free(ddt->ddt_spa, txg, &blk); 804eda14cbcSMatt Macy } 805eda14cbcSMatt Macy 806*e2df9bb4SMartin Matuska uint64_t 807*e2df9bb4SMartin Matuska ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) 808eda14cbcSMatt Macy { 809*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 810eda14cbcSMatt Macy 811*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 812*e2df9bb4SMartin Matuska return (ddp->ddp_flat.ddp_phys_birth); 813*e2df9bb4SMartin Matuska else 814*e2df9bb4SMartin Matuska return (ddp->ddp_trad[v].ddp_phys_birth); 815eda14cbcSMatt Macy } 816*e2df9bb4SMartin Matuska 817*e2df9bb4SMartin Matuska int 818*e2df9bb4SMartin Matuska ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 819*e2df9bb4SMartin Matuska boolean_t encrypted) 820*e2df9bb4SMartin Matuska { 821*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 822*e2df9bb4SMartin Matuska 823*e2df9bb4SMartin Matuska const dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
824*e2df9bb4SMartin Matuska ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; 825*e2df9bb4SMartin Matuska 826*e2df9bb4SMartin Matuska return (DVA_IS_VALID(&dvas[0]) + 827*e2df9bb4SMartin Matuska DVA_IS_VALID(&dvas[1]) + 828*e2df9bb4SMartin Matuska DVA_IS_VALID(&dvas[2]) * !encrypted); 829*e2df9bb4SMartin Matuska } 830*e2df9bb4SMartin Matuska 831*e2df9bb4SMartin Matuska ddt_phys_variant_t 832*e2df9bb4SMartin Matuska ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) 833*e2df9bb4SMartin Matuska { 834*e2df9bb4SMartin Matuska if (dde == NULL) 835*e2df9bb4SMartin Matuska return (DDT_PHYS_NONE); 836*e2df9bb4SMartin Matuska 837*e2df9bb4SMartin Matuska const ddt_univ_phys_t *ddp = dde->dde_phys; 838*e2df9bb4SMartin Matuska 839*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_FLAT) { 840*e2df9bb4SMartin Matuska if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && 841*e2df9bb4SMartin Matuska BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { 842*e2df9bb4SMartin Matuska return (DDT_PHYS_FLAT); 843*e2df9bb4SMartin Matuska } 844*e2df9bb4SMartin Matuska } else /* traditional phys */ { 845*e2df9bb4SMartin Matuska for (int p = 0; p < DDT_PHYS_MAX; p++) { 846*e2df9bb4SMartin Matuska if (DVA_EQUAL(BP_IDENTITY(bp), 847*e2df9bb4SMartin Matuska &ddp->ddp_trad[p].ddp_dva[0]) && 848*e2df9bb4SMartin Matuska BP_GET_BIRTH(bp) == 849*e2df9bb4SMartin Matuska ddp->ddp_trad[p].ddp_phys_birth) { 850*e2df9bb4SMartin Matuska return (p); 851*e2df9bb4SMartin Matuska } 852*e2df9bb4SMartin Matuska } 853*e2df9bb4SMartin Matuska } 854*e2df9bb4SMartin Matuska return (DDT_PHYS_NONE); 855eda14cbcSMatt Macy } 856eda14cbcSMatt Macy 857eda14cbcSMatt Macy uint64_t 858*e2df9bb4SMartin Matuska ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) 859*e2df9bb4SMartin Matuska { 860*e2df9bb4SMartin Matuska ASSERT3U(v, <, DDT_PHYS_NONE); 861*e2df9bb4SMartin Matuska 862*e2df9bb4SMartin Matuska if (v == DDT_PHYS_FLAT) 863*e2df9bb4SMartin Matuska return (ddp->ddp_flat.ddp_refcnt); 864*e2df9bb4SMartin Matuska else 865*e2df9bb4SMartin Matuska return (ddp->ddp_trad[v].ddp_refcnt); 866*e2df9bb4SMartin Matuska } 867*e2df9bb4SMartin Matuska 868*e2df9bb4SMartin Matuska uint64_t 869*e2df9bb4SMartin Matuska ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) 870eda14cbcSMatt Macy { 871eda14cbcSMatt Macy uint64_t refcnt = 0; 872eda14cbcSMatt Macy 873*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_FLAT) 874*e2df9bb4SMartin Matuska refcnt = ddp->ddp_flat.ddp_refcnt; 875*e2df9bb4SMartin Matuska else 876*e2df9bb4SMartin Matuska for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) 877*e2df9bb4SMartin Matuska refcnt += ddp->ddp_trad[v].ddp_refcnt; 878eda14cbcSMatt Macy 879eda14cbcSMatt Macy return (refcnt); 880eda14cbcSMatt Macy } 881eda14cbcSMatt Macy 882eda14cbcSMatt Macy ddt_t * 883eda14cbcSMatt Macy ddt_select(spa_t *spa, const blkptr_t *bp) 884eda14cbcSMatt Macy { 8854fefe1b7SMartin Matuska ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); 886eda14cbcSMatt Macy return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 887eda14cbcSMatt Macy } 888eda14cbcSMatt Macy 889eda14cbcSMatt Macy void 890eda14cbcSMatt Macy ddt_enter(ddt_t *ddt) 891eda14cbcSMatt Macy { 892eda14cbcSMatt Macy mutex_enter(&ddt->ddt_lock); 893eda14cbcSMatt Macy } 894eda14cbcSMatt Macy 895eda14cbcSMatt Macy void 896eda14cbcSMatt Macy ddt_exit(ddt_t *ddt) 897eda14cbcSMatt Macy { 898eda14cbcSMatt Macy mutex_exit(&ddt->ddt_lock); 899eda14cbcSMatt Macy } 900eda14cbcSMatt Macy 901eda14cbcSMatt Macy void 902eda14cbcSMatt Macy 
ddt_init(void) 903eda14cbcSMatt Macy { 904eda14cbcSMatt Macy ddt_cache = kmem_cache_create("ddt_cache", 905eda14cbcSMatt Macy sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 906*e2df9bb4SMartin Matuska ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", 907*e2df9bb4SMartin Matuska DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); 908*e2df9bb4SMartin Matuska ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", 909*e2df9bb4SMartin Matuska DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); 910*e2df9bb4SMartin Matuska 911*e2df9bb4SMartin Matuska ddt_log_init(); 912eda14cbcSMatt Macy } 913eda14cbcSMatt Macy 914eda14cbcSMatt Macy void 915eda14cbcSMatt Macy ddt_fini(void) 916eda14cbcSMatt Macy { 917*e2df9bb4SMartin Matuska ddt_log_fini(); 918*e2df9bb4SMartin Matuska 919*e2df9bb4SMartin Matuska kmem_cache_destroy(ddt_entry_trad_cache); 920*e2df9bb4SMartin Matuska kmem_cache_destroy(ddt_entry_flat_cache); 921eda14cbcSMatt Macy kmem_cache_destroy(ddt_cache); 922eda14cbcSMatt Macy } 923eda14cbcSMatt Macy 924eda14cbcSMatt Macy static ddt_entry_t * 925*e2df9bb4SMartin Matuska ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) 926eda14cbcSMatt Macy { 927eda14cbcSMatt Macy ddt_entry_t *dde; 928eda14cbcSMatt Macy 929*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_FLAT) { 930*e2df9bb4SMartin Matuska dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); 931*e2df9bb4SMartin Matuska memset(dde, 0, DDT_ENTRY_FLAT_SIZE); 932*e2df9bb4SMartin Matuska } else { 933*e2df9bb4SMartin Matuska dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); 934*e2df9bb4SMartin Matuska memset(dde, 0, DDT_ENTRY_TRAD_SIZE); 935*e2df9bb4SMartin Matuska } 936*e2df9bb4SMartin Matuska 937eda14cbcSMatt Macy cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 938eda14cbcSMatt Macy 939eda14cbcSMatt Macy dde->dde_key = *ddk; 940eda14cbcSMatt Macy 941eda14cbcSMatt Macy return (dde); 942eda14cbcSMatt Macy } 943eda14cbcSMatt Macy 944*e2df9bb4SMartin Matuska void 945*e2df9bb4SMartin Matuska ddt_alloc_entry_io(ddt_entry_t *dde) 946eda14cbcSMatt Macy { 947*e2df9bb4SMartin Matuska if (dde->dde_io != NULL) 948*e2df9bb4SMartin Matuska return; 949eda14cbcSMatt Macy 950*e2df9bb4SMartin Matuska dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); 951*e2df9bb4SMartin Matuska } 952*e2df9bb4SMartin Matuska 953*e2df9bb4SMartin Matuska static void 954*e2df9bb4SMartin Matuska ddt_free(const ddt_t *ddt, ddt_entry_t *dde) 955*e2df9bb4SMartin Matuska { 956*e2df9bb4SMartin Matuska if (dde->dde_io != NULL) { 957*e2df9bb4SMartin Matuska for (int p = 0; p < DDT_NPHYS(ddt); p++) 958*e2df9bb4SMartin Matuska ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); 959*e2df9bb4SMartin Matuska 960*e2df9bb4SMartin Matuska if (dde->dde_io->dde_repair_abd != NULL) 961*e2df9bb4SMartin Matuska abd_free(dde->dde_io->dde_repair_abd); 962*e2df9bb4SMartin Matuska 963*e2df9bb4SMartin Matuska kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); 964*e2df9bb4SMartin Matuska } 965eda14cbcSMatt Macy 966eda14cbcSMatt Macy cv_destroy(&dde->dde_cv); 967*e2df9bb4SMartin Matuska kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
968*e2df9bb4SMartin Matuska ddt_entry_flat_cache : ddt_entry_trad_cache, dde); 969eda14cbcSMatt Macy } 970eda14cbcSMatt Macy 971eda14cbcSMatt Macy void 972eda14cbcSMatt Macy ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 973eda14cbcSMatt Macy { 974eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 975eda14cbcSMatt Macy 976*e2df9bb4SMartin Matuska /* Entry is still in the log, so charge the entry back to it */ 977*e2df9bb4SMartin Matuska if (dde->dde_flags & DDE_FLAG_LOGGED) { 978*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 979*e2df9bb4SMartin Matuska DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); 980*e2df9bb4SMartin Matuska ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); 981*e2df9bb4SMartin Matuska } 982*e2df9bb4SMartin Matuska 983eda14cbcSMatt Macy avl_remove(&ddt->ddt_tree, dde); 984*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 985eda14cbcSMatt Macy } 986eda14cbcSMatt Macy 987ce4dcb97SMartin Matuska static boolean_t 988ce4dcb97SMartin Matuska ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc) 989ce4dcb97SMartin Matuska { 990ce4dcb97SMartin Matuska if (mc != NULL && metaslab_class_get_space(mc) > 0) { 991ce4dcb97SMartin Matuska /* Over quota if allocating outside of this special class */ 992ce4dcb97SMartin Matuska if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg + 993ce4dcb97SMartin Matuska dedup_class_wait_txgs) { 994ce4dcb97SMartin Matuska /* Waiting for some deferred frees to be processed */ 995ce4dcb97SMartin Matuska return (B_TRUE); 996ce4dcb97SMartin Matuska } 997ce4dcb97SMartin Matuska 998ce4dcb97SMartin Matuska /* 999ce4dcb97SMartin Matuska * We're considered over quota when we hit 85% full, or for 1000ce4dcb97SMartin Matuska * larger drives, when there is less than 8GB free. 1001ce4dcb97SMartin Matuska */ 1002ce4dcb97SMartin Matuska uint64_t allocated = metaslab_class_get_alloc(mc); 1003ce4dcb97SMartin Matuska uint64_t capacity = metaslab_class_get_space(mc); 1004ce4dcb97SMartin Matuska uint64_t limit = MAX(capacity * 85 / 100, 1005ce4dcb97SMartin Matuska (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0); 1006ce4dcb97SMartin Matuska 1007ce4dcb97SMartin Matuska return (allocated >= limit); 1008ce4dcb97SMartin Matuska } 1009ce4dcb97SMartin Matuska return (B_FALSE); 1010ce4dcb97SMartin Matuska } 1011ce4dcb97SMartin Matuska 1012ce4dcb97SMartin Matuska /* 1013ce4dcb97SMartin Matuska * Check if the DDT is over its quota. This can be due to a few conditions: 1014ce4dcb97SMartin Matuska * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize 1015ce4dcb97SMartin Matuska * exceeds this limit 1016ce4dcb97SMartin Matuska * 1017ce4dcb97SMartin Matuska * 2. 'dedup_table_quota' property is set to automatic and 1018ce4dcb97SMartin Matuska * a. the dedup or special allocation class could not satisfy a DDT 1019ce4dcb97SMartin Matuska * allocation in a recent transaction 1020ce4dcb97SMartin Matuska * b. 
the dedup or special allocation class has exceeded its 85% limit
1021ce4dcb97SMartin Matuska */
1022ce4dcb97SMartin Matuska static boolean_t
1023ce4dcb97SMartin Matuska ddt_over_quota(spa_t *spa)
1024ce4dcb97SMartin Matuska {
1025ce4dcb97SMartin Matuska if (spa->spa_dedup_table_quota == 0)
1026ce4dcb97SMartin Matuska return (B_FALSE);
1027ce4dcb97SMartin Matuska
1028ce4dcb97SMartin Matuska if (spa->spa_dedup_table_quota != UINT64_MAX)
1029ce4dcb97SMartin Matuska return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1030ce4dcb97SMartin Matuska
1031ce4dcb97SMartin Matuska /*
1032ce4dcb97SMartin Matuska * For automatic quota, table size is limited by dedup or special class
1033ce4dcb97SMartin Matuska */
1034ce4dcb97SMartin Matuska if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
1035ce4dcb97SMartin Matuska return (B_TRUE);
1036ce4dcb97SMartin Matuska else if (spa_special_has_ddt(spa) &&
1037ce4dcb97SMartin Matuska ddt_special_over_quota(spa, spa_special_class(spa)))
1038ce4dcb97SMartin Matuska return (B_TRUE);
1039ce4dcb97SMartin Matuska
1040ce4dcb97SMartin Matuska return (B_FALSE);
1041ce4dcb97SMartin Matuska }
1042ce4dcb97SMartin Matuska
1043ce4dcb97SMartin Matuska void
1044ce4dcb97SMartin Matuska ddt_prefetch_all(spa_t *spa)
1045ce4dcb97SMartin Matuska {
1046ce4dcb97SMartin Matuska /*
1047ce4dcb97SMartin Matuska * Load all DDT entries for each type/class combination. This is
1048ce4dcb97SMartin Matuska * intended to perform a prefetch on all such blocks. For the same
1049ce4dcb97SMartin Matuska * reason that ddt_prefetch isn't locked, this is also not locked.
1050ce4dcb97SMartin Matuska */
1051ce4dcb97SMartin Matuska for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1052ce4dcb97SMartin Matuska ddt_t *ddt = spa->spa_ddt[c];
1053ce4dcb97SMartin Matuska if (!ddt)
1054ce4dcb97SMartin Matuska continue;
1055ce4dcb97SMartin Matuska
1056ce4dcb97SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1057ce4dcb97SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES;
1058ce4dcb97SMartin Matuska class++) {
1059ce4dcb97SMartin Matuska ddt_object_prefetch_all(ddt, type, class);
1060ce4dcb97SMartin Matuska }
1061ce4dcb97SMartin Matuska }
1062ce4dcb97SMartin Matuska }
1063ce4dcb97SMartin Matuska }
1064ce4dcb97SMartin Matuska
1065*e2df9bb4SMartin Matuska static int ddt_configure(ddt_t *ddt, boolean_t new);
1066*e2df9bb4SMartin Matuska
1067*e2df9bb4SMartin Matuska /*
1068*e2df9bb4SMartin Matuska * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1069*e2df9bb4SMartin Matuska * to the ones in the entry. If they're different, then the passed-in BP is
1070*e2df9bb4SMartin Matuska * from a previous generation of this entry (i.e. was previously pruned) and we
1071*e2df9bb4SMartin Matuska * have to act like the entry doesn't exist at all.
1072*e2df9bb4SMartin Matuska *
1073*e2df9bb4SMartin Matuska * This should only happen during a lookup to free the block (zio_ddt_free()).
1074*e2df9bb4SMartin Matuska * 1075*e2df9bb4SMartin Matuska * XXX this is similar in spirit to ddt_phys_select(), maybe can combine 1076*e2df9bb4SMartin Matuska * -- robn, 2024-02-09 1077*e2df9bb4SMartin Matuska */ 1078*e2df9bb4SMartin Matuska static boolean_t 1079*e2df9bb4SMartin Matuska ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) 1080*e2df9bb4SMartin Matuska { 1081*e2df9bb4SMartin Matuska /* If the BP has no DVAs, then this entry is good */ 1082*e2df9bb4SMartin Matuska uint_t ndvas = BP_GET_NDVAS(bp); 1083*e2df9bb4SMartin Matuska if (ndvas == 0) 1084*e2df9bb4SMartin Matuska return (B_TRUE); 1085*e2df9bb4SMartin Matuska 1086*e2df9bb4SMartin Matuska /* 1087*e2df9bb4SMartin Matuska * Only checking the phys for the copies. For flat, there's only one; 1088*e2df9bb4SMartin Matuska * for trad it'll be the one that has the matching set of DVAs. 1089*e2df9bb4SMartin Matuska */ 1090*e2df9bb4SMartin Matuska const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 1091*e2df9bb4SMartin Matuska dde->dde_phys->ddp_flat.ddp_dva : 1092*e2df9bb4SMartin Matuska dde->dde_phys->ddp_trad[ndvas].ddp_dva; 1093*e2df9bb4SMartin Matuska 1094*e2df9bb4SMartin Matuska /* 1095*e2df9bb4SMartin Matuska * Compare entry DVAs with the BP. They should all be there, but 1096*e2df9bb4SMartin Matuska * there's not really anything we can do if its only partial anyway, 1097*e2df9bb4SMartin Matuska * that's an error somewhere else, maybe long ago. 1098*e2df9bb4SMartin Matuska */ 1099*e2df9bb4SMartin Matuska uint_t d; 1100*e2df9bb4SMartin Matuska for (d = 0; d < ndvas; d++) 1101*e2df9bb4SMartin Matuska if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) 1102*e2df9bb4SMartin Matuska return (B_FALSE); 1103*e2df9bb4SMartin Matuska ASSERT3U(d, ==, ndvas); 1104*e2df9bb4SMartin Matuska 1105*e2df9bb4SMartin Matuska return (B_TRUE); 1106*e2df9bb4SMartin Matuska } 1107*e2df9bb4SMartin Matuska 1108eda14cbcSMatt Macy ddt_entry_t * 1109*e2df9bb4SMartin Matuska ddt_lookup(ddt_t *ddt, const blkptr_t *bp) 1110eda14cbcSMatt Macy { 1111ce4dcb97SMartin Matuska spa_t *spa = ddt->ddt_spa; 11124fefe1b7SMartin Matuska ddt_key_t search; 11134fefe1b7SMartin Matuska ddt_entry_t *dde; 11144fefe1b7SMartin Matuska ddt_type_t type; 11154fefe1b7SMartin Matuska ddt_class_t class; 1116eda14cbcSMatt Macy avl_index_t where; 1117eda14cbcSMatt Macy int error; 1118eda14cbcSMatt Macy 1119eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 1120eda14cbcSMatt Macy 1121*e2df9bb4SMartin Matuska if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { 1122*e2df9bb4SMartin Matuska /* 1123*e2df9bb4SMartin Matuska * This is the first use of this DDT since the pool was 1124*e2df9bb4SMartin Matuska * created; finish getting it ready for use. 
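 *
 * (ddt_configure() picks the new FDT layout when the fast_dedup
 * feature is enabled on the pool, and falls back to the legacy
 * layout otherwise; see the not_found path in that function below.)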
1125*e2df9bb4SMartin Matuska */ 1126*e2df9bb4SMartin Matuska VERIFY0(ddt_configure(ddt, B_TRUE)); 1127*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); 1128*e2df9bb4SMartin Matuska } 1129*e2df9bb4SMartin Matuska 1130*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup); 1131*e2df9bb4SMartin Matuska 11324fefe1b7SMartin Matuska ddt_key_fill(&search, bp); 1133eda14cbcSMatt Macy 11344fefe1b7SMartin Matuska /* Find an existing live entry */ 11354fefe1b7SMartin Matuska dde = avl_find(&ddt->ddt_tree, &search, &where); 11364fefe1b7SMartin Matuska if (dde != NULL) { 1137ce4dcb97SMartin Matuska /* If we went over quota, act like we didn't find it */ 1138ce4dcb97SMartin Matuska if (dde->dde_flags & DDE_FLAG_OVERQUOTA) 1139ce4dcb97SMartin Matuska return (NULL); 1140ce4dcb97SMartin Matuska 1141ce4dcb97SMartin Matuska /* If it's already loaded, we can just return it. */ 1142*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); 1143*e2df9bb4SMartin Matuska if (dde->dde_flags & DDE_FLAG_LOADED) { 1144*e2df9bb4SMartin Matuska if (ddt_entry_lookup_is_valid(ddt, bp, dde)) 1145eda14cbcSMatt Macy return (dde); 1146*e2df9bb4SMartin Matuska return (NULL); 1147*e2df9bb4SMartin Matuska } 1148eda14cbcSMatt Macy 11494fefe1b7SMartin Matuska /* Someone else is loading it, wait for it. */ 1150ce4dcb97SMartin Matuska dde->dde_waiters++; 1151*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait); 11524fefe1b7SMartin Matuska while (!(dde->dde_flags & DDE_FLAG_LOADED)) 11534fefe1b7SMartin Matuska cv_wait(&dde->dde_cv, &ddt->ddt_lock); 1154ce4dcb97SMartin Matuska dde->dde_waiters--; 1155ce4dcb97SMartin Matuska 1156ce4dcb97SMartin Matuska /* Loaded but over quota, forget we were ever here */ 1157ce4dcb97SMartin Matuska if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { 1158ce4dcb97SMartin Matuska if (dde->dde_waiters == 0) { 1159ce4dcb97SMartin Matuska avl_remove(&ddt->ddt_tree, dde); 1160*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 1161ce4dcb97SMartin Matuska } 1162ce4dcb97SMartin Matuska return (NULL); 1163ce4dcb97SMartin Matuska } 1164eda14cbcSMatt Macy 1165*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_existing); 1166*e2df9bb4SMartin Matuska 1167*e2df9bb4SMartin Matuska /* Make sure the loaded entry matches the BP */ 1168*e2df9bb4SMartin Matuska if (ddt_entry_lookup_is_valid(ddt, bp, dde)) 1169*e2df9bb4SMartin Matuska return (dde); 1170*e2df9bb4SMartin Matuska return (NULL); 1171*e2df9bb4SMartin Matuska } else 1172*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); 1173*e2df9bb4SMartin Matuska 1174*e2df9bb4SMartin Matuska /* Time to make a new entry. 
*/ 1175*e2df9bb4SMartin Matuska dde = ddt_alloc(ddt, &search); 1176*e2df9bb4SMartin Matuska 1177*e2df9bb4SMartin Matuska /* Record the time this class was created (used by ddt prune) */ 1178*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_FLAT) 1179*e2df9bb4SMartin Matuska dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); 1180*e2df9bb4SMartin Matuska 1181*e2df9bb4SMartin Matuska avl_insert(&ddt->ddt_tree, dde, where); 1182*e2df9bb4SMartin Matuska 1183*e2df9bb4SMartin Matuska /* If its in the log tree, we can "load" it from there */ 1184*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_LOG) { 1185*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 1186*e2df9bb4SMartin Matuska 1187*e2df9bb4SMartin Matuska if (ddt_log_find_key(ddt, &search, &ddlwe)) { 1188*e2df9bb4SMartin Matuska /* 1189*e2df9bb4SMartin Matuska * See if we have the key first, and if so, set up 1190*e2df9bb4SMartin Matuska * the entry. 1191*e2df9bb4SMartin Matuska */ 1192*e2df9bb4SMartin Matuska dde->dde_type = ddlwe.ddlwe_type; 1193*e2df9bb4SMartin Matuska dde->dde_class = ddlwe.ddlwe_class; 1194*e2df9bb4SMartin Matuska memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, 1195*e2df9bb4SMartin Matuska DDT_PHYS_SIZE(ddt)); 1196*e2df9bb4SMartin Matuska /* Whatever we found isn't valid for this BP, eject */ 1197*e2df9bb4SMartin Matuska if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { 1198*e2df9bb4SMartin Matuska avl_remove(&ddt->ddt_tree, dde); 1199*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 1200*e2df9bb4SMartin Matuska return (NULL); 1201*e2df9bb4SMartin Matuska } 1202*e2df9bb4SMartin Matuska 1203*e2df9bb4SMartin Matuska /* Remove it and count it */ 1204*e2df9bb4SMartin Matuska if (ddt_log_remove_key(ddt, 1205*e2df9bb4SMartin Matuska ddt->ddt_log_active, &search)) { 1206*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); 1207*e2df9bb4SMartin Matuska } else { 1208*e2df9bb4SMartin Matuska VERIFY(ddt_log_remove_key(ddt, 1209*e2df9bb4SMartin Matuska ddt->ddt_log_flushing, &search)); 1210*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, 1211*e2df9bb4SMartin Matuska dds_lookup_log_flushing_hit); 1212*e2df9bb4SMartin Matuska } 1213*e2df9bb4SMartin Matuska 1214*e2df9bb4SMartin Matuska dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; 1215*e2df9bb4SMartin Matuska 1216*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); 1217*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_existing); 1218*e2df9bb4SMartin Matuska 12194fefe1b7SMartin Matuska return (dde); 12204fefe1b7SMartin Matuska } 12214fefe1b7SMartin Matuska 1222*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); 1223*e2df9bb4SMartin Matuska } 12244fefe1b7SMartin Matuska 12254fefe1b7SMartin Matuska /* 12264fefe1b7SMartin Matuska * ddt_tree is now stable, so unlock and let everyone else keep moving. 12274fefe1b7SMartin Matuska * Anyone landing on this entry will find it without DDE_FLAG_LOADED, 12284fefe1b7SMartin Matuska * and go to sleep waiting for it above. 12294fefe1b7SMartin Matuska */ 1230eda14cbcSMatt Macy ddt_exit(ddt); 1231eda14cbcSMatt Macy 12324fefe1b7SMartin Matuska /* Search all store objects for the entry. 
*/ 1233eda14cbcSMatt Macy error = ENOENT; 1234eda14cbcSMatt Macy for (type = 0; type < DDT_TYPES; type++) { 1235eda14cbcSMatt Macy for (class = 0; class < DDT_CLASSES; class++) { 1236eda14cbcSMatt Macy error = ddt_object_lookup(ddt, type, class, dde); 1237eda14cbcSMatt Macy if (error != ENOENT) { 1238eda14cbcSMatt Macy ASSERT0(error); 1239eda14cbcSMatt Macy break; 1240eda14cbcSMatt Macy } 1241eda14cbcSMatt Macy } 1242eda14cbcSMatt Macy if (error != ENOENT) 1243eda14cbcSMatt Macy break; 1244eda14cbcSMatt Macy } 1245eda14cbcSMatt Macy 1246eda14cbcSMatt Macy ddt_enter(ddt); 1247eda14cbcSMatt Macy 12484fefe1b7SMartin Matuska ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); 1249eda14cbcSMatt Macy 1250eda14cbcSMatt Macy dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 1251eda14cbcSMatt Macy dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 1252eda14cbcSMatt Macy 1253*e2df9bb4SMartin Matuska boolean_t valid = B_TRUE; 1254*e2df9bb4SMartin Matuska 1255ce4dcb97SMartin Matuska if (dde->dde_type == DDT_TYPES && 1256ce4dcb97SMartin Matuska dde->dde_class == DDT_CLASSES && 1257ce4dcb97SMartin Matuska ddt_over_quota(spa)) { 1258ce4dcb97SMartin Matuska /* Over quota. If no one is waiting, clean up right now. */ 1259ce4dcb97SMartin Matuska if (dde->dde_waiters == 0) { 1260ce4dcb97SMartin Matuska avl_remove(&ddt->ddt_tree, dde); 1261*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 1262ce4dcb97SMartin Matuska return (NULL); 1263ce4dcb97SMartin Matuska } 1264ce4dcb97SMartin Matuska 1265ce4dcb97SMartin Matuska /* Flag cleanup required */ 1266ce4dcb97SMartin Matuska dde->dde_flags |= DDE_FLAG_OVERQUOTA; 1267ce4dcb97SMartin Matuska } else if (error == 0) { 1268*e2df9bb4SMartin Matuska /* 1269*e2df9bb4SMartin Matuska * If what we loaded is no good for this BP and there's no one 1270*e2df9bb4SMartin Matuska * waiting for it, we can just remove it and get out. If it's no 1271*e2df9bb4SMartin Matuska * good but there are waiters, we have to leave it, because we 1272*e2df9bb4SMartin Matuska * don't know what they want. If it's not needed, we'll end up 1273*e2df9bb4SMartin Matuska * taking an entry log/sync, but it can only happen if more 1274*e2df9bb4SMartin Matuska * than one previous version of this block is being deleted at 1275*e2df9bb4SMartin Matuska * the same time. This is extremely unlikely to happen and not 1276*e2df9bb4SMartin Matuska * worth the effort to deal with without taking an entry 1277*e2df9bb4SMartin Matuska * update. 1278*e2df9bb4SMartin Matuska */ 1279*e2df9bb4SMartin Matuska valid = ddt_entry_lookup_is_valid(ddt, bp, dde); 1280*e2df9bb4SMartin Matuska if (!valid && dde->dde_waiters == 0) { 1281*e2df9bb4SMartin Matuska avl_remove(&ddt->ddt_tree, dde); 1282*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 1283*e2df9bb4SMartin Matuska return (NULL); 1284*e2df9bb4SMartin Matuska } 1285*e2df9bb4SMartin Matuska 1286*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); 1287*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_existing); 1288*e2df9bb4SMartin Matuska 1289*e2df9bb4SMartin Matuska /* 1290*e2df9bb4SMartin Matuska * The histograms only track inactive (stored or logged) blocks. 1291*e2df9bb4SMartin Matuska * We've just put an entry onto the live list, so we need to 1292*e2df9bb4SMartin Matuska * remove its counts. When it's synced back, it'll be re-added 1293*e2df9bb4SMartin Matuska * to the right one. 1294*e2df9bb4SMartin Matuska * 1295*e2df9bb4SMartin Matuska * We only do this when we successfully found it in the store.
1296*e2df9bb4SMartin Matuska * error == ENOENT means this is a new entry, and so its already 1297*e2df9bb4SMartin Matuska * not counted. 1298*e2df9bb4SMartin Matuska */ 1299*e2df9bb4SMartin Matuska ddt_histogram_t *ddh = 1300*e2df9bb4SMartin Matuska &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 1301*e2df9bb4SMartin Matuska 1302*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 1303*e2df9bb4SMartin Matuska DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); 1304*e2df9bb4SMartin Matuska ddt_histogram_sub_entry(ddt, ddh, &ddlwe); 1305*e2df9bb4SMartin Matuska } else { 1306*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss); 1307*e2df9bb4SMartin Matuska DDT_KSTAT_BUMP(ddt, dds_lookup_new); 1308ce4dcb97SMartin Matuska } 1309eda14cbcSMatt Macy 13104fefe1b7SMartin Matuska /* Entry loaded, everyone can proceed now */ 13114fefe1b7SMartin Matuska dde->dde_flags |= DDE_FLAG_LOADED; 1312eda14cbcSMatt Macy cv_broadcast(&dde->dde_cv); 1313eda14cbcSMatt Macy 1314*e2df9bb4SMartin Matuska if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) 1315*e2df9bb4SMartin Matuska return (NULL); 1316*e2df9bb4SMartin Matuska 1317*e2df9bb4SMartin Matuska return (dde); 1318eda14cbcSMatt Macy } 1319eda14cbcSMatt Macy 1320eda14cbcSMatt Macy void 1321eda14cbcSMatt Macy ddt_prefetch(spa_t *spa, const blkptr_t *bp) 1322eda14cbcSMatt Macy { 1323eda14cbcSMatt Macy ddt_t *ddt; 13244fefe1b7SMartin Matuska ddt_key_t ddk; 1325eda14cbcSMatt Macy 1326eda14cbcSMatt Macy if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) 1327eda14cbcSMatt Macy return; 1328eda14cbcSMatt Macy 1329eda14cbcSMatt Macy /* 1330eda14cbcSMatt Macy * We only remove the DDT once all tables are empty and only 1331eda14cbcSMatt Macy * prefetch dedup blocks when there are entries in the DDT. 1332eda14cbcSMatt Macy * Thus no locking is required as the DDT can't disappear on us. 1333eda14cbcSMatt Macy */ 1334eda14cbcSMatt Macy ddt = ddt_select(spa, bp); 13354fefe1b7SMartin Matuska ddt_key_fill(&ddk, bp); 1336eda14cbcSMatt Macy 13374fefe1b7SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 13384fefe1b7SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 13394fefe1b7SMartin Matuska ddt_object_prefetch(ddt, type, class, &ddk); 1340eda14cbcSMatt Macy } 1341eda14cbcSMatt Macy } 1342eda14cbcSMatt Macy } 1343eda14cbcSMatt Macy 1344eda14cbcSMatt Macy /* 1345*e2df9bb4SMartin Matuska * ddt_key_t comparison. Any struct wanting to make use of this function must 1346*e2df9bb4SMartin Matuska * have the key as the first element. Casts it to N uint64_ts, and checks until 1347*e2df9bb4SMartin Matuska * we find there's a difference. This is intended to match how ddt_zap.c drives 1348*e2df9bb4SMartin Matuska * the ZAPs (first uint64_t as the key prehash), which will minimise the number 1349*e2df9bb4SMartin Matuska * of ZAP blocks touched when flushing logged entries from an AVL walk. This is 1350*e2df9bb4SMartin Matuska * not an invariant for this function though, should you wish to change it. 
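 *
 * (ddt_key_t is currently five 64-bit words, the four-word block
 * checksum followed by ddk_prop, so the loop below makes at most five
 * compares and will almost always return on the first word.)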
1351eda14cbcSMatt Macy */ 1352eda14cbcSMatt Macy int 13534fefe1b7SMartin Matuska ddt_key_compare(const void *x1, const void *x2) 1354eda14cbcSMatt Macy { 1355*e2df9bb4SMartin Matuska const uint64_t *k1 = (const uint64_t *)x1; 1356*e2df9bb4SMartin Matuska const uint64_t *k2 = (const uint64_t *)x2; 1357eda14cbcSMatt Macy 1358*e2df9bb4SMartin Matuska int cmp; 1359*e2df9bb4SMartin Matuska for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++) 1360*e2df9bb4SMartin Matuska if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0)) 1361*e2df9bb4SMartin Matuska return (cmp); 1362*e2df9bb4SMartin Matuska 1363*e2df9bb4SMartin Matuska return (0); 1364eda14cbcSMatt Macy } 1365eda14cbcSMatt Macy 1366*e2df9bb4SMartin Matuska /* Create the containing dir for this DDT and bump the feature count */ 1367*e2df9bb4SMartin Matuska static void 1368*e2df9bb4SMartin Matuska ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) 1369*e2df9bb4SMartin Matuska { 1370*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_dir_object, ==, 0); 1371*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); 1372*e2df9bb4SMartin Matuska 1373*e2df9bb4SMartin Matuska char name[DDT_NAMELEN]; 1374*e2df9bb4SMartin Matuska snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, 1375*e2df9bb4SMartin Matuska zio_checksum_table[ddt->ddt_checksum].ci_name); 1376*e2df9bb4SMartin Matuska 1377*e2df9bb4SMartin Matuska ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, 1378*e2df9bb4SMartin Matuska DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); 1379*e2df9bb4SMartin Matuska 1380*e2df9bb4SMartin Matuska VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, 1381*e2df9bb4SMartin Matuska sizeof (uint64_t), 1, &ddt->ddt_version, tx)); 1382*e2df9bb4SMartin Matuska VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, 1383*e2df9bb4SMartin Matuska sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); 1384*e2df9bb4SMartin Matuska 1385*e2df9bb4SMartin Matuska spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); 1386*e2df9bb4SMartin Matuska } 1387*e2df9bb4SMartin Matuska 1388*e2df9bb4SMartin Matuska /* Destroy the containing dir and deactivate the feature */ 1389*e2df9bb4SMartin Matuska static void 1390*e2df9bb4SMartin Matuska ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) 1391*e2df9bb4SMartin Matuska { 1392*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_dir_object, !=, 0); 1393*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); 1394*e2df9bb4SMartin Matuska ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); 1395*e2df9bb4SMartin Matuska 1396*e2df9bb4SMartin Matuska char name[DDT_NAMELEN]; 1397*e2df9bb4SMartin Matuska snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, 1398*e2df9bb4SMartin Matuska zio_checksum_table[ddt->ddt_checksum].ci_name); 1399*e2df9bb4SMartin Matuska 1400*e2df9bb4SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 1401*e2df9bb4SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1402*e2df9bb4SMartin Matuska ASSERT(!ddt_object_exists(ddt, type, class)); 1403*e2df9bb4SMartin Matuska } 1404*e2df9bb4SMartin Matuska } 1405*e2df9bb4SMartin Matuska 1406*e2df9bb4SMartin Matuska ddt_log_destroy(ddt, tx); 1407*e2df9bb4SMartin Matuska 1408*e2df9bb4SMartin Matuska uint64_t count; 1409*e2df9bb4SMartin Matuska ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); 1410*e2df9bb4SMartin Matuska ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, 1411*e2df9bb4SMartin Matuska DDT_DIR_VERSION)); 1412*e2df9bb4SMartin Matuska ASSERT0(zap_contains(ddt->ddt_os, 
ddt->ddt_dir_object, DDT_DIR_FLAGS)); 1413*e2df9bb4SMartin Matuska ASSERT3U(count, ==, 2); 1414*e2df9bb4SMartin Matuska 1415*e2df9bb4SMartin Matuska VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); 1416*e2df9bb4SMartin Matuska VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); 1417*e2df9bb4SMartin Matuska 1418*e2df9bb4SMartin Matuska ddt->ddt_dir_object = 0; 1419*e2df9bb4SMartin Matuska 1420*e2df9bb4SMartin Matuska spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); 1421*e2df9bb4SMartin Matuska } 1422*e2df9bb4SMartin Matuska 1423*e2df9bb4SMartin Matuska /* 1424*e2df9bb4SMartin Matuska * Determine the version, flags and on-disk layout from what's already stored. 1425*e2df9bb4SMartin Matuska * If there's nothing stored, then return ENOENT if new is false; if new is 1426*e2df9bb4SMartin Matuska * true, select based on pool config. 1427*e2df9bb4SMartin Matuska */ 1428*e2df9bb4SMartin Matuska static int 1429*e2df9bb4SMartin Matuska ddt_configure(ddt_t *ddt, boolean_t new) 1430*e2df9bb4SMartin Matuska { 1431*e2df9bb4SMartin Matuska spa_t *spa = ddt->ddt_spa; 1432*e2df9bb4SMartin Matuska char name[DDT_NAMELEN]; 1433*e2df9bb4SMartin Matuska int error; 1434*e2df9bb4SMartin Matuska 1435*e2df9bb4SMartin Matuska ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); 1436*e2df9bb4SMartin Matuska 1437*e2df9bb4SMartin Matuska boolean_t fdt_enabled = 1438*e2df9bb4SMartin Matuska spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); 1439*e2df9bb4SMartin Matuska boolean_t fdt_active = 1440*e2df9bb4SMartin Matuska spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); 1441*e2df9bb4SMartin Matuska 1442*e2df9bb4SMartin Matuska /* 1443*e2df9bb4SMartin Matuska * First, look for the global DDT stats object. If it's not there, then 1444*e2df9bb4SMartin Matuska * there has never been a DDT written before, and we know we're 1445*e2df9bb4SMartin Matuska * starting from scratch. 1446*e2df9bb4SMartin Matuska */ 1447*e2df9bb4SMartin Matuska error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1448*e2df9bb4SMartin Matuska DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 1449*e2df9bb4SMartin Matuska &spa->spa_ddt_stat_object); 1450*e2df9bb4SMartin Matuska if (error != 0) { 1451*e2df9bb4SMartin Matuska if (error != ENOENT) 1452*e2df9bb4SMartin Matuska return (error); 1453*e2df9bb4SMartin Matuska goto not_found; 1454*e2df9bb4SMartin Matuska } 1455*e2df9bb4SMartin Matuska 1456*e2df9bb4SMartin Matuska if (fdt_active) { 1457*e2df9bb4SMartin Matuska /* 1458*e2df9bb4SMartin Matuska * Now look for a DDT directory. If it exists, then it has 1459*e2df9bb4SMartin Matuska * everything we need.
1460*e2df9bb4SMartin Matuska */ 1461*e2df9bb4SMartin Matuska snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, 1462*e2df9bb4SMartin Matuska zio_checksum_table[ddt->ddt_checksum].ci_name); 1463*e2df9bb4SMartin Matuska 1464*e2df9bb4SMartin Matuska error = zap_lookup(spa->spa_meta_objset, 1465*e2df9bb4SMartin Matuska DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, 1466*e2df9bb4SMartin Matuska &ddt->ddt_dir_object); 1467*e2df9bb4SMartin Matuska if (error == 0) { 1468*e2df9bb4SMartin Matuska ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); 1469*e2df9bb4SMartin Matuska 1470*e2df9bb4SMartin Matuska error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, 1471*e2df9bb4SMartin Matuska DDT_DIR_VERSION, sizeof (uint64_t), 1, 1472*e2df9bb4SMartin Matuska &ddt->ddt_version); 1473*e2df9bb4SMartin Matuska if (error != 0) 1474*e2df9bb4SMartin Matuska return (error); 1475*e2df9bb4SMartin Matuska 1476*e2df9bb4SMartin Matuska error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, 1477*e2df9bb4SMartin Matuska DDT_DIR_FLAGS, sizeof (uint64_t), 1, 1478*e2df9bb4SMartin Matuska &ddt->ddt_flags); 1479*e2df9bb4SMartin Matuska if (error != 0) 1480*e2df9bb4SMartin Matuska return (error); 1481*e2df9bb4SMartin Matuska 1482*e2df9bb4SMartin Matuska if (ddt->ddt_version != DDT_VERSION_FDT) { 1483*e2df9bb4SMartin Matuska zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " 1484*e2df9bb4SMartin Matuska "unknown version %llu", spa_name(spa), 1485*e2df9bb4SMartin Matuska name, (u_longlong_t)ddt->ddt_version); 1486*e2df9bb4SMartin Matuska return (SET_ERROR(EINVAL)); 1487*e2df9bb4SMartin Matuska } 1488*e2df9bb4SMartin Matuska 1489*e2df9bb4SMartin Matuska if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { 1490*e2df9bb4SMartin Matuska zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " 1491*e2df9bb4SMartin Matuska "version=%llu unknown flags %llx", 1492*e2df9bb4SMartin Matuska spa_name(spa), name, 1493*e2df9bb4SMartin Matuska (u_longlong_t)ddt->ddt_flags, 1494*e2df9bb4SMartin Matuska (u_longlong_t)ddt->ddt_version); 1495*e2df9bb4SMartin Matuska return (SET_ERROR(EINVAL)); 1496*e2df9bb4SMartin Matuska } 1497*e2df9bb4SMartin Matuska 1498*e2df9bb4SMartin Matuska return (0); 1499*e2df9bb4SMartin Matuska } 1500*e2df9bb4SMartin Matuska if (error != ENOENT) 1501*e2df9bb4SMartin Matuska return (error); 1502*e2df9bb4SMartin Matuska } 1503*e2df9bb4SMartin Matuska 1504*e2df9bb4SMartin Matuska /* Any object in the root indicates a traditional setup. 
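 *
 * Legacy tables keep their objects directly in the MOS directory,
 * named per checksum/type/class, e.g. "DDT-sha256-zap-duplicate", so
 * finding any of them means this table predates the FDT layout.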
*/ 1505*e2df9bb4SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 1506*e2df9bb4SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1507*e2df9bb4SMartin Matuska ddt_object_name(ddt, type, class, name); 1508*e2df9bb4SMartin Matuska uint64_t obj; 1509*e2df9bb4SMartin Matuska error = zap_lookup(spa->spa_meta_objset, 1510*e2df9bb4SMartin Matuska DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1511*e2df9bb4SMartin Matuska 1, &obj); 1512*e2df9bb4SMartin Matuska if (error == ENOENT) 1513*e2df9bb4SMartin Matuska continue; 1514*e2df9bb4SMartin Matuska if (error != 0) 1515*e2df9bb4SMartin Matuska return (error); 1516*e2df9bb4SMartin Matuska 1517*e2df9bb4SMartin Matuska ddt->ddt_version = DDT_VERSION_LEGACY; 1518*e2df9bb4SMartin Matuska ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; 1519*e2df9bb4SMartin Matuska ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; 1520*e2df9bb4SMartin Matuska 1521*e2df9bb4SMartin Matuska return (0); 1522*e2df9bb4SMartin Matuska } 1523*e2df9bb4SMartin Matuska } 1524*e2df9bb4SMartin Matuska 1525*e2df9bb4SMartin Matuska not_found: 1526*e2df9bb4SMartin Matuska if (!new) 1527*e2df9bb4SMartin Matuska return (SET_ERROR(ENOENT)); 1528*e2df9bb4SMartin Matuska 1529*e2df9bb4SMartin Matuska /* Nothing on disk, so set up for the best version we can */ 1530*e2df9bb4SMartin Matuska if (fdt_enabled) { 1531*e2df9bb4SMartin Matuska ddt->ddt_version = DDT_VERSION_FDT; 1532*e2df9bb4SMartin Matuska ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; 1533*e2df9bb4SMartin Matuska ddt->ddt_dir_object = 0; /* create on first use */ 1534*e2df9bb4SMartin Matuska } else { 1535*e2df9bb4SMartin Matuska ddt->ddt_version = DDT_VERSION_LEGACY; 1536*e2df9bb4SMartin Matuska ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; 1537*e2df9bb4SMartin Matuska ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; 1538*e2df9bb4SMartin Matuska } 1539*e2df9bb4SMartin Matuska 1540*e2df9bb4SMartin Matuska return (0); 1541*e2df9bb4SMartin Matuska } 1542*e2df9bb4SMartin Matuska 1543*e2df9bb4SMartin Matuska static void 1544*e2df9bb4SMartin Matuska ddt_table_alloc_kstats(ddt_t *ddt) 1545*e2df9bb4SMartin Matuska { 1546*e2df9bb4SMartin Matuska char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); 1547*e2df9bb4SMartin Matuska char *name = kmem_asprintf("ddt_stats_%s", 1548*e2df9bb4SMartin Matuska zio_checksum_table[ddt->ddt_checksum].ci_name); 1549*e2df9bb4SMartin Matuska 1550*e2df9bb4SMartin Matuska ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED, 1551*e2df9bb4SMartin Matuska sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 1552*e2df9bb4SMartin Matuska if (ddt->ddt_ksp != NULL) { 1553*e2df9bb4SMartin Matuska ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP); 1554*e2df9bb4SMartin Matuska memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t)); 1555*e2df9bb4SMartin Matuska ddt->ddt_ksp->ks_data = dds; 1556*e2df9bb4SMartin Matuska kstat_install(ddt->ddt_ksp); 1557*e2df9bb4SMartin Matuska } 1558*e2df9bb4SMartin Matuska 1559*e2df9bb4SMartin Matuska kmem_strfree(name); 1560*e2df9bb4SMartin Matuska kmem_strfree(mod); 1561eda14cbcSMatt Macy } 1562eda14cbcSMatt Macy 1563eda14cbcSMatt Macy static ddt_t * 1564eda14cbcSMatt Macy ddt_table_alloc(spa_t *spa, enum zio_checksum c) 1565eda14cbcSMatt Macy { 1566eda14cbcSMatt Macy ddt_t *ddt; 1567eda14cbcSMatt Macy 1568eda14cbcSMatt Macy ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); 1569da5137abSMartin Matuska memset(ddt, 0, sizeof (ddt_t)); 1570eda14cbcSMatt Macy 
mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 15714fefe1b7SMartin Matuska avl_create(&ddt->ddt_tree, ddt_key_compare, 1572eda14cbcSMatt Macy sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 15734fefe1b7SMartin Matuska avl_create(&ddt->ddt_repair_tree, ddt_key_compare, 1574eda14cbcSMatt Macy sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 1575*e2df9bb4SMartin Matuska 1576eda14cbcSMatt Macy ddt->ddt_checksum = c; 1577eda14cbcSMatt Macy ddt->ddt_spa = spa; 1578eda14cbcSMatt Macy ddt->ddt_os = spa->spa_meta_objset; 1579*e2df9bb4SMartin Matuska ddt->ddt_version = DDT_VERSION_UNCONFIGURED; 1580*e2df9bb4SMartin Matuska 1581*e2df9bb4SMartin Matuska ddt_log_alloc(ddt); 1582*e2df9bb4SMartin Matuska ddt_table_alloc_kstats(ddt); 1583eda14cbcSMatt Macy 1584eda14cbcSMatt Macy return (ddt); 1585eda14cbcSMatt Macy } 1586eda14cbcSMatt Macy 1587eda14cbcSMatt Macy static void 1588eda14cbcSMatt Macy ddt_table_free(ddt_t *ddt) 1589eda14cbcSMatt Macy { 1590*e2df9bb4SMartin Matuska if (ddt->ddt_ksp != NULL) { 1591*e2df9bb4SMartin Matuska kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); 1592*e2df9bb4SMartin Matuska ddt->ddt_ksp->ks_data = NULL; 1593*e2df9bb4SMartin Matuska kstat_delete(ddt->ddt_ksp); 1594*e2df9bb4SMartin Matuska } 1595*e2df9bb4SMartin Matuska 1596*e2df9bb4SMartin Matuska ddt_log_free(ddt); 15974fefe1b7SMartin Matuska ASSERT0(avl_numnodes(&ddt->ddt_tree)); 15984fefe1b7SMartin Matuska ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); 1599eda14cbcSMatt Macy avl_destroy(&ddt->ddt_tree); 1600eda14cbcSMatt Macy avl_destroy(&ddt->ddt_repair_tree); 1601eda14cbcSMatt Macy mutex_destroy(&ddt->ddt_lock); 1602eda14cbcSMatt Macy kmem_cache_free(ddt_cache, ddt); 1603eda14cbcSMatt Macy } 1604eda14cbcSMatt Macy 1605eda14cbcSMatt Macy void 1606eda14cbcSMatt Macy ddt_create(spa_t *spa) 1607eda14cbcSMatt Macy { 1608eda14cbcSMatt Macy spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 1609eda14cbcSMatt Macy 16104fefe1b7SMartin Matuska for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 16114fefe1b7SMartin Matuska if (DDT_CHECKSUM_VALID(c)) 1612eda14cbcSMatt Macy spa->spa_ddt[c] = ddt_table_alloc(spa, c); 1613eda14cbcSMatt Macy } 16144fefe1b7SMartin Matuska } 1615eda14cbcSMatt Macy 1616eda14cbcSMatt Macy int 1617eda14cbcSMatt Macy ddt_load(spa_t *spa) 1618eda14cbcSMatt Macy { 1619eda14cbcSMatt Macy int error; 1620eda14cbcSMatt Macy 1621eda14cbcSMatt Macy ddt_create(spa); 1622eda14cbcSMatt Macy 1623eda14cbcSMatt Macy error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1624eda14cbcSMatt Macy DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 1625eda14cbcSMatt Macy &spa->spa_ddt_stat_object); 1626eda14cbcSMatt Macy if (error) 1627eda14cbcSMatt Macy return (error == ENOENT ? 
0 : error); 1628eda14cbcSMatt Macy 1629eda14cbcSMatt Macy for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 16304fefe1b7SMartin Matuska if (!DDT_CHECKSUM_VALID(c)) 16314fefe1b7SMartin Matuska continue; 16324fefe1b7SMartin Matuska 1633eda14cbcSMatt Macy ddt_t *ddt = spa->spa_ddt[c]; 1634*e2df9bb4SMartin Matuska error = ddt_configure(ddt, B_FALSE); 1635*e2df9bb4SMartin Matuska if (error == ENOENT) 1636*e2df9bb4SMartin Matuska continue; 1637*e2df9bb4SMartin Matuska if (error != 0) 1638*e2df9bb4SMartin Matuska return (error); 1639*e2df9bb4SMartin Matuska 16404fefe1b7SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 16414fefe1b7SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; 1642eda14cbcSMatt Macy class++) { 1643eda14cbcSMatt Macy error = ddt_object_load(ddt, type, class); 1644eda14cbcSMatt Macy if (error != 0 && error != ENOENT) 1645eda14cbcSMatt Macy return (error); 1646eda14cbcSMatt Macy } 1647eda14cbcSMatt Macy } 1648eda14cbcSMatt Macy 1649*e2df9bb4SMartin Matuska error = ddt_log_load(ddt); 1650*e2df9bb4SMartin Matuska if (error != 0 && error != ENOENT) 1651*e2df9bb4SMartin Matuska return (error); 1652*e2df9bb4SMartin Matuska 1653*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_active_entries, 1654*e2df9bb4SMartin Matuska avl_numnodes(&ddt->ddt_log_active->ddl_tree)); 1655*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_flushing_entries, 1656*e2df9bb4SMartin Matuska avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); 1657*e2df9bb4SMartin Matuska 1658eda14cbcSMatt Macy /* 1659eda14cbcSMatt Macy * Seed the cached histograms. 1660eda14cbcSMatt Macy */ 1661da5137abSMartin Matuska memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 1662eda14cbcSMatt Macy sizeof (ddt->ddt_histogram)); 1663*e2df9bb4SMartin Matuska } 1664*e2df9bb4SMartin Matuska 1665eda14cbcSMatt Macy spa->spa_dedup_dspace = ~0ULL; 1666ce4dcb97SMartin Matuska spa->spa_dedup_dsize = ~0ULL; 1667eda14cbcSMatt Macy 1668eda14cbcSMatt Macy return (0); 1669eda14cbcSMatt Macy } 1670eda14cbcSMatt Macy 1671eda14cbcSMatt Macy void 1672eda14cbcSMatt Macy ddt_unload(spa_t *spa) 1673eda14cbcSMatt Macy { 1674eda14cbcSMatt Macy for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 1675eda14cbcSMatt Macy if (spa->spa_ddt[c]) { 1676eda14cbcSMatt Macy ddt_table_free(spa->spa_ddt[c]); 1677eda14cbcSMatt Macy spa->spa_ddt[c] = NULL; 1678eda14cbcSMatt Macy } 1679eda14cbcSMatt Macy } 1680eda14cbcSMatt Macy } 1681eda14cbcSMatt Macy 1682eda14cbcSMatt Macy boolean_t 16834fefe1b7SMartin Matuska ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) 1684eda14cbcSMatt Macy { 1685eda14cbcSMatt Macy ddt_t *ddt; 16864fefe1b7SMartin Matuska ddt_key_t ddk; 1687eda14cbcSMatt Macy 1688eda14cbcSMatt Macy if (!BP_GET_DEDUP(bp)) 1689eda14cbcSMatt Macy return (B_FALSE); 1690eda14cbcSMatt Macy 1691eda14cbcSMatt Macy if (max_class == DDT_CLASS_UNIQUE) 1692eda14cbcSMatt Macy return (B_TRUE); 1693eda14cbcSMatt Macy 1694eda14cbcSMatt Macy ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; 1695eda14cbcSMatt Macy 16964fefe1b7SMartin Matuska ddt_key_fill(&ddk, bp); 1697eda14cbcSMatt Macy 16984fefe1b7SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 16994fefe1b7SMartin Matuska for (ddt_class_t class = 0; class <= max_class; class++) { 17004fefe1b7SMartin Matuska if (ddt_object_contains(ddt, type, class, &ddk) == 0) 1701eda14cbcSMatt Macy return (B_TRUE); 1702eda14cbcSMatt Macy } 1703eda14cbcSMatt Macy } 1704eda14cbcSMatt Macy 1705eda14cbcSMatt Macy return (B_FALSE); 1706eda14cbcSMatt 
Macy } 1707eda14cbcSMatt Macy 1708eda14cbcSMatt Macy ddt_entry_t * 1709eda14cbcSMatt Macy ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 1710eda14cbcSMatt Macy { 1711eda14cbcSMatt Macy ddt_key_t ddk; 1712eda14cbcSMatt Macy ddt_entry_t *dde; 1713eda14cbcSMatt Macy 1714eda14cbcSMatt Macy ddt_key_fill(&ddk, bp); 1715eda14cbcSMatt Macy 1716*e2df9bb4SMartin Matuska dde = ddt_alloc(ddt, &ddk); 1717*e2df9bb4SMartin Matuska ddt_alloc_entry_io(dde); 1718eda14cbcSMatt Macy 17194fefe1b7SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 17204fefe1b7SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1721eda14cbcSMatt Macy /* 1722eda14cbcSMatt Macy * We can only do repair if there are multiple copies 1723eda14cbcSMatt Macy * of the block. For anything in the UNIQUE class, 1724eda14cbcSMatt Macy * there's definitely only one copy, so don't even try. 1725eda14cbcSMatt Macy */ 1726eda14cbcSMatt Macy if (class != DDT_CLASS_UNIQUE && 1727eda14cbcSMatt Macy ddt_object_lookup(ddt, type, class, dde) == 0) 1728eda14cbcSMatt Macy return (dde); 1729eda14cbcSMatt Macy } 1730eda14cbcSMatt Macy } 1731eda14cbcSMatt Macy 1732*e2df9bb4SMartin Matuska memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); 1733eda14cbcSMatt Macy 1734eda14cbcSMatt Macy return (dde); 1735eda14cbcSMatt Macy } 1736eda14cbcSMatt Macy 1737eda14cbcSMatt Macy void 1738eda14cbcSMatt Macy ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 1739eda14cbcSMatt Macy { 1740eda14cbcSMatt Macy avl_index_t where; 1741eda14cbcSMatt Macy 1742eda14cbcSMatt Macy ddt_enter(ddt); 1743eda14cbcSMatt Macy 1744*e2df9bb4SMartin Matuska if (dde->dde_io->dde_repair_abd != NULL && 1745*e2df9bb4SMartin Matuska spa_writeable(ddt->ddt_spa) && 1746eda14cbcSMatt Macy avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 1747eda14cbcSMatt Macy avl_insert(&ddt->ddt_repair_tree, dde, where); 1748eda14cbcSMatt Macy else 1749*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 1750eda14cbcSMatt Macy 1751eda14cbcSMatt Macy ddt_exit(ddt); 1752eda14cbcSMatt Macy } 1753eda14cbcSMatt Macy 1754eda14cbcSMatt Macy static void 1755eda14cbcSMatt Macy ddt_repair_entry_done(zio_t *zio) 1756eda14cbcSMatt Macy { 1757*e2df9bb4SMartin Matuska ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1758eda14cbcSMatt Macy ddt_entry_t *rdde = zio->io_private; 1759eda14cbcSMatt Macy 1760*e2df9bb4SMartin Matuska ddt_free(ddt, rdde); 1761eda14cbcSMatt Macy } 1762eda14cbcSMatt Macy 1763eda14cbcSMatt Macy static void 1764eda14cbcSMatt Macy ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 1765eda14cbcSMatt Macy { 1766eda14cbcSMatt Macy ddt_key_t *ddk = &dde->dde_key; 1767eda14cbcSMatt Macy ddt_key_t *rddk = &rdde->dde_key; 1768eda14cbcSMatt Macy zio_t *zio; 1769eda14cbcSMatt Macy blkptr_t blk; 1770eda14cbcSMatt Macy 1771eda14cbcSMatt Macy zio = zio_null(rio, rio->io_spa, NULL, 1772eda14cbcSMatt Macy ddt_repair_entry_done, rdde, rio->io_flags); 1773eda14cbcSMatt Macy 1774*e2df9bb4SMartin Matuska for (int p = 0; p < DDT_NPHYS(ddt); p++) { 1775*e2df9bb4SMartin Matuska ddt_univ_phys_t *ddp = dde->dde_phys; 1776*e2df9bb4SMartin Matuska ddt_univ_phys_t *rddp = rdde->dde_phys; 1777*e2df9bb4SMartin Matuska ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); 1778*e2df9bb4SMartin Matuska uint64_t phys_birth = ddt_phys_birth(ddp, v); 1779*e2df9bb4SMartin Matuska const dva_t *dvas, *rdvas; 1780*e2df9bb4SMartin Matuska 1781*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_FLAT) { 1782*e2df9bb4SMartin Matuska dvas = ddp->ddp_flat.ddp_dva; 1783*e2df9bb4SMartin Matuska 
rdvas = rddp->ddp_flat.ddp_dva; 1784*e2df9bb4SMartin Matuska } else { 1785*e2df9bb4SMartin Matuska dvas = ddp->ddp_trad[p].ddp_dva; 1786*e2df9bb4SMartin Matuska rdvas = rddp->ddp_trad[p].ddp_dva; 1787*e2df9bb4SMartin Matuska } 1788*e2df9bb4SMartin Matuska 1789*e2df9bb4SMartin Matuska if (phys_birth == 0 || 1790*e2df9bb4SMartin Matuska phys_birth != ddt_phys_birth(rddp, v) || 1791*e2df9bb4SMartin Matuska memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) 1792eda14cbcSMatt Macy continue; 1793*e2df9bb4SMartin Matuska 1794*e2df9bb4SMartin Matuska ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); 1795eda14cbcSMatt Macy zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 1796*e2df9bb4SMartin Matuska rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), 1797*e2df9bb4SMartin Matuska NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, 1798*e2df9bb4SMartin Matuska ZIO_DDT_CHILD_FLAGS(zio), NULL)); 1799eda14cbcSMatt Macy } 1800eda14cbcSMatt Macy 1801eda14cbcSMatt Macy zio_nowait(zio); 1802eda14cbcSMatt Macy } 1803eda14cbcSMatt Macy 1804eda14cbcSMatt Macy static void 1805eda14cbcSMatt Macy ddt_repair_table(ddt_t *ddt, zio_t *rio) 1806eda14cbcSMatt Macy { 1807eda14cbcSMatt Macy spa_t *spa = ddt->ddt_spa; 1808eda14cbcSMatt Macy ddt_entry_t *dde, *rdde_next, *rdde; 1809eda14cbcSMatt Macy avl_tree_t *t = &ddt->ddt_repair_tree; 1810eda14cbcSMatt Macy blkptr_t blk; 1811eda14cbcSMatt Macy 1812eda14cbcSMatt Macy if (spa_sync_pass(spa) > 1) 1813eda14cbcSMatt Macy return; 1814eda14cbcSMatt Macy 1815eda14cbcSMatt Macy ddt_enter(ddt); 1816eda14cbcSMatt Macy for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 1817eda14cbcSMatt Macy rdde_next = AVL_NEXT(t, rdde); 1818eda14cbcSMatt Macy avl_remove(&ddt->ddt_repair_tree, rdde); 1819eda14cbcSMatt Macy ddt_exit(ddt); 1820*e2df9bb4SMartin Matuska ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, 1821*e2df9bb4SMartin Matuska DDT_PHYS_NONE, &blk); 1822eda14cbcSMatt Macy dde = ddt_repair_start(ddt, &blk); 1823eda14cbcSMatt Macy ddt_repair_entry(ddt, dde, rdde, rio); 1824eda14cbcSMatt Macy ddt_repair_done(ddt, dde); 1825eda14cbcSMatt Macy ddt_enter(ddt); 1826eda14cbcSMatt Macy } 1827eda14cbcSMatt Macy ddt_exit(ddt); 1828eda14cbcSMatt Macy } 1829eda14cbcSMatt Macy 1830eda14cbcSMatt Macy static void 1831*e2df9bb4SMartin Matuska ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) 1832*e2df9bb4SMartin Matuska { 1833*e2df9bb4SMartin Matuska /* 1834*e2df9bb4SMartin Matuska * Count all the entries stored for each type/class, and update the 1835*e2df9bb4SMartin Matuska * stats within (ddt_object_sync()). If there are no entries for the 1836*e2df9bb4SMartin Matuska * type/class, the whole object is removed. If all objects for the DDT 1837*e2df9bb4SMartin Matuska * are removed, its containing dir is removed, effectively resetting 1838*e2df9bb4SMartin Matuska * the entire DDT to an empty slate.
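 *
 * (The refreshed object stats, together with the histogram cache
 * updated below, are what back the pool's dedupratio property and the
 * DDT summaries printed by tools like zdb -DD.)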
1839*e2df9bb4SMartin Matuska */ 1840*e2df9bb4SMartin Matuska uint64_t count = 0; 1841*e2df9bb4SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 1842*e2df9bb4SMartin Matuska uint64_t add, tcount = 0; 1843*e2df9bb4SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1844*e2df9bb4SMartin Matuska if (ddt_object_exists(ddt, type, class)) { 1845*e2df9bb4SMartin Matuska ddt_object_sync(ddt, type, class, tx); 1846*e2df9bb4SMartin Matuska VERIFY0(ddt_object_count(ddt, type, class, 1847*e2df9bb4SMartin Matuska &add)); 1848*e2df9bb4SMartin Matuska tcount += add; 1849*e2df9bb4SMartin Matuska } 1850*e2df9bb4SMartin Matuska } 1851*e2df9bb4SMartin Matuska for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1852*e2df9bb4SMartin Matuska if (tcount == 0 && ddt_object_exists(ddt, type, class)) 1853*e2df9bb4SMartin Matuska ddt_object_destroy(ddt, type, class, tx); 1854*e2df9bb4SMartin Matuska } 1855*e2df9bb4SMartin Matuska count += tcount; 1856*e2df9bb4SMartin Matuska } 1857*e2df9bb4SMartin Matuska 1858*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_LOG) { 1859*e2df9bb4SMartin Matuska /* Include logged entries in the total count */ 1860*e2df9bb4SMartin Matuska count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); 1861*e2df9bb4SMartin Matuska count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); 1862*e2df9bb4SMartin Matuska } 1863*e2df9bb4SMartin Matuska 1864*e2df9bb4SMartin Matuska if (count == 0) { 1865*e2df9bb4SMartin Matuska /* 1866*e2df9bb4SMartin Matuska * No entries left on the DDT, so reset the version for next 1867*e2df9bb4SMartin Matuska * time. This allows us to handle the feature being changed 1868*e2df9bb4SMartin Matuska * since the DDT was originally created. New entries should get 1869*e2df9bb4SMartin Matuska * whatever the feature currently demands. 1870*e2df9bb4SMartin Matuska */ 1871*e2df9bb4SMartin Matuska if (ddt->ddt_version == DDT_VERSION_FDT) 1872*e2df9bb4SMartin Matuska ddt_destroy_dir(ddt, tx); 1873*e2df9bb4SMartin Matuska 1874*e2df9bb4SMartin Matuska ddt->ddt_version = DDT_VERSION_UNCONFIGURED; 1875*e2df9bb4SMartin Matuska ddt->ddt_flags = 0; 1876*e2df9bb4SMartin Matuska } 1877*e2df9bb4SMartin Matuska 1878*e2df9bb4SMartin Matuska memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 1879*e2df9bb4SMartin Matuska sizeof (ddt->ddt_histogram)); 1880*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dspace = ~0ULL; 1881*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dsize = ~0ULL; 1882*e2df9bb4SMartin Matuska } 1883*e2df9bb4SMartin Matuska 1884*e2df9bb4SMartin Matuska static void 1885*e2df9bb4SMartin Matuska ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) 1886eda14cbcSMatt Macy { 1887eda14cbcSMatt Macy dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; 1888eda14cbcSMatt Macy 1889eda14cbcSMatt Macy /* 1890*e2df9bb4SMartin Matuska * Compute the target class, so we can decide whether or not to inform 1891*e2df9bb4SMartin Matuska * the scrub traversal (below). Note that we don't store this in the 1892*e2df9bb4SMartin Matuska * entry, as it might change multiple times before finally being 1893*e2df9bb4SMartin Matuska * committed (if we're logging). Instead, we recompute it in 1894*e2df9bb4SMartin Matuska * ddt_sync_entry(). 1895eda14cbcSMatt Macy */ 1896*e2df9bb4SMartin Matuska uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); 1897*e2df9bb4SMartin Matuska ddt_class_t nclass = 1898*e2df9bb4SMartin Matuska (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; 1899*e2df9bb4SMartin Matuska 1900*e2df9bb4SMartin Matuska /* 1901*e2df9bb4SMartin Matuska * If the class changes, the order that we scan this bp changes. If it 1902*e2df9bb4SMartin Matuska * decreases, we could miss it, so scan it right now. (This covers both 1903*e2df9bb4SMartin Matuska * class changing while we are doing ddt_walk(), and when we are 1904*e2df9bb4SMartin Matuska * traversing.) 1905*e2df9bb4SMartin Matuska * 1906*e2df9bb4SMartin Matuska * We also do this when the refcnt goes to zero, because that change is 1907*e2df9bb4SMartin Matuska * only in the log so far; the blocks on disk won't be freed until 1908*e2df9bb4SMartin Matuska * the log is flushed, and the refcnt might increase before that. If it 1909*e2df9bb4SMartin Matuska * does, then we could miss it in the same way. 1910*e2df9bb4SMartin Matuska */ 1911*e2df9bb4SMartin Matuska if (refcnt == 0 || nclass < ddlwe->ddlwe_class) 1912*e2df9bb4SMartin Matuska dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, 1913*e2df9bb4SMartin Matuska ddlwe, tx); 1914*e2df9bb4SMartin Matuska } 1915*e2df9bb4SMartin Matuska 1916*e2df9bb4SMartin Matuska static void 1917*e2df9bb4SMartin Matuska ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, 1918*e2df9bb4SMartin Matuska ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) 1919*e2df9bb4SMartin Matuska { 1920*e2df9bb4SMartin Matuska ddt_key_t *ddk = &ddlwe->ddlwe_key; 1921*e2df9bb4SMartin Matuska ddt_type_t ntype = DDT_TYPE_DEFAULT; 1922*e2df9bb4SMartin Matuska uint64_t refcnt = 0; 1923*e2df9bb4SMartin Matuska 1924*e2df9bb4SMartin Matuska /* 1925*e2df9bb4SMartin Matuska * Compute the total refcnt. Along the way, issue frees for any DVAs 1926*e2df9bb4SMartin Matuska * we no longer want. 1927*e2df9bb4SMartin Matuska */ 1928*e2df9bb4SMartin Matuska for (int p = 0; p < DDT_NPHYS(ddt); p++) { 1929*e2df9bb4SMartin Matuska ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; 1930*e2df9bb4SMartin Matuska ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); 1931*e2df9bb4SMartin Matuska uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); 1932*e2df9bb4SMartin Matuska 1933*e2df9bb4SMartin Matuska if (ddt_phys_birth(ddp, v) == 0) { 1934*e2df9bb4SMartin Matuska ASSERT0(phys_refcnt); 1935eda14cbcSMatt Macy continue; 1936eda14cbcSMatt Macy } 1937*e2df9bb4SMartin Matuska if (DDT_PHYS_IS_DITTO(ddt, p)) { 1938*e2df9bb4SMartin Matuska /* 1939*e2df9bb4SMartin Matuska * We don't want to keep any obsolete slots (eg ditto), 1940*e2df9bb4SMartin Matuska * regardless of their refcount, but we don't want to 1941*e2df9bb4SMartin Matuska * leak them either. So, free them. 1942*e2df9bb4SMartin Matuska */ 1943*e2df9bb4SMartin Matuska ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); 1944*e2df9bb4SMartin Matuska continue; 1945*e2df9bb4SMartin Matuska } 1946*e2df9bb4SMartin Matuska if (phys_refcnt == 0) 1947*e2df9bb4SMartin Matuska /* No remaining references, free it! */ 1948*e2df9bb4SMartin Matuska ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); 1949*e2df9bb4SMartin Matuska refcnt += phys_refcnt; 1950eda14cbcSMatt Macy } 1951eda14cbcSMatt Macy 1952*e2df9bb4SMartin Matuska /* Select the best class for the entry. */ 1953*e2df9bb4SMartin Matuska ddt_class_t nclass = 1954*e2df9bb4SMartin Matuska (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; 1955eda14cbcSMatt Macy 1956*e2df9bb4SMartin Matuska /* 1957*e2df9bb4SMartin Matuska * If an existing entry changed type or class, or its refcount reached 1958*e2df9bb4SMartin Matuska * zero, delete it from the DDT object 1959*e2df9bb4SMartin Matuska */ 1960eda14cbcSMatt Macy if (otype != DDT_TYPES && 1961*e2df9bb4SMartin Matuska (otype != ntype || oclass != nclass || refcnt == 0)) { 19624fefe1b7SMartin Matuska VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); 1963*e2df9bb4SMartin Matuska ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); 1964eda14cbcSMatt Macy } 1965eda14cbcSMatt Macy 1966*e2df9bb4SMartin Matuska /* 1967*e2df9bb4SMartin Matuska * Add or update the entry 1968*e2df9bb4SMartin Matuska */ 1969*e2df9bb4SMartin Matuska if (refcnt != 0) { 1970*e2df9bb4SMartin Matuska ddt_histogram_t *ddh = 1971*e2df9bb4SMartin Matuska &ddt->ddt_histogram[ntype][nclass]; 1972*e2df9bb4SMartin Matuska 1973*e2df9bb4SMartin Matuska ddt_histogram_add_entry(ddt, ddh, ddlwe); 1974*e2df9bb4SMartin Matuska 1975eda14cbcSMatt Macy if (!ddt_object_exists(ddt, ntype, nclass)) 1976eda14cbcSMatt Macy ddt_object_create(ddt, ntype, nclass, tx); 1977*e2df9bb4SMartin Matuska VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); 1978*e2df9bb4SMartin Matuska } 1979*e2df9bb4SMartin Matuska } 1980*e2df9bb4SMartin Matuska 1981*e2df9bb4SMartin Matuska /* Calculate an exponential weighted moving average, lower limited to zero */ 1982*e2df9bb4SMartin Matuska static inline int32_t 1983*e2df9bb4SMartin Matuska _ewma(int32_t val, int32_t prev, uint32_t weight) 1984*e2df9bb4SMartin Matuska { 1985*e2df9bb4SMartin Matuska ASSERT3U(val, >=, 0); 1986*e2df9bb4SMartin Matuska ASSERT3U(prev, >=, 0); 1987*e2df9bb4SMartin Matuska const int32_t new = 1988*e2df9bb4SMartin Matuska MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); 1989*e2df9bb4SMartin Matuska ASSERT3U(new, >=, 0); 1990*e2df9bb4SMartin Matuska return (new); 1991*e2df9bb4SMartin Matuska } 1992*e2df9bb4SMartin Matuska 1993*e2df9bb4SMartin Matuska /* Returns true if done for this txg */ 1994*e2df9bb4SMartin Matuska static boolean_t 1995*e2df9bb4SMartin Matuska ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) 1996*e2df9bb4SMartin Matuska { 1997*e2df9bb4SMartin Matuska if (ddt->ddt_flush_pass == 0) { 1998*e2df9bb4SMartin Matuska if (spa_sync_pass(ddt->ddt_spa) == 1) { 1999*e2df9bb4SMartin Matuska /* First run this txg, get set up */ 2000*e2df9bb4SMartin Matuska ddt->ddt_flush_start = gethrtime(); 2001*e2df9bb4SMartin Matuska ddt->ddt_flush_count = 0; 2002eda14cbcSMatt Macy 2003eda14cbcSMatt Macy /* 2004*e2df9bb4SMartin Matuska * How many entries we need to flush. We want to at 2005*e2df9bb4SMartin Matuska * least match the ingest rate. 2006eda14cbcSMatt Macy */ 2007*e2df9bb4SMartin Matuska ddt->ddt_flush_min = MAX( 2008*e2df9bb4SMartin Matuska ddt->ddt_log_ingest_rate, 2009*e2df9bb4SMartin Matuska zfs_dedup_log_flush_entries_min); 2010*e2df9bb4SMartin Matuska 2011*e2df9bb4SMartin Matuska /* 2012*e2df9bb4SMartin Matuska * If we've been asked to flush everything in a hurry, 2013*e2df9bb4SMartin Matuska * try to dump as much as possible on this txg. In 2014*e2df9bb4SMartin Matuska * this case we're only limited by time, not amount. 
2015*e2df9bb4SMartin Matuska */ 2016*e2df9bb4SMartin Matuska if (ddt->ddt_flush_force_txg > 0) 2017*e2df9bb4SMartin Matuska ddt->ddt_flush_min = 2018*e2df9bb4SMartin Matuska MAX(ddt->ddt_flush_min, avl_numnodes( 2019*e2df9bb4SMartin Matuska &ddt->ddt_log_flushing->ddl_tree)); 2020*e2df9bb4SMartin Matuska } else { 2021*e2df9bb4SMartin Matuska /* We already decided we're done for this txg */ 2022*e2df9bb4SMartin Matuska return (B_FALSE); 2023eda14cbcSMatt Macy } 2024*e2df9bb4SMartin Matuska } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { 2025*e2df9bb4SMartin Matuska /* 2026*e2df9bb4SMartin Matuska * We already did some flushing on this pass, skip it. This 2027*e2df9bb4SMartin Matuska * happens when dsl_process_async_destroys() runs during a scan 2028*e2df9bb4SMartin Matuska * (on pass 1) and does an additional ddt_sync() to update 2029*e2df9bb4SMartin Matuska * freed blocks. 2030*e2df9bb4SMartin Matuska */ 2031*e2df9bb4SMartin Matuska return (B_FALSE); 2032*e2df9bb4SMartin Matuska } 2033*e2df9bb4SMartin Matuska 2034*e2df9bb4SMartin Matuska if (spa_sync_pass(ddt->ddt_spa) > 2035*e2df9bb4SMartin Matuska MAX(zfs_dedup_log_flush_passes_max, 1)) { 2036*e2df9bb4SMartin Matuska /* Too many passes this txg, defer until next. */ 2037*e2df9bb4SMartin Matuska ddt->ddt_flush_pass = 0; 2038*e2df9bb4SMartin Matuska return (B_TRUE); 2039*e2df9bb4SMartin Matuska } 2040*e2df9bb4SMartin Matuska 2041*e2df9bb4SMartin Matuska if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { 2042*e2df9bb4SMartin Matuska /* Nothing to flush, done for this txg. */ 2043*e2df9bb4SMartin Matuska ddt->ddt_flush_pass = 0; 2044*e2df9bb4SMartin Matuska return (B_TRUE); 2045*e2df9bb4SMartin Matuska } 2046*e2df9bb4SMartin Matuska 2047*e2df9bb4SMartin Matuska uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? 2048*e2df9bb4SMartin Matuska MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), 2049*e2df9bb4SMartin Matuska SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); 2050*e2df9bb4SMartin Matuska 2051*e2df9bb4SMartin Matuska uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; 2052*e2df9bb4SMartin Matuska 2053*e2df9bb4SMartin Matuska if (elapsed_time >= target_time) { 2054*e2df9bb4SMartin Matuska /* Too long since we started, done for this txg. */ 2055*e2df9bb4SMartin Matuska ddt->ddt_flush_pass = 0; 2056*e2df9bb4SMartin Matuska return (B_TRUE); 2057*e2df9bb4SMartin Matuska } 2058*e2df9bb4SMartin Matuska 2059*e2df9bb4SMartin Matuska ddt->ddt_flush_pass++; 2060*e2df9bb4SMartin Matuska ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); 2061*e2df9bb4SMartin Matuska 2062*e2df9bb4SMartin Matuska /* 2063*e2df9bb4SMartin Matuska * Estimate how much time we'll need to flush the remaining entries 2064*e2df9bb4SMartin Matuska * based on how long it normally takes. 
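 *
 * For example (illustrative numbers): if earlier passes this txg
 * flushed 500 entries in 50ms and ddt_flush_min says another 1000
 * entries are still owed, the estimate is 1000 * 50ms / 500 = 100ms.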
2065*e2df9bb4SMartin Matuska */ 2066*e2df9bb4SMartin Matuska uint32_t want_time; 2067*e2df9bb4SMartin Matuska if (ddt->ddt_flush_pass == 1) { 2068*e2df9bb4SMartin Matuska /* First pass, use the average time/entries */ 2069*e2df9bb4SMartin Matuska if (ddt->ddt_log_flush_rate == 0) 2070*e2df9bb4SMartin Matuska /* Zero rate, just assume the whole time */ 2071*e2df9bb4SMartin Matuska want_time = target_time; 2072*e2df9bb4SMartin Matuska else 2073*e2df9bb4SMartin Matuska want_time = ddt->ddt_flush_min * 2074*e2df9bb4SMartin Matuska ddt->ddt_log_flush_time_rate / 2075*e2df9bb4SMartin Matuska ddt->ddt_log_flush_rate; 2076*e2df9bb4SMartin Matuska } else { 2077*e2df9bb4SMartin Matuska /* Later pass, calculate from this txg so far */ 2078*e2df9bb4SMartin Matuska want_time = ddt->ddt_flush_min * 2079*e2df9bb4SMartin Matuska elapsed_time / ddt->ddt_flush_count; 2080*e2df9bb4SMartin Matuska } 2081*e2df9bb4SMartin Matuska 2082*e2df9bb4SMartin Matuska /* Figure out how much time we have left */ 2083*e2df9bb4SMartin Matuska uint32_t remain_time = target_time - elapsed_time; 2084*e2df9bb4SMartin Matuska 2085*e2df9bb4SMartin Matuska /* Smear the remaining entries over the remaining passes. */ 2086*e2df9bb4SMartin Matuska uint32_t nentries = ddt->ddt_flush_min / 2087*e2df9bb4SMartin Matuska (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); 2088*e2df9bb4SMartin Matuska if (want_time > remain_time) { 2089*e2df9bb4SMartin Matuska /* 2090*e2df9bb4SMartin Matuska * We're behind; try to catch up a bit by doubling the amount 2091*e2df9bb4SMartin Matuska * this pass. If we're behind that means we're in a later 2092*e2df9bb4SMartin Matuska * pass and likely have most of the remaining time to 2093*e2df9bb4SMartin Matuska * ourselves. If we're in the last couple of passes, then 2094*e2df9bb4SMartin Matuska * doubling might just take us over the timeout, but probably 2095*e2df9bb4SMartin Matuska * not be much, and it stops us falling behind. If we're 2096*e2df9bb4SMartin Matuska * in the middle passes, there'll be more to do, but it 2097*e2df9bb4SMartin Matuska * might just help us catch up a bit and we'll recalculate on 2098*e2df9bb4SMartin Matuska * the next pass anyway. 2099*e2df9bb4SMartin Matuska */ 2100*e2df9bb4SMartin Matuska nentries = MIN(ddt->ddt_flush_min, nentries*2); 2101*e2df9bb4SMartin Matuska } 2102*e2df9bb4SMartin Matuska 2103*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 2104*e2df9bb4SMartin Matuska uint32_t count = 0; 2105*e2df9bb4SMartin Matuska while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { 2106*e2df9bb4SMartin Matuska ddt_sync_flush_entry(ddt, &ddlwe, 2107*e2df9bb4SMartin Matuska ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); 2108*e2df9bb4SMartin Matuska 2109*e2df9bb4SMartin Matuska /* End this pass if we've synced as much as we need to. 
*/ 2110*e2df9bb4SMartin Matuska if (++count >= nentries) 2111*e2df9bb4SMartin Matuska break; 2112*e2df9bb4SMartin Matuska } 2113*e2df9bb4SMartin Matuska ddt->ddt_flush_count += count; 2114*e2df9bb4SMartin Matuska ddt->ddt_flush_min -= count; 2115*e2df9bb4SMartin Matuska 2116*e2df9bb4SMartin Matuska if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { 2117*e2df9bb4SMartin Matuska /* We emptied it, so truncate on-disk */ 2118*e2df9bb4SMartin Matuska DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); 2119*e2df9bb4SMartin Matuska ddt_log_truncate(ddt, tx); 2120*e2df9bb4SMartin Matuska /* No more passes needed this txg */ 2121*e2df9bb4SMartin Matuska ddt->ddt_flush_pass = 0; 2122*e2df9bb4SMartin Matuska } else { 2123*e2df9bb4SMartin Matuska /* More to do next time, save checkpoint */ 2124*e2df9bb4SMartin Matuska DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); 2125*e2df9bb4SMartin Matuska ddt_log_checkpoint(ddt, &ddlwe, tx); 2126*e2df9bb4SMartin Matuska } 2127*e2df9bb4SMartin Matuska 2128*e2df9bb4SMartin Matuska ddt_sync_update_stats(ddt, tx); 2129*e2df9bb4SMartin Matuska 2130*e2df9bb4SMartin Matuska return (ddt->ddt_flush_pass == 0); 2131*e2df9bb4SMartin Matuska } 2132*e2df9bb4SMartin Matuska 2133*e2df9bb4SMartin Matuska static inline void 2134*e2df9bb4SMartin Matuska ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) 2135*e2df9bb4SMartin Matuska { 2136*e2df9bb4SMartin Matuska /* 2137*e2df9bb4SMartin Matuska * If we're not forcing flush, and not being asked to start, then 2138*e2df9bb4SMartin Matuska * there's nothing more to do. 2139*e2df9bb4SMartin Matuska */ 2140*e2df9bb4SMartin Matuska if (txg == 0) { 2141*e2df9bb4SMartin Matuska /* Update requested, are we currently forcing flush? */ 2142*e2df9bb4SMartin Matuska if (ddt->ddt_flush_force_txg == 0) 2143*e2df9bb4SMartin Matuska return; 2144*e2df9bb4SMartin Matuska txg = ddt->ddt_flush_force_txg; 2145*e2df9bb4SMartin Matuska } 2146*e2df9bb4SMartin Matuska 2147*e2df9bb4SMartin Matuska /* 2148*e2df9bb4SMartin Matuska * If either of the logs has unflushed entries before 2149*e2df9bb4SMartin Matuska * the wanted txg, set the force txg; otherwise clear it. 2150*e2df9bb4SMartin Matuska */ 2151*e2df9bb4SMartin Matuska 2152*e2df9bb4SMartin Matuska if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) && 2153*e2df9bb4SMartin Matuska ddt->ddt_log_active->ddl_first_txg <= txg) || 2154*e2df9bb4SMartin Matuska (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && 2155*e2df9bb4SMartin Matuska ddt->ddt_log_flushing->ddl_first_txg <= txg)) { 2156*e2df9bb4SMartin Matuska ddt->ddt_flush_force_txg = txg; 2157*e2df9bb4SMartin Matuska return; 2158*e2df9bb4SMartin Matuska } 2159*e2df9bb4SMartin Matuska 2160*e2df9bb4SMartin Matuska /* 2161*e2df9bb4SMartin Matuska * Nothing to flush behind the given txg, so we can clear force flush 2162*e2df9bb4SMartin Matuska * state. 2163*e2df9bb4SMartin Matuska */ 2164*e2df9bb4SMartin Matuska ddt->ddt_flush_force_txg = 0; 2165*e2df9bb4SMartin Matuska } 2166*e2df9bb4SMartin Matuska 2167*e2df9bb4SMartin Matuska static void 2168*e2df9bb4SMartin Matuska ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) 2169*e2df9bb4SMartin Matuska { 2170*e2df9bb4SMartin Matuska ASSERT(avl_is_empty(&ddt->ddt_tree)); 2171*e2df9bb4SMartin Matuska 2172*e2df9bb4SMartin Matuska /* Don't do any flushing when the pool is ready to shut down */ 2173*e2df9bb4SMartin Matuska if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) 2174*e2df9bb4SMartin Matuska return; 2175*e2df9bb4SMartin Matuska 2176*e2df9bb4SMartin Matuska /* Try to flush some.
*/ 2177*e2df9bb4SMartin Matuska if (!ddt_sync_flush_log_incremental(ddt, tx)) 2178*e2df9bb4SMartin Matuska /* More to do next time */ 2179*e2df9bb4SMartin Matuska return; 2180*e2df9bb4SMartin Matuska 2181*e2df9bb4SMartin Matuska /* No more flushing this txg, so we can do end-of-txg housekeeping */ 2182*e2df9bb4SMartin Matuska 2183*e2df9bb4SMartin Matuska if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && 2184*e2df9bb4SMartin Matuska !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { 2185*e2df9bb4SMartin Matuska /* 2186*e2df9bb4SMartin Matuska * No more to flush, and the active list has stuff, so 2187*e2df9bb4SMartin Matuska * try to swap the logs for next time. 2188*e2df9bb4SMartin Matuska */ 2189*e2df9bb4SMartin Matuska if (ddt_log_swap(ddt, tx)) { 2190*e2df9bb4SMartin Matuska DDT_KSTAT_ZERO(ddt, dds_log_active_entries); 2191*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_flushing_entries, 2192*e2df9bb4SMartin Matuska avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); 2193*e2df9bb4SMartin Matuska } 2194*e2df9bb4SMartin Matuska } 2195*e2df9bb4SMartin Matuska 2196*e2df9bb4SMartin Matuska /* If force flush is no longer necessary, turn it off. */ 2197*e2df9bb4SMartin Matuska ddt_flush_force_update_txg(ddt, 0); 2198*e2df9bb4SMartin Matuska 2199*e2df9bb4SMartin Matuska /* 2200*e2df9bb4SMartin Matuska * Update flush rate. This is an exponential weighted moving average of 2201*e2df9bb4SMartin Matuska * the number of entries flushed over recent txgs. 2202*e2df9bb4SMartin Matuska */ 2203*e2df9bb4SMartin Matuska ddt->ddt_log_flush_rate = _ewma( 2204*e2df9bb4SMartin Matuska ddt->ddt_flush_count, ddt->ddt_log_flush_rate, 2205*e2df9bb4SMartin Matuska zfs_dedup_log_flush_flow_rate_txgs); 2206*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); 2207*e2df9bb4SMartin Matuska 2208*e2df9bb4SMartin Matuska /* 2209*e2df9bb4SMartin Matuska * Update flush time rate. This is an exponential weighted moving 2210*e2df9bb4SMartin Matuska * average of the total time taken to flush over recent txgs. 
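 * (Both rates use the same smoothing; roughly new = old + (sample - old) / N,
 * with N == zfs_dedup_log_flush_flow_rate_txgs. See _ewma() for the exact
 * integer form; this is only the intuition.)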
2211*e2df9bb4SMartin Matuska */ 2212*e2df9bb4SMartin Matuska ddt->ddt_log_flush_time_rate = _ewma( 2213*e2df9bb4SMartin Matuska ddt->ddt_log_flush_time_rate, 2214*e2df9bb4SMartin Matuska ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), 2215*e2df9bb4SMartin Matuska zfs_dedup_log_flush_flow_rate_txgs); 2216*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, 2217*e2df9bb4SMartin Matuska ddt->ddt_log_flush_time_rate); 2218*e2df9bb4SMartin Matuska } 2219*e2df9bb4SMartin Matuska 2220*e2df9bb4SMartin Matuska static void 2221*e2df9bb4SMartin Matuska ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) 2222*e2df9bb4SMartin Matuska { 2223*e2df9bb4SMartin Matuska uint64_t count = avl_numnodes(&ddt->ddt_tree); 2224*e2df9bb4SMartin Matuska 2225*e2df9bb4SMartin Matuska if (count > 0) { 2226*e2df9bb4SMartin Matuska ddt_log_update_t dlu = {0}; 2227*e2df9bb4SMartin Matuska ddt_log_begin(ddt, count, tx, &dlu); 2228*e2df9bb4SMartin Matuska 2229*e2df9bb4SMartin Matuska ddt_entry_t *dde; 2230*e2df9bb4SMartin Matuska void *cookie = NULL; 2231*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 2232*e2df9bb4SMartin Matuska while ((dde = 2233*e2df9bb4SMartin Matuska avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 2234*e2df9bb4SMartin Matuska ASSERT(dde->dde_flags & DDE_FLAG_LOADED); 2235*e2df9bb4SMartin Matuska DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); 2236*e2df9bb4SMartin Matuska ddt_log_entry(ddt, &ddlwe, &dlu); 2237*e2df9bb4SMartin Matuska ddt_sync_scan_entry(ddt, &ddlwe, tx); 2238*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 2239*e2df9bb4SMartin Matuska } 2240*e2df9bb4SMartin Matuska 2241*e2df9bb4SMartin Matuska ddt_log_commit(ddt, &dlu); 2242*e2df9bb4SMartin Matuska 2243*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_active_entries, 2244*e2df9bb4SMartin Matuska avl_numnodes(&ddt->ddt_log_active->ddl_tree)); 2245*e2df9bb4SMartin Matuska 2246*e2df9bb4SMartin Matuska /* 2247*e2df9bb4SMartin Matuska * Sync the stats for the store objects. Even though we haven't 2248*e2df9bb4SMartin Matuska * modified anything on those objects, they're no longer the 2249*e2df9bb4SMartin Matuska * source of truth for entries that are now in the log, and we 2250*e2df9bb4SMartin Matuska * need the on-disk counts to reflect that, otherwise we'll 2251*e2df9bb4SMartin Matuska * miscount later when importing. 2252*e2df9bb4SMartin Matuska */ 2253*e2df9bb4SMartin Matuska for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 2254*e2df9bb4SMartin Matuska for (ddt_class_t class = 0; 2255*e2df9bb4SMartin Matuska class < DDT_CLASSES; class++) { 2256*e2df9bb4SMartin Matuska if (ddt_object_exists(ddt, type, class)) 2257*e2df9bb4SMartin Matuska ddt_object_sync(ddt, type, class, tx); 2258*e2df9bb4SMartin Matuska } 2259*e2df9bb4SMartin Matuska } 2260*e2df9bb4SMartin Matuska 2261*e2df9bb4SMartin Matuska memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 2262*e2df9bb4SMartin Matuska sizeof (ddt->ddt_histogram)); 2263*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dspace = ~0ULL; 2264*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dsize = ~0ULL; 2265*e2df9bb4SMartin Matuska } 2266*e2df9bb4SMartin Matuska 2267*e2df9bb4SMartin Matuska if (spa_sync_pass(ddt->ddt_spa) == 1) { 2268*e2df9bb4SMartin Matuska /* 2269*e2df9bb4SMartin Matuska * Update ingest rate. This is an exponential weighted moving 2270*e2df9bb4SMartin Matuska * average of the number of entries changed over recent txgs. 
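 * (Because of the spa_sync_pass() == 1 check above, the sample is taken at
 * most once per txg, so any later sync passes of the same txg don't inflate
 * the average.)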
2271*e2df9bb4SMartin Matuska * The ramp-up cost shouldn't matter too much because the 2272*e2df9bb4SMartin Matuska * flusher will be trying to take at least the minimum anyway. 2273*e2df9bb4SMartin Matuska */ 2274*e2df9bb4SMartin Matuska ddt->ddt_log_ingest_rate = _ewma( 2275*e2df9bb4SMartin Matuska count, ddt->ddt_log_ingest_rate, 2276*e2df9bb4SMartin Matuska zfs_dedup_log_flush_flow_rate_txgs); 2277*e2df9bb4SMartin Matuska DDT_KSTAT_SET(ddt, dds_log_ingest_rate, 2278*e2df9bb4SMartin Matuska ddt->ddt_log_ingest_rate); 2279eda14cbcSMatt Macy } 2280eda14cbcSMatt Macy } 2281eda14cbcSMatt Macy 2282eda14cbcSMatt Macy static void 2283*e2df9bb4SMartin Matuska ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) 2284eda14cbcSMatt Macy { 2285eda14cbcSMatt Macy if (avl_numnodes(&ddt->ddt_tree) == 0) 2286eda14cbcSMatt Macy return; 2287eda14cbcSMatt Macy 2288*e2df9bb4SMartin Matuska ddt_entry_t *dde; 2289*e2df9bb4SMartin Matuska void *cookie = NULL; 2290*e2df9bb4SMartin Matuska while ((dde = avl_destroy_nodes( 2291*e2df9bb4SMartin Matuska &ddt->ddt_tree, &cookie)) != NULL) { 2292*e2df9bb4SMartin Matuska ASSERT(dde->dde_flags & DDE_FLAG_LOADED); 2293*e2df9bb4SMartin Matuska 2294*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe; 2295*e2df9bb4SMartin Matuska DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); 2296*e2df9bb4SMartin Matuska ddt_sync_flush_entry(ddt, &ddlwe, 2297*e2df9bb4SMartin Matuska dde->dde_type, dde->dde_class, tx); 2298*e2df9bb4SMartin Matuska ddt_sync_scan_entry(ddt, &ddlwe, tx); 2299*e2df9bb4SMartin Matuska ddt_free(ddt, dde); 2300*e2df9bb4SMartin Matuska } 2301*e2df9bb4SMartin Matuska 2302*e2df9bb4SMartin Matuska memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 2303*e2df9bb4SMartin Matuska sizeof (ddt->ddt_histogram)); 2304*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dspace = ~0ULL; 2305*e2df9bb4SMartin Matuska ddt->ddt_spa->spa_dedup_dsize = ~0ULL; 2306*e2df9bb4SMartin Matuska ddt_sync_update_stats(ddt, tx); 2307*e2df9bb4SMartin Matuska } 2308*e2df9bb4SMartin Matuska 2309*e2df9bb4SMartin Matuska static void 2310*e2df9bb4SMartin Matuska ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) 2311*e2df9bb4SMartin Matuska { 2312*e2df9bb4SMartin Matuska spa_t *spa = ddt->ddt_spa; 2313*e2df9bb4SMartin Matuska 2314*e2df9bb4SMartin Matuska if (ddt->ddt_version == UINT64_MAX) 2315*e2df9bb4SMartin Matuska return; 2316*e2df9bb4SMartin Matuska 2317*e2df9bb4SMartin Matuska if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { 2318*e2df9bb4SMartin Matuska ASSERT0(avl_numnodes(&ddt->ddt_tree)); 2319*e2df9bb4SMartin Matuska return; 2320*e2df9bb4SMartin Matuska } 2321eda14cbcSMatt Macy 2322eda14cbcSMatt Macy if (spa->spa_ddt_stat_object == 0) { 2323eda14cbcSMatt Macy spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, 2324eda14cbcSMatt Macy DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, 2325eda14cbcSMatt Macy DMU_POOL_DDT_STATS, tx); 2326eda14cbcSMatt Macy } 2327eda14cbcSMatt Macy 2328*e2df9bb4SMartin Matuska if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) 2329*e2df9bb4SMartin Matuska ddt_create_dir(ddt, tx); 2330eda14cbcSMatt Macy 2331*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_LOG) 2332*e2df9bb4SMartin Matuska ddt_sync_table_log(ddt, tx); 2333*e2df9bb4SMartin Matuska else 2334*e2df9bb4SMartin Matuska ddt_sync_table_flush(ddt, tx); 2335eda14cbcSMatt Macy } 2336eda14cbcSMatt Macy 2337eda14cbcSMatt Macy void 2338eda14cbcSMatt Macy ddt_sync(spa_t *spa, uint64_t txg) 2339eda14cbcSMatt Macy { 2340eda14cbcSMatt Macy dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 2341eda14cbcSMatt 
Macy dmu_tx_t *tx; 2342eda14cbcSMatt Macy zio_t *rio; 2343eda14cbcSMatt Macy 23444fefe1b7SMartin Matuska ASSERT3U(spa_syncing_txg(spa), ==, txg); 2345eda14cbcSMatt Macy 2346eda14cbcSMatt Macy tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2347eda14cbcSMatt Macy 2348eda14cbcSMatt Macy rio = zio_root(spa, NULL, NULL, 2349eda14cbcSMatt Macy ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); 2350eda14cbcSMatt Macy 2351eda14cbcSMatt Macy /* 2352eda14cbcSMatt Macy * This function may cause an immediate scan of ddt blocks (see 2353eda14cbcSMatt Macy * the comment above dsl_scan_ddt() for details). We set the 2354eda14cbcSMatt Macy * scan's root zio here so that we can wait for any scan IOs in 2355eda14cbcSMatt Macy * addition to the regular ddt IOs. 2356eda14cbcSMatt Macy */ 2357eda14cbcSMatt Macy ASSERT3P(scn->scn_zio_root, ==, NULL); 2358eda14cbcSMatt Macy scn->scn_zio_root = rio; 2359eda14cbcSMatt Macy 2360eda14cbcSMatt Macy for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 2361eda14cbcSMatt Macy ddt_t *ddt = spa->spa_ddt[c]; 2362eda14cbcSMatt Macy if (ddt == NULL) 2363eda14cbcSMatt Macy continue; 2364*e2df9bb4SMartin Matuska ddt_sync_table(ddt, tx); 2365*e2df9bb4SMartin Matuska if (ddt->ddt_flags & DDT_FLAG_LOG) 2366*e2df9bb4SMartin Matuska ddt_sync_flush_log(ddt, tx); 2367eda14cbcSMatt Macy ddt_repair_table(ddt, rio); 2368eda14cbcSMatt Macy } 2369eda14cbcSMatt Macy 2370eda14cbcSMatt Macy (void) zio_wait(rio); 2371eda14cbcSMatt Macy scn->scn_zio_root = NULL; 2372eda14cbcSMatt Macy 2373eda14cbcSMatt Macy dmu_tx_commit(tx); 2374eda14cbcSMatt Macy } 2375eda14cbcSMatt Macy 2376*e2df9bb4SMartin Matuska void 2377*e2df9bb4SMartin Matuska ddt_walk_init(spa_t *spa, uint64_t txg) 2378*e2df9bb4SMartin Matuska { 2379*e2df9bb4SMartin Matuska if (txg == 0) 2380*e2df9bb4SMartin Matuska txg = spa_syncing_txg(spa); 2381*e2df9bb4SMartin Matuska 2382*e2df9bb4SMartin Matuska for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 2383*e2df9bb4SMartin Matuska ddt_t *ddt = spa->spa_ddt[c]; 2384*e2df9bb4SMartin Matuska if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) 2385*e2df9bb4SMartin Matuska continue; 2386*e2df9bb4SMartin Matuska 2387*e2df9bb4SMartin Matuska ddt_enter(ddt); 2388*e2df9bb4SMartin Matuska ddt_flush_force_update_txg(ddt, txg); 2389*e2df9bb4SMartin Matuska ddt_exit(ddt); 2390*e2df9bb4SMartin Matuska } 2391*e2df9bb4SMartin Matuska } 2392*e2df9bb4SMartin Matuska 2393*e2df9bb4SMartin Matuska boolean_t 2394*e2df9bb4SMartin Matuska ddt_walk_ready(spa_t *spa) 2395*e2df9bb4SMartin Matuska { 2396*e2df9bb4SMartin Matuska for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 2397*e2df9bb4SMartin Matuska ddt_t *ddt = spa->spa_ddt[c]; 2398*e2df9bb4SMartin Matuska if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) 2399*e2df9bb4SMartin Matuska continue; 2400*e2df9bb4SMartin Matuska 2401*e2df9bb4SMartin Matuska if (ddt->ddt_flush_force_txg > 0) 2402*e2df9bb4SMartin Matuska return (B_FALSE); 2403*e2df9bb4SMartin Matuska } 2404*e2df9bb4SMartin Matuska 2405*e2df9bb4SMartin Matuska return (B_TRUE); 2406*e2df9bb4SMartin Matuska } 2407*e2df9bb4SMartin Matuska 2408*e2df9bb4SMartin Matuska static int 2409*e2df9bb4SMartin Matuska ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, 2410*e2df9bb4SMartin Matuska uint64_t flags, boolean_t wait) 2411eda14cbcSMatt Macy { 2412eda14cbcSMatt Macy do { 2413eda14cbcSMatt Macy do { 2414eda14cbcSMatt Macy do { 2415eda14cbcSMatt Macy ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; 24164fefe1b7SMartin 
Matuska if (ddt == NULL) 24174fefe1b7SMartin Matuska continue; 2418*e2df9bb4SMartin Matuska 2419*e2df9bb4SMartin Matuska if (flags != 0 && 2420*e2df9bb4SMartin Matuska (ddt->ddt_flags & flags) != flags) 2421*e2df9bb4SMartin Matuska continue; 2422*e2df9bb4SMartin Matuska 2423*e2df9bb4SMartin Matuska if (wait && ddt->ddt_flush_force_txg > 0) 2424*e2df9bb4SMartin Matuska return (EAGAIN); 2425*e2df9bb4SMartin Matuska 2426eda14cbcSMatt Macy int error = ENOENT; 2427eda14cbcSMatt Macy if (ddt_object_exists(ddt, ddb->ddb_type, 2428eda14cbcSMatt Macy ddb->ddb_class)) { 2429eda14cbcSMatt Macy error = ddt_object_walk(ddt, 2430eda14cbcSMatt Macy ddb->ddb_type, ddb->ddb_class, 2431*e2df9bb4SMartin Matuska &ddb->ddb_cursor, ddlwe); 2432eda14cbcSMatt Macy } 2433eda14cbcSMatt Macy if (error == 0) 2434eda14cbcSMatt Macy return (0); 2435eda14cbcSMatt Macy if (error != ENOENT) 2436eda14cbcSMatt Macy return (error); 2437eda14cbcSMatt Macy ddb->ddb_cursor = 0; 2438eda14cbcSMatt Macy } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); 2439eda14cbcSMatt Macy ddb->ddb_checksum = 0; 2440eda14cbcSMatt Macy } while (++ddb->ddb_type < DDT_TYPES); 2441eda14cbcSMatt Macy ddb->ddb_type = 0; 2442eda14cbcSMatt Macy } while (++ddb->ddb_class < DDT_CLASSES); 2443eda14cbcSMatt Macy 2444eda14cbcSMatt Macy return (SET_ERROR(ENOENT)); 2445eda14cbcSMatt Macy } 2446eda14cbcSMatt Macy 2447*e2df9bb4SMartin Matuska int 2448*e2df9bb4SMartin Matuska ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) 2449*e2df9bb4SMartin Matuska { 2450*e2df9bb4SMartin Matuska return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); 2451*e2df9bb4SMartin Matuska } 2452*e2df9bb4SMartin Matuska 24532a58b312SMartin Matuska /* 24542a58b312SMartin Matuska * This function is used by Block Cloning (brt.c) to increase reference 24552a58b312SMartin Matuska * counter for the DDT entry if the block is already in DDT. 24562a58b312SMartin Matuska * 24572a58b312SMartin Matuska * Return false if the block, despite having the D bit set, is not present 2458*e2df9bb4SMartin Matuska * in the DDT. This is possible when the DDT has been pruned by an admin 2459*e2df9bb4SMartin Matuska * or by the DDT quota mechanism. 24602a58b312SMartin Matuska */ 24612a58b312SMartin Matuska boolean_t 24622a58b312SMartin Matuska ddt_addref(spa_t *spa, const blkptr_t *bp) 24632a58b312SMartin Matuska { 24642a58b312SMartin Matuska ddt_t *ddt; 24652a58b312SMartin Matuska ddt_entry_t *dde; 24662a58b312SMartin Matuska boolean_t result; 24672a58b312SMartin Matuska 24682a58b312SMartin Matuska spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 24692a58b312SMartin Matuska ddt = ddt_select(spa, bp); 24702a58b312SMartin Matuska ddt_enter(ddt); 24712a58b312SMartin Matuska 2472*e2df9bb4SMartin Matuska dde = ddt_lookup(ddt, bp); 2473ce4dcb97SMartin Matuska 2474ce4dcb97SMartin Matuska /* Can be NULL if the entry for this block was pruned. */ 2475ce4dcb97SMartin Matuska if (dde == NULL) { 2476ce4dcb97SMartin Matuska ddt_exit(ddt); 2477ce4dcb97SMartin Matuska spa_config_exit(spa, SCL_ZIO, FTAG); 2478ce4dcb97SMartin Matuska return (B_FALSE); 2479ce4dcb97SMartin Matuska } 24802a58b312SMartin Matuska 2481*e2df9bb4SMartin Matuska if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { 24820a97523dSMartin Matuska /* 2483*e2df9bb4SMartin Matuska * This entry was either synced to a store object (dde_type is 2484*e2df9bb4SMartin Matuska * real) or was logged. It must be properly on disk at this 2485*e2df9bb4SMartin Matuska * point, so we can just bump its refcount. 
24860a97523dSMartin Matuska */ 2487*e2df9bb4SMartin Matuska int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); 2488*e2df9bb4SMartin Matuska ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); 24890a97523dSMartin Matuska 2490*e2df9bb4SMartin Matuska ddt_phys_addref(dde->dde_phys, v); 24912a58b312SMartin Matuska result = B_TRUE; 24922a58b312SMartin Matuska } else { 24932a58b312SMartin Matuska /* 2494*e2df9bb4SMartin Matuska * If the block has the DEDUP flag set it still might not 2495*e2df9bb4SMartin Matuska * exist in the DEDUP table due to DDT pruning of entries 2496*e2df9bb4SMartin Matuska * where refcnt=1. 24972a58b312SMartin Matuska */ 24982a58b312SMartin Matuska ddt_remove(ddt, dde); 24992a58b312SMartin Matuska result = B_FALSE; 25002a58b312SMartin Matuska } 25012a58b312SMartin Matuska 25022a58b312SMartin Matuska ddt_exit(ddt); 25032a58b312SMartin Matuska spa_config_exit(spa, SCL_ZIO, FTAG); 25042a58b312SMartin Matuska 25052a58b312SMartin Matuska return (result); 25062a58b312SMartin Matuska } 25072a58b312SMartin Matuska 2508*e2df9bb4SMartin Matuska typedef struct ddt_prune_entry { 2509*e2df9bb4SMartin Matuska ddt_t *dpe_ddt; 2510*e2df9bb4SMartin Matuska ddt_key_t dpe_key; 2511*e2df9bb4SMartin Matuska list_node_t dpe_node; 2512*e2df9bb4SMartin Matuska ddt_univ_phys_t dpe_phys[]; 2513*e2df9bb4SMartin Matuska } ddt_prune_entry_t; 2514*e2df9bb4SMartin Matuska 2515*e2df9bb4SMartin Matuska typedef struct ddt_prune_info { 2516*e2df9bb4SMartin Matuska spa_t *dpi_spa; 2517*e2df9bb4SMartin Matuska uint64_t dpi_txg_syncs; 2518*e2df9bb4SMartin Matuska uint64_t dpi_pruned; 2519*e2df9bb4SMartin Matuska list_t dpi_candidates; 2520*e2df9bb4SMartin Matuska } ddt_prune_info_t; 2521*e2df9bb4SMartin Matuska 2522*e2df9bb4SMartin Matuska /* 2523*e2df9bb4SMartin Matuska * Add prune candidates for ddt_sync during spa_sync 2524*e2df9bb4SMartin Matuska */ 2525*e2df9bb4SMartin Matuska static void 2526*e2df9bb4SMartin Matuska prune_candidates_sync(void *arg, dmu_tx_t *tx) 2527*e2df9bb4SMartin Matuska { 2528*e2df9bb4SMartin Matuska (void) tx; 2529*e2df9bb4SMartin Matuska ddt_prune_info_t *dpi = arg; 2530*e2df9bb4SMartin Matuska ddt_prune_entry_t *dpe; 2531*e2df9bb4SMartin Matuska 2532*e2df9bb4SMartin Matuska spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); 2533*e2df9bb4SMartin Matuska 2534*e2df9bb4SMartin Matuska /* Process the prune candidates collected so far */ 2535*e2df9bb4SMartin Matuska while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { 2536*e2df9bb4SMartin Matuska blkptr_t blk; 2537*e2df9bb4SMartin Matuska ddt_t *ddt = dpe->dpe_ddt; 2538*e2df9bb4SMartin Matuska 2539*e2df9bb4SMartin Matuska ddt_enter(ddt); 2540*e2df9bb4SMartin Matuska 2541*e2df9bb4SMartin Matuska /* 2542*e2df9bb4SMartin Matuska * If it's on the live list, then it was loaded for update 2543*e2df9bb4SMartin Matuska * this txg and is no longer stale; skip it. 
2544*e2df9bb4SMartin Matuska */ 2545*e2df9bb4SMartin Matuska if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { 2546*e2df9bb4SMartin Matuska ddt_exit(ddt); 2547*e2df9bb4SMartin Matuska kmem_free(dpe, sizeof (*dpe)); 2548*e2df9bb4SMartin Matuska continue; 2549*e2df9bb4SMartin Matuska } 2550*e2df9bb4SMartin Matuska 2551*e2df9bb4SMartin Matuska ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, 2552*e2df9bb4SMartin Matuska dpe->dpe_phys, DDT_PHYS_FLAT, &blk); 2553*e2df9bb4SMartin Matuska 2554*e2df9bb4SMartin Matuska ddt_entry_t *dde = ddt_lookup(ddt, &blk); 2555*e2df9bb4SMartin Matuska if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { 2556*e2df9bb4SMartin Matuska ASSERT(dde->dde_flags & DDE_FLAG_LOADED); 2557*e2df9bb4SMartin Matuska /* 2558*e2df9bb4SMartin Matuska * Zero the physical, so we don't try to free DVAs 2559*e2df9bb4SMartin Matuska * at flush nor try to reuse this entry. 2560*e2df9bb4SMartin Matuska */ 2561*e2df9bb4SMartin Matuska ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); 2562*e2df9bb4SMartin Matuska 2563*e2df9bb4SMartin Matuska dpi->dpi_pruned++; 2564*e2df9bb4SMartin Matuska } 2565*e2df9bb4SMartin Matuska 2566*e2df9bb4SMartin Matuska ddt_exit(ddt); 2567*e2df9bb4SMartin Matuska kmem_free(dpe, sizeof (*dpe)); 2568*e2df9bb4SMartin Matuska } 2569*e2df9bb4SMartin Matuska 2570*e2df9bb4SMartin Matuska spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); 2571*e2df9bb4SMartin Matuska dpi->dpi_txg_syncs++; 2572*e2df9bb4SMartin Matuska } 2573*e2df9bb4SMartin Matuska 2574*e2df9bb4SMartin Matuska /* 2575*e2df9bb4SMartin Matuska * Prune candidates are collected in open context and processed 2576*e2df9bb4SMartin Matuska * in sync context as part of ddt_sync_table(). 2577*e2df9bb4SMartin Matuska */ 2578*e2df9bb4SMartin Matuska static void 2579*e2df9bb4SMartin Matuska ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, 2580*e2df9bb4SMartin Matuska const ddt_univ_phys_t *ddp) 2581*e2df9bb4SMartin Matuska { 2582*e2df9bb4SMartin Matuska ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); 2583*e2df9bb4SMartin Matuska 2584*e2df9bb4SMartin Matuska size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; 2585*e2df9bb4SMartin Matuska ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); 2586*e2df9bb4SMartin Matuska 2587*e2df9bb4SMartin Matuska dpe->dpe_ddt = ddt; 2588*e2df9bb4SMartin Matuska dpe->dpe_key = *ddk; 2589*e2df9bb4SMartin Matuska memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); 2590*e2df9bb4SMartin Matuska list_insert_head(list, dpe); 2591*e2df9bb4SMartin Matuska } 2592*e2df9bb4SMartin Matuska 2593*e2df9bb4SMartin Matuska /* 2594*e2df9bb4SMartin Matuska * Iterate over all the entries in the DDT unique class.
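 * (For reference below: ages are measured in hours since each entry's
 * ddp_class_start time, and the histogram buckets them by power of two, so
 * a hypothetical entry written ~10 days ago has age 240 hours and lands in
 * bin MIN(highbit64(240) - 1, HIST_BINS - 1), normally bin 7.)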
2595*e2df9bb4SMartin Matuska * The walk will perform one of the following operations: 2596*e2df9bb4SMartin Matuska * (a) build a histogram that can be used when pruning 2597*e2df9bb4SMartin Matuska * (b) prune entries older than the cutoff 2598*e2df9bb4SMartin Matuska * 2599*e2df9bb4SMartin Matuska * Also called by zdb(8) to dump the age histogram 2600*e2df9bb4SMartin Matuska */ 2601*e2df9bb4SMartin Matuska void 2602*e2df9bb4SMartin Matuska ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) 2603*e2df9bb4SMartin Matuska { 2604*e2df9bb4SMartin Matuska ddt_bookmark_t ddb = { 2605*e2df9bb4SMartin Matuska .ddb_class = DDT_CLASS_UNIQUE, 2606*e2df9bb4SMartin Matuska .ddb_type = 0, 2607*e2df9bb4SMartin Matuska .ddb_checksum = 0, 2608*e2df9bb4SMartin Matuska .ddb_cursor = 0 2609*e2df9bb4SMartin Matuska }; 2610*e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe = {0}; 2611*e2df9bb4SMartin Matuska int error; 2612*e2df9bb4SMartin Matuska int valid = 0; 2613*e2df9bb4SMartin Matuska int candidates = 0; 2614*e2df9bb4SMartin Matuska uint64_t now = gethrestime_sec(); 2615*e2df9bb4SMartin Matuska ddt_prune_info_t dpi; 2616*e2df9bb4SMartin Matuska boolean_t pruning = (cutoff != 0); 2617*e2df9bb4SMartin Matuska 2618*e2df9bb4SMartin Matuska if (pruning) { 2619*e2df9bb4SMartin Matuska dpi.dpi_txg_syncs = 0; 2620*e2df9bb4SMartin Matuska dpi.dpi_pruned = 0; 2621*e2df9bb4SMartin Matuska dpi.dpi_spa = spa; 2622*e2df9bb4SMartin Matuska list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), 2623*e2df9bb4SMartin Matuska offsetof(ddt_prune_entry_t, dpe_node)); 2624*e2df9bb4SMartin Matuska } 2625*e2df9bb4SMartin Matuska 2626*e2df9bb4SMartin Matuska if (histogram != NULL) 2627*e2df9bb4SMartin Matuska memset(histogram, 0, sizeof (ddt_age_histo_t)); 2628*e2df9bb4SMartin Matuska 2629*e2df9bb4SMartin Matuska while ((error = 2630*e2df9bb4SMartin Matuska ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { 2631*e2df9bb4SMartin Matuska ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; 2632*e2df9bb4SMartin Matuska VERIFY(ddt); 2633*e2df9bb4SMartin Matuska 2634*e2df9bb4SMartin Matuska if (spa_shutting_down(spa) || issig()) 2635*e2df9bb4SMartin Matuska break; 2636*e2df9bb4SMartin Matuska 2637*e2df9bb4SMartin Matuska ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); 2638*e2df9bb4SMartin Matuska ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); 2639*e2df9bb4SMartin Matuska 2640*e2df9bb4SMartin Matuska uint64_t class_start = 2641*e2df9bb4SMartin Matuska ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; 2642*e2df9bb4SMartin Matuska 2643*e2df9bb4SMartin Matuska /* 2644*e2df9bb4SMartin Matuska * If this entry is on the log, then the stored entry is stale 2645*e2df9bb4SMartin Matuska * and we should skip it.
2646*e2df9bb4SMartin Matuska */ 2647*e2df9bb4SMartin Matuska if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) 2648*e2df9bb4SMartin Matuska continue; 2649*e2df9bb4SMartin Matuska 2650*e2df9bb4SMartin Matuska /* prune older entries */ 2651*e2df9bb4SMartin Matuska if (pruning && class_start < cutoff) { 2652*e2df9bb4SMartin Matuska if (candidates++ >= zfs_ddt_prunes_per_txg) { 2653*e2df9bb4SMartin Matuska /* sync prune candidates in batches */ 2654*e2df9bb4SMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), 2655*e2df9bb4SMartin Matuska NULL, prune_candidates_sync, 2656*e2df9bb4SMartin Matuska &dpi, 0, ZFS_SPACE_CHECK_NONE)); 2657*e2df9bb4SMartin Matuska candidates = 1; 2658*e2df9bb4SMartin Matuska } 2659*e2df9bb4SMartin Matuska ddt_prune_entry(&dpi.dpi_candidates, ddt, 2660*e2df9bb4SMartin Matuska &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); 2661*e2df9bb4SMartin Matuska } 2662*e2df9bb4SMartin Matuska 2663*e2df9bb4SMartin Matuska /* build a histogram */ 2664*e2df9bb4SMartin Matuska if (histogram != NULL) { 2665*e2df9bb4SMartin Matuska uint64_t age = MAX(1, (now - class_start) / 3600); 2666*e2df9bb4SMartin Matuska int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); 2667*e2df9bb4SMartin Matuska histogram->dah_entries++; 2668*e2df9bb4SMartin Matuska histogram->dah_age_histo[bin]++; 2669*e2df9bb4SMartin Matuska } 2670*e2df9bb4SMartin Matuska 2671*e2df9bb4SMartin Matuska valid++; 2672*e2df9bb4SMartin Matuska } 2673*e2df9bb4SMartin Matuska 2674*e2df9bb4SMartin Matuska if (pruning && valid > 0) { 2675*e2df9bb4SMartin Matuska if (!list_is_empty(&dpi.dpi_candidates)) { 2676*e2df9bb4SMartin Matuska /* sync out final batch of prune candidates */ 2677*e2df9bb4SMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2678*e2df9bb4SMartin Matuska prune_candidates_sync, &dpi, 0, 2679*e2df9bb4SMartin Matuska ZFS_SPACE_CHECK_NONE)); 2680*e2df9bb4SMartin Matuska } 2681*e2df9bb4SMartin Matuska list_destroy(&dpi.dpi_candidates); 2682*e2df9bb4SMartin Matuska 2683*e2df9bb4SMartin Matuska zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", 2684*e2df9bb4SMartin Matuska (u_longlong_t)dpi.dpi_pruned, 2685*e2df9bb4SMartin Matuska (int)((dpi.dpi_pruned * 100) / valid), 2686*e2df9bb4SMartin Matuska (u_longlong_t)dpi.dpi_txg_syncs); 2687*e2df9bb4SMartin Matuska } 2688*e2df9bb4SMartin Matuska } 2689*e2df9bb4SMartin Matuska 2690*e2df9bb4SMartin Matuska static uint64_t 2691*e2df9bb4SMartin Matuska ddt_total_entries(spa_t *spa) 2692*e2df9bb4SMartin Matuska { 2693*e2df9bb4SMartin Matuska ddt_object_t ddo; 2694*e2df9bb4SMartin Matuska ddt_get_dedup_object_stats(spa, &ddo); 2695*e2df9bb4SMartin Matuska 2696*e2df9bb4SMartin Matuska return (ddo.ddo_count); 2697*e2df9bb4SMartin Matuska } 2698*e2df9bb4SMartin Matuska 2699*e2df9bb4SMartin Matuska int 2700*e2df9bb4SMartin Matuska ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, 2701*e2df9bb4SMartin Matuska uint64_t amount) 2702*e2df9bb4SMartin Matuska { 2703*e2df9bb4SMartin Matuska uint64_t cutoff; 2704*e2df9bb4SMartin Matuska uint64_t start_time = gethrtime(); 2705*e2df9bb4SMartin Matuska 2706*e2df9bb4SMartin Matuska if (spa->spa_active_ddt_prune) 2707*e2df9bb4SMartin Matuska return (SET_ERROR(EALREADY)); 2708*e2df9bb4SMartin Matuska if (ddt_total_entries(spa) == 0) 2709*e2df9bb4SMartin Matuska return (0); 2710*e2df9bb4SMartin Matuska 2711*e2df9bb4SMartin Matuska spa->spa_active_ddt_prune = B_TRUE; 2712*e2df9bb4SMartin Matuska 2713*e2df9bb4SMartin Matuska zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, 2714*e2df9bb4SMartin Matuska unit == 
ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older"); 2715*e2df9bb4SMartin Matuska 2716*e2df9bb4SMartin Matuska if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { 2717*e2df9bb4SMartin Matuska ddt_age_histo_t histogram; 2718*e2df9bb4SMartin Matuska uint64_t oldest = 0; 2719*e2df9bb4SMartin Matuska 2720*e2df9bb4SMartin Matuska /* Make a pass over DDT to build a histogram */ 2721*e2df9bb4SMartin Matuska ddt_prune_walk(spa, 0, &histogram); 2722*e2df9bb4SMartin Matuska 2723*e2df9bb4SMartin Matuska int target = (histogram.dah_entries * amount) / 100; 2724*e2df9bb4SMartin Matuska 2725*e2df9bb4SMartin Matuska /* 2726*e2df9bb4SMartin Matuska * Figure out our cutoff date 2727*e2df9bb4SMartin Matuska * (i.e., which bins to prune from) 2728*e2df9bb4SMartin Matuska */ 2729*e2df9bb4SMartin Matuska for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { 2730*e2df9bb4SMartin Matuska if (histogram.dah_age_histo[i] != 0) { 2731*e2df9bb4SMartin Matuska /* less than this bucket remaining */ 2732*e2df9bb4SMartin Matuska if (target < histogram.dah_age_histo[i]) { 2733*e2df9bb4SMartin Matuska oldest = MAX(1, (1<<i) * 3600); 2734*e2df9bb4SMartin Matuska target = 0; 2735*e2df9bb4SMartin Matuska } else { 2736*e2df9bb4SMartin Matuska target -= histogram.dah_age_histo[i]; 2737*e2df9bb4SMartin Matuska } 2738*e2df9bb4SMartin Matuska } 2739*e2df9bb4SMartin Matuska } 2740*e2df9bb4SMartin Matuska cutoff = gethrestime_sec() - oldest; 2741*e2df9bb4SMartin Matuska 2742*e2df9bb4SMartin Matuska if (ddt_dump_prune_histogram) 2743*e2df9bb4SMartin Matuska ddt_dump_age_histogram(&histogram, cutoff); 2744*e2df9bb4SMartin Matuska } else if (unit == ZPOOL_DDT_PRUNE_AGE) { 2745*e2df9bb4SMartin Matuska cutoff = gethrestime_sec() - amount; 2746*e2df9bb4SMartin Matuska } else { 2747*e2df9bb4SMartin Matuska return (EINVAL); 2748*e2df9bb4SMartin Matuska } 2749*e2df9bb4SMartin Matuska 2750*e2df9bb4SMartin Matuska if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) { 2751*e2df9bb4SMartin Matuska /* Traverse DDT to prune entries older than our cutoff */ 2752*e2df9bb4SMartin Matuska ddt_prune_walk(spa, cutoff, NULL); 2753*e2df9bb4SMartin Matuska } 2754*e2df9bb4SMartin Matuska 2755*e2df9bb4SMartin Matuska zfs_dbgmsg("%s: prune completed in %llu ms", 2756*e2df9bb4SMartin Matuska spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); 2757*e2df9bb4SMartin Matuska 2758*e2df9bb4SMartin Matuska spa->spa_active_ddt_prune = B_FALSE; 2759*e2df9bb4SMartin Matuska return (0); 2760*e2df9bb4SMartin Matuska } 2761*e2df9bb4SMartin Matuska 2762eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, 2763eda14cbcSMatt Macy "Enable prefetching dedup-ed blks"); 2764*e2df9bb4SMartin Matuska 2765*e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, 2766*e2df9bb4SMartin Matuska "Max number of incremental dedup log flush passes per transaction"); 2767*e2df9bb4SMartin Matuska 2768*e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, 2769*e2df9bb4SMartin Matuska "Min time to spend on incremental dedup log flush each transaction"); 2770*e2df9bb4SMartin Matuska 2771*e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, 2772*e2df9bb4SMartin Matuska "Min number of log entries to flush each transaction"); 2773*e2df9bb4SMartin Matuska 2774*e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, 2775*e2df9bb4SMartin Matuska "Number of txgs to 
average flow rates across"); 2776
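/*
 * For example (Linux builds; illustrative only): the tunables above appear
 * under /sys/module/zfs/parameters/ with the zfs_dedup_ prefix, e.g.
 * zfs_dedup_log_flush_passes_max, and since they are ZMOD_RW they can be
 * read or adjusted at runtime like any other ZFS module parameter.
 */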