/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

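/*
 * Each DDT keeps a pair of on-disk log objects, mirrored by in-memory AVL
 * trees: an "active" log that new and updated entries are appended to every
 * txg, and a "flushing" log whose entries are being written back out into
 * the main DDT storage. When the active log grows too large or too old
 * (see ddt_log_swap()), the two are swapped and the previously-active log
 * becomes the one being flushed.
 */
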
/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define	DDT_LOG_ENTRY_FLAT_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_LOG_ENTRY_SIZE(ddt) \
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}

void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

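/*
 * Allocate one on-disk log object, link it into the DDT directory ZAP under
 * its per-checksum name, and write out its initial header.
 */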
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT3U(ddl->ddl_object, ==, 0);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more than are on disk, and even if the same
	 * entry is on both append and flush trees, because that's more what
	 * the user expects to see. This does mean the on-disk size is not
	 * really correlated with the number of entries, but I don't think
	 * that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
	nblocks = doi.doi_physical_blocks_512;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
	nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

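/*
 * Begin an append to the active log for this txg: compute the record size
 * and take hold of the dnode and enough buffers to fit nentries records.
 */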
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT3P(dlu->dlu_dbp, ==, NULL);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}

static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

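/*
 * Add one entry to the in-memory tree for the active log and write the
 * corresponding record into the buffers prepared by ddt_log_begin().
 */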
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

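/*
 * Finish the append started by ddt_log_begin(): close out the last block,
 * release the buffers and dnode, and update the stored header and stats.
 */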
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

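/*
 * Remove the first (lowest-keyed) entry from the log tree and return it in
 * ddlwe. Returns B_FALSE if the tree is empty.
 */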
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle =
	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
	if (!ddle)
		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
	if (!ddle)
		return (B_FALSE);
	if (ddlwe)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

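/*
 * Check the swap policy and, if it is time, exchange the active and flushing
 * logs. Returns B_TRUE if the swap happened.
 */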
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}

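/*
 * Reconstruct an in-memory log tree entry from an on-disk record, skipping
 * anything at or before the flushing checkpoint.
 */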
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe);
}

static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
	void *cookie = NULL;
	ddt_log_entry_t *ddle;
	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
	while ((ddle =
	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
	}
	ASSERT(avl_is_empty(&ddl->ddl_tree));
}

static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

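	/* Reject headers from an unknown or newer log format version. */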
	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

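	/* The header flags record which role this log was playing. */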
	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries that
	 *   appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees. This
	 *   happens when ddt_lookup() rehydrates an entry from the flushing
	 *   tree, as ddt_log_remove_key() removes the entry from the in-memory
	 *   tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);

				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}

void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT3P(ddt->ddt_log_active, ==, NULL);
	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

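/* Tear down the in-memory log trees, freeing any remaining entries. */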
void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");