1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23eda14cbcSMatt Macy * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 24eda14cbcSMatt Macy * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 25eda14cbcSMatt Macy */ 26eda14cbcSMatt Macy 27eda14cbcSMatt Macy #include <sys/dmu.h> 28eda14cbcSMatt Macy #include <sys/zap.h> 29eda14cbcSMatt Macy #include <sys/zfs_context.h> 30eda14cbcSMatt Macy #include <sys/dsl_pool.h> 31eda14cbcSMatt Macy #include <sys/dsl_dataset.h> 32eda14cbcSMatt Macy 33eda14cbcSMatt Macy /* 34eda14cbcSMatt Macy * Deadlist concurrency: 35eda14cbcSMatt Macy * 36eda14cbcSMatt Macy * Deadlists can only be modified from the syncing thread. 37eda14cbcSMatt Macy * 38eda14cbcSMatt Macy * Except for dsl_deadlist_insert(), it can only be modified with the 39eda14cbcSMatt Macy * dp_config_rwlock held with RW_WRITER. 40eda14cbcSMatt Macy * 41eda14cbcSMatt Macy * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 42eda14cbcSMatt Macy * be called concurrently, from open context, with the dl_config_rwlock held 43eda14cbcSMatt Macy * with RW_READER. 44eda14cbcSMatt Macy * 45eda14cbcSMatt Macy * Therefore, we only need to provide locking between dsl_deadlist_insert() and 46eda14cbcSMatt Macy * the accessors, protecting: 47eda14cbcSMatt Macy * dl_phys->dl_used,comp,uncomp 48eda14cbcSMatt Macy * and protecting the dl_tree from being loaded. 49eda14cbcSMatt Macy * The locking is provided by dl_lock. Note that locking on the bpobj_t 50eda14cbcSMatt Macy * provides its own locking, and dl_oldfmt is immutable. 51eda14cbcSMatt Macy */ 52eda14cbcSMatt Macy 53eda14cbcSMatt Macy /* 54eda14cbcSMatt Macy * Livelist Overview 55eda14cbcSMatt Macy * ================ 56eda14cbcSMatt Macy * 57eda14cbcSMatt Macy * Livelists use the same 'deadlist_t' struct as deadlists and are also used 58eda14cbcSMatt Macy * to track blkptrs over the lifetime of a dataset. Livelists however, belong 59eda14cbcSMatt Macy * to clones and track the blkptrs that are clone-specific (were born after 60eda14cbcSMatt Macy * the clone's creation). The exception is embedded block pointers which are 61eda14cbcSMatt Macy * not included in livelists because they do not need to be freed. 62eda14cbcSMatt Macy * 63eda14cbcSMatt Macy * When it comes time to delete the clone, the livelist provides a quick 64eda14cbcSMatt Macy * reference as to what needs to be freed. For this reason, livelists also track 65eda14cbcSMatt Macy * when clone-specific blkptrs are freed before deletion to prevent double 66eda14cbcSMatt Macy * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the 67eda14cbcSMatt Macy * deletion algorithm iterates backwards over the livelist, matching 68eda14cbcSMatt Macy * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists 69eda14cbcSMatt Macy * are also updated in the case when blkptrs are remapped: the old version 70eda14cbcSMatt Macy * of the blkptr is cancelled out with a FREE and the new version is tracked 71eda14cbcSMatt Macy * with an ALLOC. 72eda14cbcSMatt Macy * 73eda14cbcSMatt Macy * To bound the amount of memory required for deletion, livelists over a 74eda14cbcSMatt Macy * certain size are spread over multiple entries. Entries are grouped by 75eda14cbcSMatt Macy * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will 76eda14cbcSMatt Macy * be in the same entry. This allows us to delete livelists incrementally 77eda14cbcSMatt Macy * over multiple syncs, one entry at a time. 78eda14cbcSMatt Macy * 79eda14cbcSMatt Macy * During the lifetime of the clone, livelists can get extremely large. 80eda14cbcSMatt Macy * Their size is managed by periodic condensing (preemptively cancelling out 81eda14cbcSMatt Macy * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when 82eda14cbcSMatt Macy * the shared space between the clone and its origin is so small that it 83eda14cbcSMatt Macy * doesn't make sense to use livelists anymore. 84eda14cbcSMatt Macy */ 85eda14cbcSMatt Macy 86eda14cbcSMatt Macy /* 87eda14cbcSMatt Macy * The threshold sublist size at which we create a new sub-livelist for the 88eda14cbcSMatt Macy * next txg. However, since blkptrs of the same transaction group must be in 89eda14cbcSMatt Macy * the same sub-list, the actual sublist size may exceed this. When picking the 90eda14cbcSMatt Macy * size we had to balance the fact that larger sublists mean fewer sublists 91eda14cbcSMatt Macy * (decreasing the cost of insertion) against the consideration that sublists 92eda14cbcSMatt Macy * will be loaded into memory and shouldn't take up an inordinate amount of 93eda14cbcSMatt Macy * space. We settled on ~500000 entries, corresponding to roughly 128M. 94eda14cbcSMatt Macy */ 95dbd5678dSMartin Matuska uint64_t zfs_livelist_max_entries = 500000; 96eda14cbcSMatt Macy 97eda14cbcSMatt Macy /* 98eda14cbcSMatt Macy * We can approximate how much of a performance gain a livelist will give us 99eda14cbcSMatt Macy * based on the percentage of blocks shared between the clone and its origin. 100eda14cbcSMatt Macy * 0 percent shared means that the clone has completely diverged and that the 101eda14cbcSMatt Macy * old method is maximally effective: every read from the block tree will 102eda14cbcSMatt Macy * result in lots of frees. Livelists give us gains when they track blocks 103eda14cbcSMatt Macy * scattered across the tree, when one read in the old method might only 104eda14cbcSMatt Macy * result in a few frees. Once the clone has been overwritten enough, 105eda14cbcSMatt Macy * writes are no longer sparse and we'll no longer get much of a benefit from 106eda14cbcSMatt Macy * tracking them with a livelist. We chose a lower limit of 75 percent shared 107eda14cbcSMatt Macy * (25 percent overwritten). This means that 1/4 of all block pointers will be 108eda14cbcSMatt Macy * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists 109eda14cbcSMatt Macy * to make deletion 4x faster. Once the amount of shared space drops below this 110eda14cbcSMatt Macy * threshold, the clone will revert to the old deletion method. 111eda14cbcSMatt Macy */ 112eda14cbcSMatt Macy int zfs_livelist_min_percent_shared = 75; 113eda14cbcSMatt Macy 114eda14cbcSMatt Macy static int 115eda14cbcSMatt Macy dsl_deadlist_compare(const void *arg1, const void *arg2) 116eda14cbcSMatt Macy { 117eda14cbcSMatt Macy const dsl_deadlist_entry_t *dle1 = arg1; 118eda14cbcSMatt Macy const dsl_deadlist_entry_t *dle2 = arg2; 119eda14cbcSMatt Macy 120eda14cbcSMatt Macy return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); 121eda14cbcSMatt Macy } 122eda14cbcSMatt Macy 123eda14cbcSMatt Macy static int 124eda14cbcSMatt Macy dsl_deadlist_cache_compare(const void *arg1, const void *arg2) 125eda14cbcSMatt Macy { 126eda14cbcSMatt Macy const dsl_deadlist_cache_entry_t *dlce1 = arg1; 127eda14cbcSMatt Macy const dsl_deadlist_cache_entry_t *dlce2 = arg2; 128eda14cbcSMatt Macy 129eda14cbcSMatt Macy return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); 130eda14cbcSMatt Macy } 131eda14cbcSMatt Macy 132eda14cbcSMatt Macy static void 133eda14cbcSMatt Macy dsl_deadlist_load_tree(dsl_deadlist_t *dl) 134eda14cbcSMatt Macy { 135eda14cbcSMatt Macy zap_cursor_t zc; 1367a7741afSMartin Matuska zap_attribute_t *za; 137eda14cbcSMatt Macy int error; 138eda14cbcSMatt Macy 139eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&dl->dl_lock)); 140eda14cbcSMatt Macy 141eda14cbcSMatt Macy ASSERT(!dl->dl_oldfmt); 142eda14cbcSMatt Macy if (dl->dl_havecache) { 143eda14cbcSMatt Macy /* 144eda14cbcSMatt Macy * After loading the tree, the caller may modify the tree, 145eda14cbcSMatt Macy * e.g. to add or remove nodes, or to make a node no longer 146eda14cbcSMatt Macy * refer to the empty_bpobj. These changes would make the 147eda14cbcSMatt Macy * dl_cache incorrect. Therefore we discard the cache here, 148eda14cbcSMatt Macy * so that it can't become incorrect. 149eda14cbcSMatt Macy */ 150eda14cbcSMatt Macy dsl_deadlist_cache_entry_t *dlce; 151eda14cbcSMatt Macy void *cookie = NULL; 152eda14cbcSMatt Macy while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) 153eda14cbcSMatt Macy != NULL) { 154eda14cbcSMatt Macy kmem_free(dlce, sizeof (*dlce)); 155eda14cbcSMatt Macy } 156eda14cbcSMatt Macy avl_destroy(&dl->dl_cache); 157eda14cbcSMatt Macy dl->dl_havecache = B_FALSE; 158eda14cbcSMatt Macy } 159eda14cbcSMatt Macy if (dl->dl_havetree) 160eda14cbcSMatt Macy return; 161eda14cbcSMatt Macy 1627a7741afSMartin Matuska za = zap_attribute_alloc(); 163eda14cbcSMatt Macy avl_create(&dl->dl_tree, dsl_deadlist_compare, 164eda14cbcSMatt Macy sizeof (dsl_deadlist_entry_t), 165eda14cbcSMatt Macy offsetof(dsl_deadlist_entry_t, dle_node)); 166eda14cbcSMatt Macy for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 1677a7741afSMartin Matuska (error = zap_cursor_retrieve(&zc, za)) == 0; 168eda14cbcSMatt Macy zap_cursor_advance(&zc)) { 169eda14cbcSMatt Macy dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 1707a7741afSMartin Matuska dle->dle_mintxg = zfs_strtonum(za->za_name, NULL); 171eda14cbcSMatt Macy 172eda14cbcSMatt Macy /* 173eda14cbcSMatt Macy * Prefetch all the bpobj's so that we do that i/o 174eda14cbcSMatt Macy * in parallel. Then open them all in a second pass. 175eda14cbcSMatt Macy */ 1767a7741afSMartin Matuska dle->dle_bpobj.bpo_object = za->za_first_integer; 177315ee00fSMartin Matuska dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, 178315ee00fSMartin Matuska ZIO_PRIORITY_SYNC_READ); 179eda14cbcSMatt Macy 180eda14cbcSMatt Macy avl_add(&dl->dl_tree, dle); 181eda14cbcSMatt Macy } 182eda14cbcSMatt Macy VERIFY3U(error, ==, ENOENT); 183eda14cbcSMatt Macy zap_cursor_fini(&zc); 1847a7741afSMartin Matuska zap_attribute_free(za); 185eda14cbcSMatt Macy 186eda14cbcSMatt Macy for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); 187eda14cbcSMatt Macy dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { 188eda14cbcSMatt Macy VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, 189eda14cbcSMatt Macy dle->dle_bpobj.bpo_object)); 190eda14cbcSMatt Macy } 191eda14cbcSMatt Macy dl->dl_havetree = B_TRUE; 192eda14cbcSMatt Macy } 193eda14cbcSMatt Macy 194eda14cbcSMatt Macy /* 195eda14cbcSMatt Macy * Load only the non-empty bpobj's into the dl_cache. The cache is an analog 196eda14cbcSMatt Macy * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It 197eda14cbcSMatt Macy * is used only for gathering space statistics. The dl_cache has two 198eda14cbcSMatt Macy * advantages over the dl_tree: 199eda14cbcSMatt Macy * 200eda14cbcSMatt Macy * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's 201eda14cbcSMatt Macy * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj 202eda14cbcSMatt Macy * many times and to inquire about its (zero) space stats many times. 203eda14cbcSMatt Macy * 204eda14cbcSMatt Macy * 2. The dl_cache uses less memory than the dl_tree. We only need to load 205eda14cbcSMatt Macy * the dl_tree of snapshots when deleting a snapshot, after which we free the 206eda14cbcSMatt Macy * dl_tree with dsl_deadlist_discard_tree 207eda14cbcSMatt Macy */ 208eda14cbcSMatt Macy static void 209eda14cbcSMatt Macy dsl_deadlist_load_cache(dsl_deadlist_t *dl) 210eda14cbcSMatt Macy { 211eda14cbcSMatt Macy zap_cursor_t zc; 2127a7741afSMartin Matuska zap_attribute_t *za; 213eda14cbcSMatt Macy int error; 214eda14cbcSMatt Macy 215eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&dl->dl_lock)); 216eda14cbcSMatt Macy 217eda14cbcSMatt Macy ASSERT(!dl->dl_oldfmt); 218eda14cbcSMatt Macy if (dl->dl_havecache) 219eda14cbcSMatt Macy return; 220eda14cbcSMatt Macy 221eda14cbcSMatt Macy uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; 222eda14cbcSMatt Macy 223eda14cbcSMatt Macy avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, 224eda14cbcSMatt Macy sizeof (dsl_deadlist_cache_entry_t), 225eda14cbcSMatt Macy offsetof(dsl_deadlist_cache_entry_t, dlce_node)); 2267a7741afSMartin Matuska za = zap_attribute_alloc(); 227eda14cbcSMatt Macy for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 2287a7741afSMartin Matuska (error = zap_cursor_retrieve(&zc, za)) == 0; 229eda14cbcSMatt Macy zap_cursor_advance(&zc)) { 2307a7741afSMartin Matuska if (za->za_first_integer == empty_bpobj) 231eda14cbcSMatt Macy continue; 232eda14cbcSMatt Macy dsl_deadlist_cache_entry_t *dlce = 233eda14cbcSMatt Macy kmem_zalloc(sizeof (*dlce), KM_SLEEP); 2347a7741afSMartin Matuska dlce->dlce_mintxg = zfs_strtonum(za->za_name, NULL); 235eda14cbcSMatt Macy 236eda14cbcSMatt Macy /* 237eda14cbcSMatt Macy * Prefetch all the bpobj's so that we do that i/o 238eda14cbcSMatt Macy * in parallel. Then open them all in a second pass. 239eda14cbcSMatt Macy */ 2407a7741afSMartin Matuska dlce->dlce_bpobj = za->za_first_integer; 241315ee00fSMartin Matuska dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, 242315ee00fSMartin Matuska ZIO_PRIORITY_SYNC_READ); 243eda14cbcSMatt Macy avl_add(&dl->dl_cache, dlce); 244eda14cbcSMatt Macy } 245eda14cbcSMatt Macy VERIFY3U(error, ==, ENOENT); 246eda14cbcSMatt Macy zap_cursor_fini(&zc); 2477a7741afSMartin Matuska zap_attribute_free(za); 248eda14cbcSMatt Macy 249eda14cbcSMatt Macy for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); 250eda14cbcSMatt Macy dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { 251eda14cbcSMatt Macy bpobj_t bpo; 252eda14cbcSMatt Macy VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); 253eda14cbcSMatt Macy 254eda14cbcSMatt Macy VERIFY0(bpobj_space(&bpo, 255eda14cbcSMatt Macy &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); 256eda14cbcSMatt Macy bpobj_close(&bpo); 257eda14cbcSMatt Macy } 258eda14cbcSMatt Macy dl->dl_havecache = B_TRUE; 259eda14cbcSMatt Macy } 260eda14cbcSMatt Macy 261eda14cbcSMatt Macy /* 262eda14cbcSMatt Macy * Discard the tree to save memory. 263eda14cbcSMatt Macy */ 264eda14cbcSMatt Macy void 265eda14cbcSMatt Macy dsl_deadlist_discard_tree(dsl_deadlist_t *dl) 266eda14cbcSMatt Macy { 267eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 268eda14cbcSMatt Macy 269eda14cbcSMatt Macy if (!dl->dl_havetree) { 270eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 271eda14cbcSMatt Macy return; 272eda14cbcSMatt Macy } 273eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 274eda14cbcSMatt Macy void *cookie = NULL; 275eda14cbcSMatt Macy while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { 276eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 277eda14cbcSMatt Macy kmem_free(dle, sizeof (*dle)); 278eda14cbcSMatt Macy } 279eda14cbcSMatt Macy avl_destroy(&dl->dl_tree); 280eda14cbcSMatt Macy 281eda14cbcSMatt Macy dl->dl_havetree = B_FALSE; 282eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 283eda14cbcSMatt Macy } 284eda14cbcSMatt Macy 285eda14cbcSMatt Macy void 286eda14cbcSMatt Macy dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) 287eda14cbcSMatt Macy { 288eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 289eda14cbcSMatt Macy 290eda14cbcSMatt Macy ASSERT(dsl_deadlist_is_open(dl)); 291eda14cbcSMatt Macy 292eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 293eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 294eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 295eda14cbcSMatt Macy for (dle = avl_first(&dl->dl_tree); dle != NULL; 296eda14cbcSMatt Macy dle = AVL_NEXT(&dl->dl_tree, dle)) { 297eda14cbcSMatt Macy if (func(args, dle) != 0) 298eda14cbcSMatt Macy break; 299eda14cbcSMatt Macy } 300eda14cbcSMatt Macy } 301eda14cbcSMatt Macy 302*17aab35aSMartin Matuska int 303eda14cbcSMatt Macy dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 304eda14cbcSMatt Macy { 305eda14cbcSMatt Macy dmu_object_info_t doi; 306*17aab35aSMartin Matuska int err; 307eda14cbcSMatt Macy 308eda14cbcSMatt Macy ASSERT(!dsl_deadlist_is_open(dl)); 309eda14cbcSMatt Macy 310eda14cbcSMatt Macy mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 311eda14cbcSMatt Macy dl->dl_os = os; 312eda14cbcSMatt Macy dl->dl_object = object; 313*17aab35aSMartin Matuska err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf); 314*17aab35aSMartin Matuska if (err != 0) 315*17aab35aSMartin Matuska return (err); 316eda14cbcSMatt Macy dmu_object_info_from_db(dl->dl_dbuf, &doi); 317eda14cbcSMatt Macy if (doi.doi_type == DMU_OT_BPOBJ) { 318eda14cbcSMatt Macy dmu_buf_rele(dl->dl_dbuf, dl); 319eda14cbcSMatt Macy dl->dl_dbuf = NULL; 320eda14cbcSMatt Macy dl->dl_oldfmt = B_TRUE; 321*17aab35aSMartin Matuska return (bpobj_open(&dl->dl_bpobj, os, object)); 322eda14cbcSMatt Macy } 323eda14cbcSMatt Macy 324eda14cbcSMatt Macy dl->dl_oldfmt = B_FALSE; 325eda14cbcSMatt Macy dl->dl_phys = dl->dl_dbuf->db_data; 326eda14cbcSMatt Macy dl->dl_havetree = B_FALSE; 327eda14cbcSMatt Macy dl->dl_havecache = B_FALSE; 328*17aab35aSMartin Matuska return (0); 329eda14cbcSMatt Macy } 330eda14cbcSMatt Macy 331eda14cbcSMatt Macy boolean_t 332eda14cbcSMatt Macy dsl_deadlist_is_open(dsl_deadlist_t *dl) 333eda14cbcSMatt Macy { 334eda14cbcSMatt Macy return (dl->dl_os != NULL); 335eda14cbcSMatt Macy } 336eda14cbcSMatt Macy 337eda14cbcSMatt Macy void 338eda14cbcSMatt Macy dsl_deadlist_close(dsl_deadlist_t *dl) 339eda14cbcSMatt Macy { 340eda14cbcSMatt Macy ASSERT(dsl_deadlist_is_open(dl)); 341eda14cbcSMatt Macy mutex_destroy(&dl->dl_lock); 342eda14cbcSMatt Macy 343eda14cbcSMatt Macy if (dl->dl_oldfmt) { 344eda14cbcSMatt Macy dl->dl_oldfmt = B_FALSE; 345eda14cbcSMatt Macy bpobj_close(&dl->dl_bpobj); 346eda14cbcSMatt Macy dl->dl_os = NULL; 347eda14cbcSMatt Macy dl->dl_object = 0; 348eda14cbcSMatt Macy return; 349eda14cbcSMatt Macy } 350eda14cbcSMatt Macy 351eda14cbcSMatt Macy if (dl->dl_havetree) { 352eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 353eda14cbcSMatt Macy void *cookie = NULL; 354eda14cbcSMatt Macy while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 355eda14cbcSMatt Macy != NULL) { 356eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 357eda14cbcSMatt Macy kmem_free(dle, sizeof (*dle)); 358eda14cbcSMatt Macy } 359eda14cbcSMatt Macy avl_destroy(&dl->dl_tree); 360eda14cbcSMatt Macy } 361eda14cbcSMatt Macy if (dl->dl_havecache) { 362eda14cbcSMatt Macy dsl_deadlist_cache_entry_t *dlce; 363eda14cbcSMatt Macy void *cookie = NULL; 364eda14cbcSMatt Macy while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) 365eda14cbcSMatt Macy != NULL) { 366eda14cbcSMatt Macy kmem_free(dlce, sizeof (*dlce)); 367eda14cbcSMatt Macy } 368eda14cbcSMatt Macy avl_destroy(&dl->dl_cache); 369eda14cbcSMatt Macy } 370eda14cbcSMatt Macy dmu_buf_rele(dl->dl_dbuf, dl); 371eda14cbcSMatt Macy dl->dl_dbuf = NULL; 372eda14cbcSMatt Macy dl->dl_phys = NULL; 373eda14cbcSMatt Macy dl->dl_os = NULL; 374eda14cbcSMatt Macy dl->dl_object = 0; 375eda14cbcSMatt Macy } 376eda14cbcSMatt Macy 377eda14cbcSMatt Macy uint64_t 378eda14cbcSMatt Macy dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 379eda14cbcSMatt Macy { 380eda14cbcSMatt Macy if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 381eda14cbcSMatt Macy return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 382eda14cbcSMatt Macy return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 383eda14cbcSMatt Macy sizeof (dsl_deadlist_phys_t), tx)); 384eda14cbcSMatt Macy } 385eda14cbcSMatt Macy 386eda14cbcSMatt Macy void 387eda14cbcSMatt Macy dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 388eda14cbcSMatt Macy { 389eda14cbcSMatt Macy dmu_object_info_t doi; 390eda14cbcSMatt Macy zap_cursor_t zc; 3917a7741afSMartin Matuska zap_attribute_t *za; 392eda14cbcSMatt Macy int error; 393eda14cbcSMatt Macy 394eda14cbcSMatt Macy VERIFY0(dmu_object_info(os, dlobj, &doi)); 395eda14cbcSMatt Macy if (doi.doi_type == DMU_OT_BPOBJ) { 396eda14cbcSMatt Macy bpobj_free(os, dlobj, tx); 397eda14cbcSMatt Macy return; 398eda14cbcSMatt Macy } 399eda14cbcSMatt Macy 4007a7741afSMartin Matuska za = zap_attribute_alloc(); 401eda14cbcSMatt Macy for (zap_cursor_init(&zc, os, dlobj); 4027a7741afSMartin Matuska (error = zap_cursor_retrieve(&zc, za)) == 0; 403eda14cbcSMatt Macy zap_cursor_advance(&zc)) { 4047a7741afSMartin Matuska uint64_t obj = za->za_first_integer; 405eda14cbcSMatt Macy if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 406eda14cbcSMatt Macy bpobj_decr_empty(os, tx); 407eda14cbcSMatt Macy else 408eda14cbcSMatt Macy bpobj_free(os, obj, tx); 409eda14cbcSMatt Macy } 410eda14cbcSMatt Macy VERIFY3U(error, ==, ENOENT); 411eda14cbcSMatt Macy zap_cursor_fini(&zc); 4127a7741afSMartin Matuska zap_attribute_free(za); 413eda14cbcSMatt Macy VERIFY0(dmu_object_free(os, dlobj, tx)); 414eda14cbcSMatt Macy } 415eda14cbcSMatt Macy 416eda14cbcSMatt Macy static void 417eda14cbcSMatt Macy dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 418eda14cbcSMatt Macy const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) 419eda14cbcSMatt Macy { 420eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&dl->dl_lock)); 421eda14cbcSMatt Macy if (dle->dle_bpobj.bpo_object == 422eda14cbcSMatt Macy dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 423eda14cbcSMatt Macy uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 424eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 425eda14cbcSMatt Macy bpobj_decr_empty(dl->dl_os, tx); 426eda14cbcSMatt Macy VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 427eda14cbcSMatt Macy VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, 428eda14cbcSMatt Macy dle->dle_mintxg, obj, tx)); 429eda14cbcSMatt Macy } 430eda14cbcSMatt Macy bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); 431eda14cbcSMatt Macy } 432eda14cbcSMatt Macy 433eda14cbcSMatt Macy static void 434eda14cbcSMatt Macy dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 435eda14cbcSMatt Macy uint64_t obj, dmu_tx_t *tx) 436eda14cbcSMatt Macy { 437eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&dl->dl_lock)); 438eda14cbcSMatt Macy if (dle->dle_bpobj.bpo_object != 439eda14cbcSMatt Macy dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 440eda14cbcSMatt Macy bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 441eda14cbcSMatt Macy } else { 442eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 443eda14cbcSMatt Macy bpobj_decr_empty(dl->dl_os, tx); 444eda14cbcSMatt Macy VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 445eda14cbcSMatt Macy VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, 446eda14cbcSMatt Macy dle->dle_mintxg, obj, tx)); 447eda14cbcSMatt Macy } 448eda14cbcSMatt Macy } 449eda14cbcSMatt Macy 450c9539b89SMartin Matuska /* 451c9539b89SMartin Matuska * Prefetch metadata required for dle_enqueue_subobj(). 452c9539b89SMartin Matuska */ 453c9539b89SMartin Matuska static void 454c9539b89SMartin Matuska dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 455c9539b89SMartin Matuska uint64_t obj) 456c9539b89SMartin Matuska { 457c9539b89SMartin Matuska if (dle->dle_bpobj.bpo_object != 458c9539b89SMartin Matuska dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) 459c9539b89SMartin Matuska bpobj_prefetch_subobj(&dle->dle_bpobj, obj); 460c9539b89SMartin Matuska } 461c9539b89SMartin Matuska 462eda14cbcSMatt Macy void 463eda14cbcSMatt Macy dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, 464eda14cbcSMatt Macy dmu_tx_t *tx) 465eda14cbcSMatt Macy { 466eda14cbcSMatt Macy dsl_deadlist_entry_t dle_tofind; 467eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 468eda14cbcSMatt Macy avl_index_t where; 469eda14cbcSMatt Macy 470eda14cbcSMatt Macy if (dl->dl_oldfmt) { 471eda14cbcSMatt Macy bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); 472eda14cbcSMatt Macy return; 473eda14cbcSMatt Macy } 474eda14cbcSMatt Macy 475eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 476eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 477eda14cbcSMatt Macy 478eda14cbcSMatt Macy dmu_buf_will_dirty(dl->dl_dbuf, tx); 479eda14cbcSMatt Macy 480eda14cbcSMatt Macy int sign = bp_freed ? -1 : +1; 481eda14cbcSMatt Macy dl->dl_phys->dl_used += 482eda14cbcSMatt Macy sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 483eda14cbcSMatt Macy dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); 484eda14cbcSMatt Macy dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); 485eda14cbcSMatt Macy 486783d3ff6SMartin Matuska dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); 487eda14cbcSMatt Macy dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 488eda14cbcSMatt Macy if (dle == NULL) 489eda14cbcSMatt Macy dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 490eda14cbcSMatt Macy else 491eda14cbcSMatt Macy dle = AVL_PREV(&dl->dl_tree, dle); 492eda14cbcSMatt Macy 493eda14cbcSMatt Macy if (dle == NULL) { 494eda14cbcSMatt Macy zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", 495783d3ff6SMartin Matuska bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); 496eda14cbcSMatt Macy dle = avl_first(&dl->dl_tree); 497eda14cbcSMatt Macy } 498eda14cbcSMatt Macy 499eda14cbcSMatt Macy ASSERT3P(dle, !=, NULL); 500eda14cbcSMatt Macy dle_enqueue(dl, dle, bp, bp_freed, tx); 501eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 502eda14cbcSMatt Macy } 503eda14cbcSMatt Macy 504eda14cbcSMatt Macy int 505eda14cbcSMatt Macy dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 506eda14cbcSMatt Macy { 507eda14cbcSMatt Macy dsl_deadlist_t *dl = arg; 508eda14cbcSMatt Macy dsl_deadlist_insert(dl, bp, B_FALSE, tx); 509eda14cbcSMatt Macy return (0); 510eda14cbcSMatt Macy } 511eda14cbcSMatt Macy 512eda14cbcSMatt Macy int 513eda14cbcSMatt Macy dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 514eda14cbcSMatt Macy { 515eda14cbcSMatt Macy dsl_deadlist_t *dl = arg; 516eda14cbcSMatt Macy dsl_deadlist_insert(dl, bp, B_TRUE, tx); 517eda14cbcSMatt Macy return (0); 518eda14cbcSMatt Macy } 519eda14cbcSMatt Macy 520eda14cbcSMatt Macy /* 521eda14cbcSMatt Macy * Insert new key in deadlist, which must be > all current entries. 522eda14cbcSMatt Macy * mintxg is not inclusive. 523eda14cbcSMatt Macy */ 524eda14cbcSMatt Macy void 525eda14cbcSMatt Macy dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 526eda14cbcSMatt Macy { 527eda14cbcSMatt Macy uint64_t obj; 528eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 529eda14cbcSMatt Macy 530eda14cbcSMatt Macy if (dl->dl_oldfmt) 531eda14cbcSMatt Macy return; 532eda14cbcSMatt Macy 533eda14cbcSMatt Macy dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 534eda14cbcSMatt Macy dle->dle_mintxg = mintxg; 535eda14cbcSMatt Macy 536eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 537eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 538eda14cbcSMatt Macy 539eda14cbcSMatt Macy obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 540eda14cbcSMatt Macy VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 541eda14cbcSMatt Macy avl_add(&dl->dl_tree, dle); 542eda14cbcSMatt Macy 543eda14cbcSMatt Macy VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, 544eda14cbcSMatt Macy mintxg, obj, tx)); 545eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 546eda14cbcSMatt Macy } 547eda14cbcSMatt Macy 548eda14cbcSMatt Macy /* 549eda14cbcSMatt Macy * Remove this key, merging its entries into the previous key. 550eda14cbcSMatt Macy */ 551eda14cbcSMatt Macy void 552eda14cbcSMatt Macy dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 553eda14cbcSMatt Macy { 554eda14cbcSMatt Macy dsl_deadlist_entry_t dle_tofind; 555eda14cbcSMatt Macy dsl_deadlist_entry_t *dle, *dle_prev; 556eda14cbcSMatt Macy 557eda14cbcSMatt Macy if (dl->dl_oldfmt) 558eda14cbcSMatt Macy return; 559eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 560eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 561eda14cbcSMatt Macy 562eda14cbcSMatt Macy dle_tofind.dle_mintxg = mintxg; 563eda14cbcSMatt Macy dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 564eda14cbcSMatt Macy ASSERT3P(dle, !=, NULL); 565eda14cbcSMatt Macy dle_prev = AVL_PREV(&dl->dl_tree, dle); 566dbd5678dSMartin Matuska ASSERT3P(dle_prev, !=, NULL); 567eda14cbcSMatt Macy 568eda14cbcSMatt Macy dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 569eda14cbcSMatt Macy 570eda14cbcSMatt Macy avl_remove(&dl->dl_tree, dle); 571eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 572eda14cbcSMatt Macy kmem_free(dle, sizeof (*dle)); 573eda14cbcSMatt Macy 574eda14cbcSMatt Macy VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 575eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 576eda14cbcSMatt Macy } 577eda14cbcSMatt Macy 578eda14cbcSMatt Macy /* 579eda14cbcSMatt Macy * Remove a deadlist entry and all of its contents by removing the entry from 580eda14cbcSMatt Macy * the deadlist's avl tree, freeing the entry's bpobj and adjusting the 581eda14cbcSMatt Macy * deadlist's space accounting accordingly. 582eda14cbcSMatt Macy */ 583eda14cbcSMatt Macy void 584eda14cbcSMatt Macy dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 585eda14cbcSMatt Macy { 586eda14cbcSMatt Macy uint64_t used, comp, uncomp; 587eda14cbcSMatt Macy dsl_deadlist_entry_t dle_tofind; 588eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 589eda14cbcSMatt Macy objset_t *os = dl->dl_os; 590eda14cbcSMatt Macy 591eda14cbcSMatt Macy if (dl->dl_oldfmt) 592eda14cbcSMatt Macy return; 593eda14cbcSMatt Macy 594eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 595eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 596eda14cbcSMatt Macy 597eda14cbcSMatt Macy dle_tofind.dle_mintxg = mintxg; 598eda14cbcSMatt Macy dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 599eda14cbcSMatt Macy VERIFY3P(dle, !=, NULL); 600eda14cbcSMatt Macy 601eda14cbcSMatt Macy avl_remove(&dl->dl_tree, dle); 602eda14cbcSMatt Macy VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); 603eda14cbcSMatt Macy VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); 604eda14cbcSMatt Macy dmu_buf_will_dirty(dl->dl_dbuf, tx); 605eda14cbcSMatt Macy dl->dl_phys->dl_used -= used; 606eda14cbcSMatt Macy dl->dl_phys->dl_comp -= comp; 607eda14cbcSMatt Macy dl->dl_phys->dl_uncomp -= uncomp; 608eda14cbcSMatt Macy if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { 609eda14cbcSMatt Macy bpobj_decr_empty(os, tx); 610eda14cbcSMatt Macy } else { 611eda14cbcSMatt Macy bpobj_free(os, dle->dle_bpobj.bpo_object, tx); 612eda14cbcSMatt Macy } 613eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 614eda14cbcSMatt Macy kmem_free(dle, sizeof (*dle)); 615eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 616eda14cbcSMatt Macy } 617eda14cbcSMatt Macy 618eda14cbcSMatt Macy /* 619eda14cbcSMatt Macy * Clear out the contents of a deadlist_entry by freeing its bpobj, 620eda14cbcSMatt Macy * replacing it with an empty bpobj and adjusting the deadlist's 621eda14cbcSMatt Macy * space accounting 622eda14cbcSMatt Macy */ 623eda14cbcSMatt Macy void 624eda14cbcSMatt Macy dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, 625eda14cbcSMatt Macy dmu_tx_t *tx) 626eda14cbcSMatt Macy { 627eda14cbcSMatt Macy uint64_t new_obj, used, comp, uncomp; 628eda14cbcSMatt Macy objset_t *os = dl->dl_os; 629eda14cbcSMatt Macy 630eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 631eda14cbcSMatt Macy VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); 632eda14cbcSMatt Macy VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); 633eda14cbcSMatt Macy dmu_buf_will_dirty(dl->dl_dbuf, tx); 634eda14cbcSMatt Macy dl->dl_phys->dl_used -= used; 635eda14cbcSMatt Macy dl->dl_phys->dl_comp -= comp; 636eda14cbcSMatt Macy dl->dl_phys->dl_uncomp -= uncomp; 637eda14cbcSMatt Macy if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) 638eda14cbcSMatt Macy bpobj_decr_empty(os, tx); 639eda14cbcSMatt Macy else 640eda14cbcSMatt Macy bpobj_free(os, dle->dle_bpobj.bpo_object, tx); 641eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 642eda14cbcSMatt Macy new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); 643eda14cbcSMatt Macy VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); 644eda14cbcSMatt Macy VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, 645eda14cbcSMatt Macy new_obj, tx)); 646eda14cbcSMatt Macy ASSERT(bpobj_is_empty(&dle->dle_bpobj)); 647eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 648eda14cbcSMatt Macy } 649eda14cbcSMatt Macy 650eda14cbcSMatt Macy /* 651eda14cbcSMatt Macy * Return the first entry in deadlist's avl tree 652eda14cbcSMatt Macy */ 653eda14cbcSMatt Macy dsl_deadlist_entry_t * 654eda14cbcSMatt Macy dsl_deadlist_first(dsl_deadlist_t *dl) 655eda14cbcSMatt Macy { 656eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 657eda14cbcSMatt Macy 658eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 659eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 660eda14cbcSMatt Macy dle = avl_first(&dl->dl_tree); 661eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 662eda14cbcSMatt Macy 663eda14cbcSMatt Macy return (dle); 664eda14cbcSMatt Macy } 665eda14cbcSMatt Macy 666eda14cbcSMatt Macy /* 667eda14cbcSMatt Macy * Return the last entry in deadlist's avl tree 668eda14cbcSMatt Macy */ 669eda14cbcSMatt Macy dsl_deadlist_entry_t * 670eda14cbcSMatt Macy dsl_deadlist_last(dsl_deadlist_t *dl) 671eda14cbcSMatt Macy { 672eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 673eda14cbcSMatt Macy 674eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 675eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 676eda14cbcSMatt Macy dle = avl_last(&dl->dl_tree); 677eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 678eda14cbcSMatt Macy 679eda14cbcSMatt Macy return (dle); 680eda14cbcSMatt Macy } 681eda14cbcSMatt Macy 682eda14cbcSMatt Macy /* 683eda14cbcSMatt Macy * Walk ds's snapshots to regenerate generate ZAP & AVL. 684eda14cbcSMatt Macy */ 685eda14cbcSMatt Macy static void 686eda14cbcSMatt Macy dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 687eda14cbcSMatt Macy uint64_t mrs_obj, dmu_tx_t *tx) 688eda14cbcSMatt Macy { 689eda14cbcSMatt Macy dsl_deadlist_t dl = { 0 }; 690eda14cbcSMatt Macy dsl_pool_t *dp = dmu_objset_pool(os); 691eda14cbcSMatt Macy 692*17aab35aSMartin Matuska VERIFY0(dsl_deadlist_open(&dl, os, dlobj)); 693eda14cbcSMatt Macy if (dl.dl_oldfmt) { 694eda14cbcSMatt Macy dsl_deadlist_close(&dl); 695eda14cbcSMatt Macy return; 696eda14cbcSMatt Macy } 697eda14cbcSMatt Macy 698eda14cbcSMatt Macy while (mrs_obj != 0) { 699eda14cbcSMatt Macy dsl_dataset_t *ds; 700eda14cbcSMatt Macy VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 701eda14cbcSMatt Macy dsl_deadlist_add_key(&dl, 702eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); 703eda14cbcSMatt Macy mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 704eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG); 705eda14cbcSMatt Macy } 706eda14cbcSMatt Macy dsl_deadlist_close(&dl); 707eda14cbcSMatt Macy } 708eda14cbcSMatt Macy 709eda14cbcSMatt Macy uint64_t 710eda14cbcSMatt Macy dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 711eda14cbcSMatt Macy uint64_t mrs_obj, dmu_tx_t *tx) 712eda14cbcSMatt Macy { 713eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 714eda14cbcSMatt Macy uint64_t newobj; 715eda14cbcSMatt Macy 716eda14cbcSMatt Macy newobj = dsl_deadlist_alloc(dl->dl_os, tx); 717eda14cbcSMatt Macy 718eda14cbcSMatt Macy if (dl->dl_oldfmt) { 719eda14cbcSMatt Macy dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 720eda14cbcSMatt Macy return (newobj); 721eda14cbcSMatt Macy } 722eda14cbcSMatt Macy 723eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 724eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 725eda14cbcSMatt Macy 726eda14cbcSMatt Macy for (dle = avl_first(&dl->dl_tree); dle; 727eda14cbcSMatt Macy dle = AVL_NEXT(&dl->dl_tree, dle)) { 728eda14cbcSMatt Macy uint64_t obj; 729eda14cbcSMatt Macy 730eda14cbcSMatt Macy if (dle->dle_mintxg >= maxtxg) 731eda14cbcSMatt Macy break; 732eda14cbcSMatt Macy 733eda14cbcSMatt Macy obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 734eda14cbcSMatt Macy VERIFY0(zap_add_int_key(dl->dl_os, newobj, 735eda14cbcSMatt Macy dle->dle_mintxg, obj, tx)); 736eda14cbcSMatt Macy } 737eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 738eda14cbcSMatt Macy return (newobj); 739eda14cbcSMatt Macy } 740eda14cbcSMatt Macy 741eda14cbcSMatt Macy void 742eda14cbcSMatt Macy dsl_deadlist_space(dsl_deadlist_t *dl, 743eda14cbcSMatt Macy uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 744eda14cbcSMatt Macy { 745eda14cbcSMatt Macy ASSERT(dsl_deadlist_is_open(dl)); 746eda14cbcSMatt Macy if (dl->dl_oldfmt) { 747eda14cbcSMatt Macy VERIFY0(bpobj_space(&dl->dl_bpobj, 748eda14cbcSMatt Macy usedp, compp, uncompp)); 749eda14cbcSMatt Macy return; 750eda14cbcSMatt Macy } 751eda14cbcSMatt Macy 752eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 753eda14cbcSMatt Macy *usedp = dl->dl_phys->dl_used; 754eda14cbcSMatt Macy *compp = dl->dl_phys->dl_comp; 755eda14cbcSMatt Macy *uncompp = dl->dl_phys->dl_uncomp; 756eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 757eda14cbcSMatt Macy } 758eda14cbcSMatt Macy 759eda14cbcSMatt Macy /* 760eda14cbcSMatt Macy * return space used in the range (mintxg, maxtxg]. 761eda14cbcSMatt Macy * Includes maxtxg, does not include mintxg. 762eda14cbcSMatt Macy * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 763eda14cbcSMatt Macy * UINT64_MAX). 764eda14cbcSMatt Macy */ 765eda14cbcSMatt Macy void 766eda14cbcSMatt Macy dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 767eda14cbcSMatt Macy uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 768eda14cbcSMatt Macy { 769eda14cbcSMatt Macy dsl_deadlist_cache_entry_t *dlce; 770eda14cbcSMatt Macy dsl_deadlist_cache_entry_t dlce_tofind; 771eda14cbcSMatt Macy avl_index_t where; 772eda14cbcSMatt Macy 773eda14cbcSMatt Macy if (dl->dl_oldfmt) { 774eda14cbcSMatt Macy VERIFY0(bpobj_space_range(&dl->dl_bpobj, 775eda14cbcSMatt Macy mintxg, maxtxg, usedp, compp, uncompp)); 776eda14cbcSMatt Macy return; 777eda14cbcSMatt Macy } 778eda14cbcSMatt Macy 779eda14cbcSMatt Macy *usedp = *compp = *uncompp = 0; 780eda14cbcSMatt Macy 781eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 782eda14cbcSMatt Macy dsl_deadlist_load_cache(dl); 783eda14cbcSMatt Macy dlce_tofind.dlce_mintxg = mintxg; 784eda14cbcSMatt Macy dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); 785eda14cbcSMatt Macy 786eda14cbcSMatt Macy /* 787eda14cbcSMatt Macy * If this mintxg doesn't exist, it may be an empty_bpobj which 788eda14cbcSMatt Macy * is omitted from the sparse tree. Start at the next non-empty 789eda14cbcSMatt Macy * entry. 790eda14cbcSMatt Macy */ 791eda14cbcSMatt Macy if (dlce == NULL) 792eda14cbcSMatt Macy dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); 793eda14cbcSMatt Macy 794eda14cbcSMatt Macy for (; dlce && dlce->dlce_mintxg < maxtxg; 795eda14cbcSMatt Macy dlce = AVL_NEXT(&dl->dl_tree, dlce)) { 796eda14cbcSMatt Macy *usedp += dlce->dlce_bytes; 797eda14cbcSMatt Macy *compp += dlce->dlce_comp; 798eda14cbcSMatt Macy *uncompp += dlce->dlce_uncomp; 799eda14cbcSMatt Macy } 800eda14cbcSMatt Macy 801eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 802eda14cbcSMatt Macy } 803eda14cbcSMatt Macy 804eda14cbcSMatt Macy static void 805eda14cbcSMatt Macy dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 806eda14cbcSMatt Macy dmu_tx_t *tx) 807eda14cbcSMatt Macy { 808eda14cbcSMatt Macy dsl_deadlist_entry_t dle_tofind; 809eda14cbcSMatt Macy dsl_deadlist_entry_t *dle; 810eda14cbcSMatt Macy avl_index_t where; 811eda14cbcSMatt Macy uint64_t used, comp, uncomp; 812eda14cbcSMatt Macy bpobj_t bpo; 813eda14cbcSMatt Macy 814eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&dl->dl_lock)); 815eda14cbcSMatt Macy 816eda14cbcSMatt Macy VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); 817eda14cbcSMatt Macy VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); 818eda14cbcSMatt Macy bpobj_close(&bpo); 819eda14cbcSMatt Macy 820eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 821eda14cbcSMatt Macy 822eda14cbcSMatt Macy dmu_buf_will_dirty(dl->dl_dbuf, tx); 823eda14cbcSMatt Macy dl->dl_phys->dl_used += used; 824eda14cbcSMatt Macy dl->dl_phys->dl_comp += comp; 825eda14cbcSMatt Macy dl->dl_phys->dl_uncomp += uncomp; 826eda14cbcSMatt Macy 827eda14cbcSMatt Macy dle_tofind.dle_mintxg = birth; 828eda14cbcSMatt Macy dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 829eda14cbcSMatt Macy if (dle == NULL) 830eda14cbcSMatt Macy dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 831eda14cbcSMatt Macy dle_enqueue_subobj(dl, dle, obj, tx); 832eda14cbcSMatt Macy } 833eda14cbcSMatt Macy 834c9539b89SMartin Matuska /* 835c9539b89SMartin Matuska * Prefetch metadata required for dsl_deadlist_insert_bpobj(). 836c9539b89SMartin Matuska */ 837c9539b89SMartin Matuska static void 838c9539b89SMartin Matuska dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) 839c9539b89SMartin Matuska { 840c9539b89SMartin Matuska dsl_deadlist_entry_t dle_tofind; 841c9539b89SMartin Matuska dsl_deadlist_entry_t *dle; 842c9539b89SMartin Matuska avl_index_t where; 843c9539b89SMartin Matuska 844c9539b89SMartin Matuska ASSERT(MUTEX_HELD(&dl->dl_lock)); 845c9539b89SMartin Matuska 846c9539b89SMartin Matuska dsl_deadlist_load_tree(dl); 847c9539b89SMartin Matuska 848c9539b89SMartin Matuska dle_tofind.dle_mintxg = birth; 849c9539b89SMartin Matuska dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 850c9539b89SMartin Matuska if (dle == NULL) 851c9539b89SMartin Matuska dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 852c9539b89SMartin Matuska dle_prefetch_subobj(dl, dle, obj); 853c9539b89SMartin Matuska } 854c9539b89SMartin Matuska 855eda14cbcSMatt Macy static int 856eda14cbcSMatt Macy dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 857eda14cbcSMatt Macy dmu_tx_t *tx) 858eda14cbcSMatt Macy { 859eda14cbcSMatt Macy dsl_deadlist_t *dl = arg; 860eda14cbcSMatt Macy dsl_deadlist_insert(dl, bp, bp_freed, tx); 861eda14cbcSMatt Macy return (0); 862eda14cbcSMatt Macy } 863eda14cbcSMatt Macy 864eda14cbcSMatt Macy /* 865eda14cbcSMatt Macy * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 866eda14cbcSMatt Macy * an empty deadlist. 867eda14cbcSMatt Macy */ 868eda14cbcSMatt Macy void 869eda14cbcSMatt Macy dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 870eda14cbcSMatt Macy { 871c9539b89SMartin Matuska zap_cursor_t zc, pzc; 8722a58b312SMartin Matuska zap_attribute_t *za, *pza; 873eda14cbcSMatt Macy dmu_buf_t *bonus; 874eda14cbcSMatt Macy dsl_deadlist_phys_t *dlp; 875eda14cbcSMatt Macy dmu_object_info_t doi; 876c9539b89SMartin Matuska int error, perror, i; 877eda14cbcSMatt Macy 878eda14cbcSMatt Macy VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); 879eda14cbcSMatt Macy if (doi.doi_type == DMU_OT_BPOBJ) { 880eda14cbcSMatt Macy bpobj_t bpo; 881eda14cbcSMatt Macy VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); 882eda14cbcSMatt Macy VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); 883eda14cbcSMatt Macy bpobj_close(&bpo); 884eda14cbcSMatt Macy return; 885eda14cbcSMatt Macy } 886eda14cbcSMatt Macy 8877a7741afSMartin Matuska za = zap_attribute_alloc(); 8887a7741afSMartin Matuska pza = zap_attribute_alloc(); 8892a58b312SMartin Matuska 890eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 891c9539b89SMartin Matuska /* 892c9539b89SMartin Matuska * Prefetch up to 128 deadlists first and then more as we progress. 893c9539b89SMartin Matuska * The limit is a balance between ARC use and diminishing returns. 894c9539b89SMartin Matuska */ 895c9539b89SMartin Matuska for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; 8962a58b312SMartin Matuska (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128; 897c9539b89SMartin Matuska zap_cursor_advance(&pzc), i++) { 8982a58b312SMartin Matuska dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, 8992a58b312SMartin Matuska zfs_strtonum(pza->za_name, NULL)); 900c9539b89SMartin Matuska } 901eda14cbcSMatt Macy for (zap_cursor_init(&zc, dl->dl_os, obj); 9022a58b312SMartin Matuska (error = zap_cursor_retrieve(&zc, za)) == 0; 903eda14cbcSMatt Macy zap_cursor_advance(&zc)) { 904315ee00fSMartin Matuska dsl_deadlist_insert_bpobj(dl, za->za_first_integer, 905315ee00fSMartin Matuska zfs_strtonum(za->za_name, NULL), tx); 906315ee00fSMartin Matuska VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); 907c9539b89SMartin Matuska if (perror == 0) { 9082a58b312SMartin Matuska dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, 9092a58b312SMartin Matuska zfs_strtonum(pza->za_name, NULL)); 910c9539b89SMartin Matuska zap_cursor_advance(&pzc); 9112a58b312SMartin Matuska perror = zap_cursor_retrieve(&pzc, pza); 912c9539b89SMartin Matuska } 913eda14cbcSMatt Macy } 914eda14cbcSMatt Macy VERIFY3U(error, ==, ENOENT); 915eda14cbcSMatt Macy zap_cursor_fini(&zc); 916c9539b89SMartin Matuska zap_cursor_fini(&pzc); 917eda14cbcSMatt Macy 918eda14cbcSMatt Macy VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 919eda14cbcSMatt Macy dlp = bonus->db_data; 920eda14cbcSMatt Macy dmu_buf_will_dirty(bonus, tx); 921da5137abSMartin Matuska memset(dlp, 0, sizeof (*dlp)); 922eda14cbcSMatt Macy dmu_buf_rele(bonus, FTAG); 923eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 9242a58b312SMartin Matuska 9257a7741afSMartin Matuska zap_attribute_free(za); 9267a7741afSMartin Matuska zap_attribute_free(pza); 927eda14cbcSMatt Macy } 928eda14cbcSMatt Macy 929eda14cbcSMatt Macy /* 930eda14cbcSMatt Macy * Remove entries on dl that are born > mintxg, and put them on the bpobj. 931eda14cbcSMatt Macy */ 932eda14cbcSMatt Macy void 933eda14cbcSMatt Macy dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 934eda14cbcSMatt Macy dmu_tx_t *tx) 935eda14cbcSMatt Macy { 936eda14cbcSMatt Macy dsl_deadlist_entry_t dle_tofind; 937c9539b89SMartin Matuska dsl_deadlist_entry_t *dle, *pdle; 938eda14cbcSMatt Macy avl_index_t where; 939c9539b89SMartin Matuska int i; 940eda14cbcSMatt Macy 941eda14cbcSMatt Macy ASSERT(!dl->dl_oldfmt); 942eda14cbcSMatt Macy 943eda14cbcSMatt Macy mutex_enter(&dl->dl_lock); 944eda14cbcSMatt Macy dmu_buf_will_dirty(dl->dl_dbuf, tx); 945eda14cbcSMatt Macy dsl_deadlist_load_tree(dl); 946eda14cbcSMatt Macy 947eda14cbcSMatt Macy dle_tofind.dle_mintxg = mintxg; 948eda14cbcSMatt Macy dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 949eda14cbcSMatt Macy if (dle == NULL) 950eda14cbcSMatt Macy dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 951c9539b89SMartin Matuska /* 952c9539b89SMartin Matuska * Prefetch up to 128 deadlists first and then more as we progress. 953c9539b89SMartin Matuska * The limit is a balance between ARC use and diminishing returns. 954c9539b89SMartin Matuska */ 9552a58b312SMartin Matuska for (pdle = dle, i = 0; pdle && i < 128; i++) { 956c9539b89SMartin Matuska bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); 957c9539b89SMartin Matuska pdle = AVL_NEXT(&dl->dl_tree, pdle); 958c9539b89SMartin Matuska } 959eda14cbcSMatt Macy while (dle) { 960eda14cbcSMatt Macy uint64_t used, comp, uncomp; 961eda14cbcSMatt Macy dsl_deadlist_entry_t *dle_next; 962eda14cbcSMatt Macy 963eda14cbcSMatt Macy bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 964c9539b89SMartin Matuska if (pdle) { 965c9539b89SMartin Matuska bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); 966c9539b89SMartin Matuska pdle = AVL_NEXT(&dl->dl_tree, pdle); 967c9539b89SMartin Matuska } 968eda14cbcSMatt Macy 969eda14cbcSMatt Macy VERIFY0(bpobj_space(&dle->dle_bpobj, 970eda14cbcSMatt Macy &used, &comp, &uncomp)); 971eda14cbcSMatt Macy ASSERT3U(dl->dl_phys->dl_used, >=, used); 972eda14cbcSMatt Macy ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 973eda14cbcSMatt Macy ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 974eda14cbcSMatt Macy dl->dl_phys->dl_used -= used; 975eda14cbcSMatt Macy dl->dl_phys->dl_comp -= comp; 976eda14cbcSMatt Macy dl->dl_phys->dl_uncomp -= uncomp; 977eda14cbcSMatt Macy 978eda14cbcSMatt Macy VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, 979eda14cbcSMatt Macy dle->dle_mintxg, tx)); 980eda14cbcSMatt Macy 981eda14cbcSMatt Macy dle_next = AVL_NEXT(&dl->dl_tree, dle); 982eda14cbcSMatt Macy avl_remove(&dl->dl_tree, dle); 983eda14cbcSMatt Macy bpobj_close(&dle->dle_bpobj); 984eda14cbcSMatt Macy kmem_free(dle, sizeof (*dle)); 985eda14cbcSMatt Macy dle = dle_next; 986eda14cbcSMatt Macy } 987eda14cbcSMatt Macy mutex_exit(&dl->dl_lock); 988eda14cbcSMatt Macy } 989eda14cbcSMatt Macy 990eda14cbcSMatt Macy typedef struct livelist_entry { 99116038816SMartin Matuska blkptr_t le_bp; 99216038816SMartin Matuska uint32_t le_refcnt; 993eda14cbcSMatt Macy avl_node_t le_node; 994eda14cbcSMatt Macy } livelist_entry_t; 995eda14cbcSMatt Macy 996eda14cbcSMatt Macy static int 997eda14cbcSMatt Macy livelist_compare(const void *larg, const void *rarg) 998eda14cbcSMatt Macy { 99916038816SMartin Matuska const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; 100016038816SMartin Matuska const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; 1001eda14cbcSMatt Macy 1002eda14cbcSMatt Macy /* Sort them according to dva[0] */ 1003eda14cbcSMatt Macy uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); 1004eda14cbcSMatt Macy uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); 1005eda14cbcSMatt Macy 1006eda14cbcSMatt Macy if (l_dva0_vdev != r_dva0_vdev) 1007eda14cbcSMatt Macy return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); 1008eda14cbcSMatt Macy 1009eda14cbcSMatt Macy /* if vdevs are equal, sort by offsets. */ 1010eda14cbcSMatt Macy uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); 1011eda14cbcSMatt Macy uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); 1012eda14cbcSMatt Macy return (TREE_CMP(l_dva0_offset, r_dva0_offset)); 1013eda14cbcSMatt Macy } 1014eda14cbcSMatt Macy 1015eda14cbcSMatt Macy struct livelist_iter_arg { 1016eda14cbcSMatt Macy avl_tree_t *avl; 1017eda14cbcSMatt Macy bplist_t *to_free; 1018eda14cbcSMatt Macy zthr_t *t; 1019eda14cbcSMatt Macy }; 1020eda14cbcSMatt Macy 1021eda14cbcSMatt Macy /* 1022eda14cbcSMatt Macy * Expects an AVL tree which is incrementally filled will FREE blkptrs 1023eda14cbcSMatt Macy * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a 1024eda14cbcSMatt Macy * corresponding FREE are stored in the supplied bplist. 102516038816SMartin Matuska * 1026f552d7adSMartin Matuska * Note that multiple FREE and ALLOC entries for the same blkptr may be 1027f552d7adSMartin Matuska * encountered when dedup or block cloning is involved. For this reason we 1028f552d7adSMartin Matuska * keep a refcount for all the FREE entries of each blkptr and ensure that 102916038816SMartin Matuska * each of those FREE entries has a corresponding ALLOC preceding it. 1030eda14cbcSMatt Macy */ 1031eda14cbcSMatt Macy static int 1032eda14cbcSMatt Macy dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, 1033eda14cbcSMatt Macy dmu_tx_t *tx) 1034eda14cbcSMatt Macy { 1035eda14cbcSMatt Macy struct livelist_iter_arg *lia = arg; 1036eda14cbcSMatt Macy avl_tree_t *avl = lia->avl; 1037eda14cbcSMatt Macy bplist_t *to_free = lia->to_free; 1038eda14cbcSMatt Macy zthr_t *t = lia->t; 1039eda14cbcSMatt Macy ASSERT(tx == NULL); 1040eda14cbcSMatt Macy 1041eda14cbcSMatt Macy if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) 1042eda14cbcSMatt Macy return (SET_ERROR(EINTR)); 104316038816SMartin Matuska 1044eda14cbcSMatt Macy livelist_entry_t node; 104516038816SMartin Matuska node.le_bp = *bp; 1046eda14cbcSMatt Macy livelist_entry_t *found = avl_find(avl, &node, NULL); 1047f552d7adSMartin Matuska if (found) { 1048f552d7adSMartin Matuska ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); 1049f552d7adSMartin Matuska ASSERT3U(BP_GET_CHECKSUM(bp), ==, 1050f552d7adSMartin Matuska BP_GET_CHECKSUM(&found->le_bp)); 1051783d3ff6SMartin Matuska ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); 1052f552d7adSMartin Matuska } 105316038816SMartin Matuska if (bp_freed) { 105416038816SMartin Matuska if (found == NULL) { 105516038816SMartin Matuska /* first free entry for this blkptr */ 105616038816SMartin Matuska livelist_entry_t *e = 105716038816SMartin Matuska kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); 105816038816SMartin Matuska e->le_bp = *bp; 105916038816SMartin Matuska e->le_refcnt = 1; 106016038816SMartin Matuska avl_add(avl, e); 106116038816SMartin Matuska } else { 1062f552d7adSMartin Matuska /* 1063f552d7adSMartin Matuska * Deduped or cloned block free. We could assert D bit 1064f552d7adSMartin Matuska * for dedup, but there is no such one for cloning. 1065f552d7adSMartin Matuska */ 106616038816SMartin Matuska ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); 106716038816SMartin Matuska found->le_refcnt++; 106816038816SMartin Matuska } 106916038816SMartin Matuska } else { 107016038816SMartin Matuska if (found == NULL) { 107116038816SMartin Matuska /* block is currently marked as allocated */ 107216038816SMartin Matuska bplist_append(to_free, bp); 107316038816SMartin Matuska } else { 107416038816SMartin Matuska /* alloc matches a free entry */ 107516038816SMartin Matuska ASSERT3U(found->le_refcnt, !=, 0); 107616038816SMartin Matuska found->le_refcnt--; 107716038816SMartin Matuska if (found->le_refcnt == 0) { 107816038816SMartin Matuska /* all tracked free pairs have been matched */ 1079eda14cbcSMatt Macy avl_remove(avl, found); 1080eda14cbcSMatt Macy kmem_free(found, sizeof (livelist_entry_t)); 108116038816SMartin Matuska } 1082eda14cbcSMatt Macy } 1083eda14cbcSMatt Macy } 1084eda14cbcSMatt Macy return (0); 1085eda14cbcSMatt Macy } 1086eda14cbcSMatt Macy 1087eda14cbcSMatt Macy /* 1088eda14cbcSMatt Macy * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs 1089eda14cbcSMatt Macy * which have an ALLOC entry but no matching FREE 1090eda14cbcSMatt Macy */ 1091eda14cbcSMatt Macy int 1092eda14cbcSMatt Macy dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, 1093eda14cbcSMatt Macy uint64_t *size) 1094eda14cbcSMatt Macy { 1095eda14cbcSMatt Macy avl_tree_t avl; 1096eda14cbcSMatt Macy avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), 1097eda14cbcSMatt Macy offsetof(livelist_entry_t, le_node)); 1098eda14cbcSMatt Macy 1099eda14cbcSMatt Macy /* process the sublist */ 1100eda14cbcSMatt Macy struct livelist_iter_arg arg = { 1101eda14cbcSMatt Macy .avl = &avl, 1102eda14cbcSMatt Macy .to_free = to_free, 1103eda14cbcSMatt Macy .t = t 1104eda14cbcSMatt Macy }; 1105eda14cbcSMatt Macy int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); 1106be181ee2SMartin Matuska VERIFY(err != 0 || avl_numnodes(&avl) == 0); 1107eda14cbcSMatt Macy 1108be181ee2SMartin Matuska void *cookie = NULL; 1109be181ee2SMartin Matuska livelist_entry_t *le = NULL; 1110be181ee2SMartin Matuska while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { 1111be181ee2SMartin Matuska kmem_free(le, sizeof (livelist_entry_t)); 1112be181ee2SMartin Matuska } 1113eda14cbcSMatt Macy avl_destroy(&avl); 1114eda14cbcSMatt Macy return (err); 1115eda14cbcSMatt Macy } 1116eda14cbcSMatt Macy 1117dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW, 1118eda14cbcSMatt Macy "Size to start the next sub-livelist in a livelist"); 1119eda14cbcSMatt Macy 1120eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, 1121eda14cbcSMatt Macy "Threshold at which livelist is disabled"); 1122