xref: /freebsd-src/sys/contrib/openzfs/module/zfs/dsl_deadlist.c (revision 17aab35a77a1b1bf02fc85bb8ffadccb0ca5006d)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy /*
22eda14cbcSMatt Macy  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23eda14cbcSMatt Macy  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
24eda14cbcSMatt Macy  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25eda14cbcSMatt Macy  */
26eda14cbcSMatt Macy 
27eda14cbcSMatt Macy #include <sys/dmu.h>
28eda14cbcSMatt Macy #include <sys/zap.h>
29eda14cbcSMatt Macy #include <sys/zfs_context.h>
30eda14cbcSMatt Macy #include <sys/dsl_pool.h>
31eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
32eda14cbcSMatt Macy 
33eda14cbcSMatt Macy /*
34eda14cbcSMatt Macy  * Deadlist concurrency:
35eda14cbcSMatt Macy  *
36eda14cbcSMatt Macy  * Deadlists can only be modified from the syncing thread.
37eda14cbcSMatt Macy  *
38eda14cbcSMatt Macy  * Except for dsl_deadlist_insert(), it can only be modified with the
39eda14cbcSMatt Macy  * dp_config_rwlock held with RW_WRITER.
40eda14cbcSMatt Macy  *
41eda14cbcSMatt Macy  * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
42eda14cbcSMatt Macy  * be called concurrently, from open context, with the dl_config_rwlock held
43eda14cbcSMatt Macy  * with RW_READER.
44eda14cbcSMatt Macy  *
45eda14cbcSMatt Macy  * Therefore, we only need to provide locking between dsl_deadlist_insert() and
46eda14cbcSMatt Macy  * the accessors, protecting:
47eda14cbcSMatt Macy  *     dl_phys->dl_used,comp,uncomp
48eda14cbcSMatt Macy  *     and protecting the dl_tree from being loaded.
49eda14cbcSMatt Macy  * The locking is provided by dl_lock.  Note that locking on the bpobj_t
50eda14cbcSMatt Macy  * provides its own locking, and dl_oldfmt is immutable.
51eda14cbcSMatt Macy  */
52eda14cbcSMatt Macy 
53eda14cbcSMatt Macy /*
54eda14cbcSMatt Macy  * Livelist Overview
55eda14cbcSMatt Macy  * ================
56eda14cbcSMatt Macy  *
57eda14cbcSMatt Macy  * Livelists use the same 'deadlist_t' struct as deadlists and are also used
58eda14cbcSMatt Macy  * to track blkptrs over the lifetime of a dataset. Livelists however, belong
59eda14cbcSMatt Macy  * to clones and track the blkptrs that are clone-specific (were born after
60eda14cbcSMatt Macy  * the clone's creation). The exception is embedded block pointers which are
61eda14cbcSMatt Macy  * not included in livelists because they do not need to be freed.
62eda14cbcSMatt Macy  *
63eda14cbcSMatt Macy  * When it comes time to delete the clone, the livelist provides a quick
64eda14cbcSMatt Macy  * reference as to what needs to be freed. For this reason, livelists also track
65eda14cbcSMatt Macy  * when clone-specific blkptrs are freed before deletion to prevent double
66eda14cbcSMatt Macy  * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
67eda14cbcSMatt Macy  * deletion algorithm iterates backwards over the livelist, matching
68eda14cbcSMatt Macy  * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
69eda14cbcSMatt Macy  * are also updated in the case when blkptrs are remapped: the old version
70eda14cbcSMatt Macy  * of the blkptr is cancelled out with a FREE and the new version is tracked
71eda14cbcSMatt Macy  * with an ALLOC.
72eda14cbcSMatt Macy  *
73eda14cbcSMatt Macy  * To bound the amount of memory required for deletion, livelists over a
74eda14cbcSMatt Macy  * certain size are spread over multiple entries. Entries are grouped by
75eda14cbcSMatt Macy  * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
76eda14cbcSMatt Macy  * be in the same entry. This allows us to delete livelists incrementally
77eda14cbcSMatt Macy  * over multiple syncs, one entry at a time.
78eda14cbcSMatt Macy  *
79eda14cbcSMatt Macy  * During the lifetime of the clone, livelists can get extremely large.
80eda14cbcSMatt Macy  * Their size is managed by periodic condensing (preemptively cancelling out
81eda14cbcSMatt Macy  * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
82eda14cbcSMatt Macy  * the shared space between the clone and its origin is so small that it
83eda14cbcSMatt Macy  * doesn't make sense to use livelists anymore.
84eda14cbcSMatt Macy  */
85eda14cbcSMatt Macy 
86eda14cbcSMatt Macy /*
87eda14cbcSMatt Macy  * The threshold sublist size at which we create a new sub-livelist for the
88eda14cbcSMatt Macy  * next txg. However, since blkptrs of the same transaction group must be in
89eda14cbcSMatt Macy  * the same sub-list, the actual sublist size may exceed this. When picking the
90eda14cbcSMatt Macy  * size we had to balance the fact that larger sublists mean fewer sublists
91eda14cbcSMatt Macy  * (decreasing the cost of insertion) against the consideration that sublists
92eda14cbcSMatt Macy  * will be loaded into memory and shouldn't take up an inordinate amount of
93eda14cbcSMatt Macy  * space. We settled on ~500000 entries, corresponding to roughly 128M.
94eda14cbcSMatt Macy  */
95dbd5678dSMartin Matuska uint64_t zfs_livelist_max_entries = 500000;
96eda14cbcSMatt Macy 
97eda14cbcSMatt Macy /*
98eda14cbcSMatt Macy  * We can approximate how much of a performance gain a livelist will give us
99eda14cbcSMatt Macy  * based on the percentage of blocks shared between the clone and its origin.
100eda14cbcSMatt Macy  * 0 percent shared means that the clone has completely diverged and that the
101eda14cbcSMatt Macy  * old method is maximally effective: every read from the block tree will
102eda14cbcSMatt Macy  * result in lots of frees. Livelists give us gains when they track blocks
103eda14cbcSMatt Macy  * scattered across the tree, when one read in the old method might only
104eda14cbcSMatt Macy  * result in a few frees. Once the clone has been overwritten enough,
105eda14cbcSMatt Macy  * writes are no longer sparse and we'll no longer get much of a benefit from
106eda14cbcSMatt Macy  * tracking them with a livelist. We chose a lower limit of 75 percent shared
107eda14cbcSMatt Macy  * (25 percent overwritten). This means that 1/4 of all block pointers will be
108eda14cbcSMatt Macy  * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
109eda14cbcSMatt Macy  * to make deletion 4x faster. Once the amount of shared space drops below this
110eda14cbcSMatt Macy  * threshold, the clone will revert to the old deletion method.
111eda14cbcSMatt Macy  */
112eda14cbcSMatt Macy int zfs_livelist_min_percent_shared = 75;
113eda14cbcSMatt Macy 
114eda14cbcSMatt Macy static int
115eda14cbcSMatt Macy dsl_deadlist_compare(const void *arg1, const void *arg2)
116eda14cbcSMatt Macy {
117eda14cbcSMatt Macy 	const dsl_deadlist_entry_t *dle1 = arg1;
118eda14cbcSMatt Macy 	const dsl_deadlist_entry_t *dle2 = arg2;
119eda14cbcSMatt Macy 
120eda14cbcSMatt Macy 	return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
121eda14cbcSMatt Macy }
122eda14cbcSMatt Macy 
123eda14cbcSMatt Macy static int
124eda14cbcSMatt Macy dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
125eda14cbcSMatt Macy {
126eda14cbcSMatt Macy 	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
127eda14cbcSMatt Macy 	const dsl_deadlist_cache_entry_t *dlce2 = arg2;
128eda14cbcSMatt Macy 
129eda14cbcSMatt Macy 	return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
130eda14cbcSMatt Macy }
131eda14cbcSMatt Macy 
132eda14cbcSMatt Macy static void
133eda14cbcSMatt Macy dsl_deadlist_load_tree(dsl_deadlist_t *dl)
134eda14cbcSMatt Macy {
135eda14cbcSMatt Macy 	zap_cursor_t zc;
1367a7741afSMartin Matuska 	zap_attribute_t *za;
137eda14cbcSMatt Macy 	int error;
138eda14cbcSMatt Macy 
139eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dl->dl_lock));
140eda14cbcSMatt Macy 
141eda14cbcSMatt Macy 	ASSERT(!dl->dl_oldfmt);
142eda14cbcSMatt Macy 	if (dl->dl_havecache) {
143eda14cbcSMatt Macy 		/*
144eda14cbcSMatt Macy 		 * After loading the tree, the caller may modify the tree,
145eda14cbcSMatt Macy 		 * e.g. to add or remove nodes, or to make a node no longer
146eda14cbcSMatt Macy 		 * refer to the empty_bpobj.  These changes would make the
147eda14cbcSMatt Macy 		 * dl_cache incorrect.  Therefore we discard the cache here,
148eda14cbcSMatt Macy 		 * so that it can't become incorrect.
149eda14cbcSMatt Macy 		 */
150eda14cbcSMatt Macy 		dsl_deadlist_cache_entry_t *dlce;
151eda14cbcSMatt Macy 		void *cookie = NULL;
152eda14cbcSMatt Macy 		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
153eda14cbcSMatt Macy 		    != NULL) {
154eda14cbcSMatt Macy 			kmem_free(dlce, sizeof (*dlce));
155eda14cbcSMatt Macy 		}
156eda14cbcSMatt Macy 		avl_destroy(&dl->dl_cache);
157eda14cbcSMatt Macy 		dl->dl_havecache = B_FALSE;
158eda14cbcSMatt Macy 	}
159eda14cbcSMatt Macy 	if (dl->dl_havetree)
160eda14cbcSMatt Macy 		return;
161eda14cbcSMatt Macy 
1627a7741afSMartin Matuska 	za = zap_attribute_alloc();
163eda14cbcSMatt Macy 	avl_create(&dl->dl_tree, dsl_deadlist_compare,
164eda14cbcSMatt Macy 	    sizeof (dsl_deadlist_entry_t),
165eda14cbcSMatt Macy 	    offsetof(dsl_deadlist_entry_t, dle_node));
166eda14cbcSMatt Macy 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
1677a7741afSMartin Matuska 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
168eda14cbcSMatt Macy 	    zap_cursor_advance(&zc)) {
169eda14cbcSMatt Macy 		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
1707a7741afSMartin Matuska 		dle->dle_mintxg = zfs_strtonum(za->za_name, NULL);
171eda14cbcSMatt Macy 
172eda14cbcSMatt Macy 		/*
173eda14cbcSMatt Macy 		 * Prefetch all the bpobj's so that we do that i/o
174eda14cbcSMatt Macy 		 * in parallel.  Then open them all in a second pass.
175eda14cbcSMatt Macy 		 */
1767a7741afSMartin Matuska 		dle->dle_bpobj.bpo_object = za->za_first_integer;
177315ee00fSMartin Matuska 		dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
178315ee00fSMartin Matuska 		    ZIO_PRIORITY_SYNC_READ);
179eda14cbcSMatt Macy 
180eda14cbcSMatt Macy 		avl_add(&dl->dl_tree, dle);
181eda14cbcSMatt Macy 	}
182eda14cbcSMatt Macy 	VERIFY3U(error, ==, ENOENT);
183eda14cbcSMatt Macy 	zap_cursor_fini(&zc);
1847a7741afSMartin Matuska 	zap_attribute_free(za);
185eda14cbcSMatt Macy 
186eda14cbcSMatt Macy 	for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
187eda14cbcSMatt Macy 	    dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
188eda14cbcSMatt Macy 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
189eda14cbcSMatt Macy 		    dle->dle_bpobj.bpo_object));
190eda14cbcSMatt Macy 	}
191eda14cbcSMatt Macy 	dl->dl_havetree = B_TRUE;
192eda14cbcSMatt Macy }
193eda14cbcSMatt Macy 
194eda14cbcSMatt Macy /*
195eda14cbcSMatt Macy  * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
196eda14cbcSMatt Macy  * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It
197eda14cbcSMatt Macy  * is used only for gathering space statistics.  The dl_cache has two
198eda14cbcSMatt Macy  * advantages over the dl_tree:
199eda14cbcSMatt Macy  *
200eda14cbcSMatt Macy  * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
201eda14cbcSMatt Macy  * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
202eda14cbcSMatt Macy  * many times and to inquire about its (zero) space stats many times.
203eda14cbcSMatt Macy  *
204eda14cbcSMatt Macy  * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
205eda14cbcSMatt Macy  * the dl_tree of snapshots when deleting a snapshot, after which we free the
206eda14cbcSMatt Macy  * dl_tree with dsl_deadlist_discard_tree
207eda14cbcSMatt Macy  */
208eda14cbcSMatt Macy static void
209eda14cbcSMatt Macy dsl_deadlist_load_cache(dsl_deadlist_t *dl)
210eda14cbcSMatt Macy {
211eda14cbcSMatt Macy 	zap_cursor_t zc;
2127a7741afSMartin Matuska 	zap_attribute_t *za;
213eda14cbcSMatt Macy 	int error;
214eda14cbcSMatt Macy 
215eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dl->dl_lock));
216eda14cbcSMatt Macy 
217eda14cbcSMatt Macy 	ASSERT(!dl->dl_oldfmt);
218eda14cbcSMatt Macy 	if (dl->dl_havecache)
219eda14cbcSMatt Macy 		return;
220eda14cbcSMatt Macy 
221eda14cbcSMatt Macy 	uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;
222eda14cbcSMatt Macy 
223eda14cbcSMatt Macy 	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
224eda14cbcSMatt Macy 	    sizeof (dsl_deadlist_cache_entry_t),
225eda14cbcSMatt Macy 	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));
2267a7741afSMartin Matuska 	za = zap_attribute_alloc();
227eda14cbcSMatt Macy 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
2287a7741afSMartin Matuska 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
229eda14cbcSMatt Macy 	    zap_cursor_advance(&zc)) {
2307a7741afSMartin Matuska 		if (za->za_first_integer == empty_bpobj)
231eda14cbcSMatt Macy 			continue;
232eda14cbcSMatt Macy 		dsl_deadlist_cache_entry_t *dlce =
233eda14cbcSMatt Macy 		    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
2347a7741afSMartin Matuska 		dlce->dlce_mintxg = zfs_strtonum(za->za_name, NULL);
235eda14cbcSMatt Macy 
236eda14cbcSMatt Macy 		/*
237eda14cbcSMatt Macy 		 * Prefetch all the bpobj's so that we do that i/o
238eda14cbcSMatt Macy 		 * in parallel.  Then open them all in a second pass.
239eda14cbcSMatt Macy 		 */
2407a7741afSMartin Matuska 		dlce->dlce_bpobj = za->za_first_integer;
241315ee00fSMartin Matuska 		dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
242315ee00fSMartin Matuska 		    ZIO_PRIORITY_SYNC_READ);
243eda14cbcSMatt Macy 		avl_add(&dl->dl_cache, dlce);
244eda14cbcSMatt Macy 	}
245eda14cbcSMatt Macy 	VERIFY3U(error, ==, ENOENT);
246eda14cbcSMatt Macy 	zap_cursor_fini(&zc);
2477a7741afSMartin Matuska 	zap_attribute_free(za);
248eda14cbcSMatt Macy 
249eda14cbcSMatt Macy 	for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
250eda14cbcSMatt Macy 	    dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
251eda14cbcSMatt Macy 		bpobj_t bpo;
252eda14cbcSMatt Macy 		VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));
253eda14cbcSMatt Macy 
254eda14cbcSMatt Macy 		VERIFY0(bpobj_space(&bpo,
255eda14cbcSMatt Macy 		    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
256eda14cbcSMatt Macy 		bpobj_close(&bpo);
257eda14cbcSMatt Macy 	}
258eda14cbcSMatt Macy 	dl->dl_havecache = B_TRUE;
259eda14cbcSMatt Macy }
260eda14cbcSMatt Macy 
261eda14cbcSMatt Macy /*
262eda14cbcSMatt Macy  * Discard the tree to save memory.
263eda14cbcSMatt Macy  */
264eda14cbcSMatt Macy void
265eda14cbcSMatt Macy dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
266eda14cbcSMatt Macy {
267eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
268eda14cbcSMatt Macy 
269eda14cbcSMatt Macy 	if (!dl->dl_havetree) {
270eda14cbcSMatt Macy 		mutex_exit(&dl->dl_lock);
271eda14cbcSMatt Macy 		return;
272eda14cbcSMatt Macy 	}
273eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
274eda14cbcSMatt Macy 	void *cookie = NULL;
275eda14cbcSMatt Macy 	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
276eda14cbcSMatt Macy 		bpobj_close(&dle->dle_bpobj);
277eda14cbcSMatt Macy 		kmem_free(dle, sizeof (*dle));
278eda14cbcSMatt Macy 	}
279eda14cbcSMatt Macy 	avl_destroy(&dl->dl_tree);
280eda14cbcSMatt Macy 
281eda14cbcSMatt Macy 	dl->dl_havetree = B_FALSE;
282eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
283eda14cbcSMatt Macy }
284eda14cbcSMatt Macy 
285eda14cbcSMatt Macy void
286eda14cbcSMatt Macy dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
287eda14cbcSMatt Macy {
288eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
289eda14cbcSMatt Macy 
290eda14cbcSMatt Macy 	ASSERT(dsl_deadlist_is_open(dl));
291eda14cbcSMatt Macy 
292eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
293eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
294eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
295eda14cbcSMatt Macy 	for (dle = avl_first(&dl->dl_tree); dle != NULL;
296eda14cbcSMatt Macy 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
297eda14cbcSMatt Macy 		if (func(args, dle) != 0)
298eda14cbcSMatt Macy 			break;
299eda14cbcSMatt Macy 	}
300eda14cbcSMatt Macy }
301eda14cbcSMatt Macy 
302*17aab35aSMartin Matuska int
303eda14cbcSMatt Macy dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
304eda14cbcSMatt Macy {
305eda14cbcSMatt Macy 	dmu_object_info_t doi;
306*17aab35aSMartin Matuska 	int err;
307eda14cbcSMatt Macy 
308eda14cbcSMatt Macy 	ASSERT(!dsl_deadlist_is_open(dl));
309eda14cbcSMatt Macy 
310eda14cbcSMatt Macy 	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
311eda14cbcSMatt Macy 	dl->dl_os = os;
312eda14cbcSMatt Macy 	dl->dl_object = object;
313*17aab35aSMartin Matuska 	err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf);
314*17aab35aSMartin Matuska 	if (err != 0)
315*17aab35aSMartin Matuska 		return (err);
316eda14cbcSMatt Macy 	dmu_object_info_from_db(dl->dl_dbuf, &doi);
317eda14cbcSMatt Macy 	if (doi.doi_type == DMU_OT_BPOBJ) {
318eda14cbcSMatt Macy 		dmu_buf_rele(dl->dl_dbuf, dl);
319eda14cbcSMatt Macy 		dl->dl_dbuf = NULL;
320eda14cbcSMatt Macy 		dl->dl_oldfmt = B_TRUE;
321*17aab35aSMartin Matuska 		return (bpobj_open(&dl->dl_bpobj, os, object));
322eda14cbcSMatt Macy 	}
323eda14cbcSMatt Macy 
324eda14cbcSMatt Macy 	dl->dl_oldfmt = B_FALSE;
325eda14cbcSMatt Macy 	dl->dl_phys = dl->dl_dbuf->db_data;
326eda14cbcSMatt Macy 	dl->dl_havetree = B_FALSE;
327eda14cbcSMatt Macy 	dl->dl_havecache = B_FALSE;
328*17aab35aSMartin Matuska 	return (0);
329eda14cbcSMatt Macy }
330eda14cbcSMatt Macy 
331eda14cbcSMatt Macy boolean_t
332eda14cbcSMatt Macy dsl_deadlist_is_open(dsl_deadlist_t *dl)
333eda14cbcSMatt Macy {
334eda14cbcSMatt Macy 	return (dl->dl_os != NULL);
335eda14cbcSMatt Macy }
336eda14cbcSMatt Macy 
337eda14cbcSMatt Macy void
338eda14cbcSMatt Macy dsl_deadlist_close(dsl_deadlist_t *dl)
339eda14cbcSMatt Macy {
340eda14cbcSMatt Macy 	ASSERT(dsl_deadlist_is_open(dl));
341eda14cbcSMatt Macy 	mutex_destroy(&dl->dl_lock);
342eda14cbcSMatt Macy 
343eda14cbcSMatt Macy 	if (dl->dl_oldfmt) {
344eda14cbcSMatt Macy 		dl->dl_oldfmt = B_FALSE;
345eda14cbcSMatt Macy 		bpobj_close(&dl->dl_bpobj);
346eda14cbcSMatt Macy 		dl->dl_os = NULL;
347eda14cbcSMatt Macy 		dl->dl_object = 0;
348eda14cbcSMatt Macy 		return;
349eda14cbcSMatt Macy 	}
350eda14cbcSMatt Macy 
351eda14cbcSMatt Macy 	if (dl->dl_havetree) {
352eda14cbcSMatt Macy 		dsl_deadlist_entry_t *dle;
353eda14cbcSMatt Macy 		void *cookie = NULL;
354eda14cbcSMatt Macy 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
355eda14cbcSMatt Macy 		    != NULL) {
356eda14cbcSMatt Macy 			bpobj_close(&dle->dle_bpobj);
357eda14cbcSMatt Macy 			kmem_free(dle, sizeof (*dle));
358eda14cbcSMatt Macy 		}
359eda14cbcSMatt Macy 		avl_destroy(&dl->dl_tree);
360eda14cbcSMatt Macy 	}
361eda14cbcSMatt Macy 	if (dl->dl_havecache) {
362eda14cbcSMatt Macy 		dsl_deadlist_cache_entry_t *dlce;
363eda14cbcSMatt Macy 		void *cookie = NULL;
364eda14cbcSMatt Macy 		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
365eda14cbcSMatt Macy 		    != NULL) {
366eda14cbcSMatt Macy 			kmem_free(dlce, sizeof (*dlce));
367eda14cbcSMatt Macy 		}
368eda14cbcSMatt Macy 		avl_destroy(&dl->dl_cache);
369eda14cbcSMatt Macy 	}
370eda14cbcSMatt Macy 	dmu_buf_rele(dl->dl_dbuf, dl);
371eda14cbcSMatt Macy 	dl->dl_dbuf = NULL;
372eda14cbcSMatt Macy 	dl->dl_phys = NULL;
373eda14cbcSMatt Macy 	dl->dl_os = NULL;
374eda14cbcSMatt Macy 	dl->dl_object = 0;
375eda14cbcSMatt Macy }
376eda14cbcSMatt Macy 
377eda14cbcSMatt Macy uint64_t
378eda14cbcSMatt Macy dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
379eda14cbcSMatt Macy {
380eda14cbcSMatt Macy 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
381eda14cbcSMatt Macy 		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
382eda14cbcSMatt Macy 	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
383eda14cbcSMatt Macy 	    sizeof (dsl_deadlist_phys_t), tx));
384eda14cbcSMatt Macy }
385eda14cbcSMatt Macy 
386eda14cbcSMatt Macy void
387eda14cbcSMatt Macy dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
388eda14cbcSMatt Macy {
389eda14cbcSMatt Macy 	dmu_object_info_t doi;
390eda14cbcSMatt Macy 	zap_cursor_t zc;
3917a7741afSMartin Matuska 	zap_attribute_t *za;
392eda14cbcSMatt Macy 	int error;
393eda14cbcSMatt Macy 
394eda14cbcSMatt Macy 	VERIFY0(dmu_object_info(os, dlobj, &doi));
395eda14cbcSMatt Macy 	if (doi.doi_type == DMU_OT_BPOBJ) {
396eda14cbcSMatt Macy 		bpobj_free(os, dlobj, tx);
397eda14cbcSMatt Macy 		return;
398eda14cbcSMatt Macy 	}
399eda14cbcSMatt Macy 
4007a7741afSMartin Matuska 	za = zap_attribute_alloc();
401eda14cbcSMatt Macy 	for (zap_cursor_init(&zc, os, dlobj);
4027a7741afSMartin Matuska 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
403eda14cbcSMatt Macy 	    zap_cursor_advance(&zc)) {
4047a7741afSMartin Matuska 		uint64_t obj = za->za_first_integer;
405eda14cbcSMatt Macy 		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
406eda14cbcSMatt Macy 			bpobj_decr_empty(os, tx);
407eda14cbcSMatt Macy 		else
408eda14cbcSMatt Macy 			bpobj_free(os, obj, tx);
409eda14cbcSMatt Macy 	}
410eda14cbcSMatt Macy 	VERIFY3U(error, ==, ENOENT);
411eda14cbcSMatt Macy 	zap_cursor_fini(&zc);
4127a7741afSMartin Matuska 	zap_attribute_free(za);
413eda14cbcSMatt Macy 	VERIFY0(dmu_object_free(os, dlobj, tx));
414eda14cbcSMatt Macy }
415eda14cbcSMatt Macy 
416eda14cbcSMatt Macy static void
417eda14cbcSMatt Macy dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
418eda14cbcSMatt Macy     const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
419eda14cbcSMatt Macy {
420eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dl->dl_lock));
421eda14cbcSMatt Macy 	if (dle->dle_bpobj.bpo_object ==
422eda14cbcSMatt Macy 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
423eda14cbcSMatt Macy 		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
424eda14cbcSMatt Macy 		bpobj_close(&dle->dle_bpobj);
425eda14cbcSMatt Macy 		bpobj_decr_empty(dl->dl_os, tx);
426eda14cbcSMatt Macy 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
427eda14cbcSMatt Macy 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
428eda14cbcSMatt Macy 		    dle->dle_mintxg, obj, tx));
429eda14cbcSMatt Macy 	}
430eda14cbcSMatt Macy 	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
431eda14cbcSMatt Macy }
432eda14cbcSMatt Macy 
433eda14cbcSMatt Macy static void
434eda14cbcSMatt Macy dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
435eda14cbcSMatt Macy     uint64_t obj, dmu_tx_t *tx)
436eda14cbcSMatt Macy {
437eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dl->dl_lock));
438eda14cbcSMatt Macy 	if (dle->dle_bpobj.bpo_object !=
439eda14cbcSMatt Macy 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
440eda14cbcSMatt Macy 		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
441eda14cbcSMatt Macy 	} else {
442eda14cbcSMatt Macy 		bpobj_close(&dle->dle_bpobj);
443eda14cbcSMatt Macy 		bpobj_decr_empty(dl->dl_os, tx);
444eda14cbcSMatt Macy 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
445eda14cbcSMatt Macy 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
446eda14cbcSMatt Macy 		    dle->dle_mintxg, obj, tx));
447eda14cbcSMatt Macy 	}
448eda14cbcSMatt Macy }
449eda14cbcSMatt Macy 
450c9539b89SMartin Matuska /*
451c9539b89SMartin Matuska  * Prefetch metadata required for dle_enqueue_subobj().
452c9539b89SMartin Matuska  */
453c9539b89SMartin Matuska static void
454c9539b89SMartin Matuska dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
455c9539b89SMartin Matuska     uint64_t obj)
456c9539b89SMartin Matuska {
457c9539b89SMartin Matuska 	if (dle->dle_bpobj.bpo_object !=
458c9539b89SMartin Matuska 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
459c9539b89SMartin Matuska 		bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
460c9539b89SMartin Matuska }
461c9539b89SMartin Matuska 
462eda14cbcSMatt Macy void
463eda14cbcSMatt Macy dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
464eda14cbcSMatt Macy     dmu_tx_t *tx)
465eda14cbcSMatt Macy {
466eda14cbcSMatt Macy 	dsl_deadlist_entry_t dle_tofind;
467eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
468eda14cbcSMatt Macy 	avl_index_t where;
469eda14cbcSMatt Macy 
470eda14cbcSMatt Macy 	if (dl->dl_oldfmt) {
471eda14cbcSMatt Macy 		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
472eda14cbcSMatt Macy 		return;
473eda14cbcSMatt Macy 	}
474eda14cbcSMatt Macy 
475eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
476eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
477eda14cbcSMatt Macy 
478eda14cbcSMatt Macy 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
479eda14cbcSMatt Macy 
480eda14cbcSMatt Macy 	int sign = bp_freed ? -1 : +1;
481eda14cbcSMatt Macy 	dl->dl_phys->dl_used +=
482eda14cbcSMatt Macy 	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
483eda14cbcSMatt Macy 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
484eda14cbcSMatt Macy 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
485eda14cbcSMatt Macy 
486783d3ff6SMartin Matuska 	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
487eda14cbcSMatt Macy 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
488eda14cbcSMatt Macy 	if (dle == NULL)
489eda14cbcSMatt Macy 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
490eda14cbcSMatt Macy 	else
491eda14cbcSMatt Macy 		dle = AVL_PREV(&dl->dl_tree, dle);
492eda14cbcSMatt Macy 
493eda14cbcSMatt Macy 	if (dle == NULL) {
494eda14cbcSMatt Macy 		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
495783d3ff6SMartin Matuska 		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
496eda14cbcSMatt Macy 		dle = avl_first(&dl->dl_tree);
497eda14cbcSMatt Macy 	}
498eda14cbcSMatt Macy 
499eda14cbcSMatt Macy 	ASSERT3P(dle, !=, NULL);
500eda14cbcSMatt Macy 	dle_enqueue(dl, dle, bp, bp_freed, tx);
501eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
502eda14cbcSMatt Macy }
503eda14cbcSMatt Macy 
504eda14cbcSMatt Macy int
505eda14cbcSMatt Macy dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
506eda14cbcSMatt Macy {
507eda14cbcSMatt Macy 	dsl_deadlist_t *dl = arg;
508eda14cbcSMatt Macy 	dsl_deadlist_insert(dl, bp, B_FALSE, tx);
509eda14cbcSMatt Macy 	return (0);
510eda14cbcSMatt Macy }
511eda14cbcSMatt Macy 
512eda14cbcSMatt Macy int
513eda14cbcSMatt Macy dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
514eda14cbcSMatt Macy {
515eda14cbcSMatt Macy 	dsl_deadlist_t *dl = arg;
516eda14cbcSMatt Macy 	dsl_deadlist_insert(dl, bp, B_TRUE, tx);
517eda14cbcSMatt Macy 	return (0);
518eda14cbcSMatt Macy }
519eda14cbcSMatt Macy 
520eda14cbcSMatt Macy /*
521eda14cbcSMatt Macy  * Insert new key in deadlist, which must be > all current entries.
522eda14cbcSMatt Macy  * mintxg is not inclusive.
523eda14cbcSMatt Macy  */
524eda14cbcSMatt Macy void
525eda14cbcSMatt Macy dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
526eda14cbcSMatt Macy {
527eda14cbcSMatt Macy 	uint64_t obj;
528eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
529eda14cbcSMatt Macy 
530eda14cbcSMatt Macy 	if (dl->dl_oldfmt)
531eda14cbcSMatt Macy 		return;
532eda14cbcSMatt Macy 
533eda14cbcSMatt Macy 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
534eda14cbcSMatt Macy 	dle->dle_mintxg = mintxg;
535eda14cbcSMatt Macy 
536eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
537eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
538eda14cbcSMatt Macy 
539eda14cbcSMatt Macy 	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
540eda14cbcSMatt Macy 	VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
541eda14cbcSMatt Macy 	avl_add(&dl->dl_tree, dle);
542eda14cbcSMatt Macy 
543eda14cbcSMatt Macy 	VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
544eda14cbcSMatt Macy 	    mintxg, obj, tx));
545eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
546eda14cbcSMatt Macy }
547eda14cbcSMatt Macy 
548eda14cbcSMatt Macy /*
549eda14cbcSMatt Macy  * Remove this key, merging its entries into the previous key.
550eda14cbcSMatt Macy  */
551eda14cbcSMatt Macy void
552eda14cbcSMatt Macy dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
553eda14cbcSMatt Macy {
554eda14cbcSMatt Macy 	dsl_deadlist_entry_t dle_tofind;
555eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle, *dle_prev;
556eda14cbcSMatt Macy 
557eda14cbcSMatt Macy 	if (dl->dl_oldfmt)
558eda14cbcSMatt Macy 		return;
559eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
560eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
561eda14cbcSMatt Macy 
562eda14cbcSMatt Macy 	dle_tofind.dle_mintxg = mintxg;
563eda14cbcSMatt Macy 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
564eda14cbcSMatt Macy 	ASSERT3P(dle, !=, NULL);
565eda14cbcSMatt Macy 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
566dbd5678dSMartin Matuska 	ASSERT3P(dle_prev, !=, NULL);
567eda14cbcSMatt Macy 
568eda14cbcSMatt Macy 	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
569eda14cbcSMatt Macy 
570eda14cbcSMatt Macy 	avl_remove(&dl->dl_tree, dle);
571eda14cbcSMatt Macy 	bpobj_close(&dle->dle_bpobj);
572eda14cbcSMatt Macy 	kmem_free(dle, sizeof (*dle));
573eda14cbcSMatt Macy 
574eda14cbcSMatt Macy 	VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
575eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
576eda14cbcSMatt Macy }
577eda14cbcSMatt Macy 
578eda14cbcSMatt Macy /*
579eda14cbcSMatt Macy  * Remove a deadlist entry and all of its contents by removing the entry from
580eda14cbcSMatt Macy  * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
581eda14cbcSMatt Macy  * deadlist's space accounting accordingly.
582eda14cbcSMatt Macy  */
583eda14cbcSMatt Macy void
584eda14cbcSMatt Macy dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
585eda14cbcSMatt Macy {
586eda14cbcSMatt Macy 	uint64_t used, comp, uncomp;
587eda14cbcSMatt Macy 	dsl_deadlist_entry_t dle_tofind;
588eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
589eda14cbcSMatt Macy 	objset_t *os = dl->dl_os;
590eda14cbcSMatt Macy 
591eda14cbcSMatt Macy 	if (dl->dl_oldfmt)
592eda14cbcSMatt Macy 		return;
593eda14cbcSMatt Macy 
594eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
595eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
596eda14cbcSMatt Macy 
597eda14cbcSMatt Macy 	dle_tofind.dle_mintxg = mintxg;
598eda14cbcSMatt Macy 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
599eda14cbcSMatt Macy 	VERIFY3P(dle, !=, NULL);
600eda14cbcSMatt Macy 
601eda14cbcSMatt Macy 	avl_remove(&dl->dl_tree, dle);
602eda14cbcSMatt Macy 	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
603eda14cbcSMatt Macy 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
604eda14cbcSMatt Macy 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
605eda14cbcSMatt Macy 	dl->dl_phys->dl_used -= used;
606eda14cbcSMatt Macy 	dl->dl_phys->dl_comp -= comp;
607eda14cbcSMatt Macy 	dl->dl_phys->dl_uncomp -= uncomp;
608eda14cbcSMatt Macy 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
609eda14cbcSMatt Macy 		bpobj_decr_empty(os, tx);
610eda14cbcSMatt Macy 	} else {
611eda14cbcSMatt Macy 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
612eda14cbcSMatt Macy 	}
613eda14cbcSMatt Macy 	bpobj_close(&dle->dle_bpobj);
614eda14cbcSMatt Macy 	kmem_free(dle, sizeof (*dle));
615eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
616eda14cbcSMatt Macy }
617eda14cbcSMatt Macy 
618eda14cbcSMatt Macy /*
619eda14cbcSMatt Macy  * Clear out the contents of a deadlist_entry by freeing its bpobj,
620eda14cbcSMatt Macy  * replacing it with an empty bpobj and adjusting the deadlist's
621eda14cbcSMatt Macy  * space accounting
622eda14cbcSMatt Macy  */
623eda14cbcSMatt Macy void
624eda14cbcSMatt Macy dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
625eda14cbcSMatt Macy     dmu_tx_t *tx)
626eda14cbcSMatt Macy {
627eda14cbcSMatt Macy 	uint64_t new_obj, used, comp, uncomp;
628eda14cbcSMatt Macy 	objset_t *os = dl->dl_os;
629eda14cbcSMatt Macy 
630eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
631eda14cbcSMatt Macy 	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
632eda14cbcSMatt Macy 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
633eda14cbcSMatt Macy 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
634eda14cbcSMatt Macy 	dl->dl_phys->dl_used -= used;
635eda14cbcSMatt Macy 	dl->dl_phys->dl_comp -= comp;
636eda14cbcSMatt Macy 	dl->dl_phys->dl_uncomp -= uncomp;
637eda14cbcSMatt Macy 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
638eda14cbcSMatt Macy 		bpobj_decr_empty(os, tx);
639eda14cbcSMatt Macy 	else
640eda14cbcSMatt Macy 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
641eda14cbcSMatt Macy 	bpobj_close(&dle->dle_bpobj);
642eda14cbcSMatt Macy 	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
643eda14cbcSMatt Macy 	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
644eda14cbcSMatt Macy 	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
645eda14cbcSMatt Macy 	    new_obj, tx));
646eda14cbcSMatt Macy 	ASSERT(bpobj_is_empty(&dle->dle_bpobj));
647eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
648eda14cbcSMatt Macy }
649eda14cbcSMatt Macy 
650eda14cbcSMatt Macy /*
651eda14cbcSMatt Macy  * Return the first entry in deadlist's avl tree
652eda14cbcSMatt Macy  */
653eda14cbcSMatt Macy dsl_deadlist_entry_t *
654eda14cbcSMatt Macy dsl_deadlist_first(dsl_deadlist_t *dl)
655eda14cbcSMatt Macy {
656eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
657eda14cbcSMatt Macy 
658eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
659eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
660eda14cbcSMatt Macy 	dle = avl_first(&dl->dl_tree);
661eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
662eda14cbcSMatt Macy 
663eda14cbcSMatt Macy 	return (dle);
664eda14cbcSMatt Macy }
665eda14cbcSMatt Macy 
666eda14cbcSMatt Macy /*
667eda14cbcSMatt Macy  * Return the last entry in deadlist's avl tree
668eda14cbcSMatt Macy  */
669eda14cbcSMatt Macy dsl_deadlist_entry_t *
670eda14cbcSMatt Macy dsl_deadlist_last(dsl_deadlist_t *dl)
671eda14cbcSMatt Macy {
672eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
673eda14cbcSMatt Macy 
674eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
675eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
676eda14cbcSMatt Macy 	dle = avl_last(&dl->dl_tree);
677eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
678eda14cbcSMatt Macy 
679eda14cbcSMatt Macy 	return (dle);
680eda14cbcSMatt Macy }
681eda14cbcSMatt Macy 
682eda14cbcSMatt Macy /*
683eda14cbcSMatt Macy  * Walk ds's snapshots to regenerate generate ZAP & AVL.
684eda14cbcSMatt Macy  */
685eda14cbcSMatt Macy static void
686eda14cbcSMatt Macy dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
687eda14cbcSMatt Macy     uint64_t mrs_obj, dmu_tx_t *tx)
688eda14cbcSMatt Macy {
689eda14cbcSMatt Macy 	dsl_deadlist_t dl = { 0 };
690eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_objset_pool(os);
691eda14cbcSMatt Macy 
692*17aab35aSMartin Matuska 	VERIFY0(dsl_deadlist_open(&dl, os, dlobj));
693eda14cbcSMatt Macy 	if (dl.dl_oldfmt) {
694eda14cbcSMatt Macy 		dsl_deadlist_close(&dl);
695eda14cbcSMatt Macy 		return;
696eda14cbcSMatt Macy 	}
697eda14cbcSMatt Macy 
698eda14cbcSMatt Macy 	while (mrs_obj != 0) {
699eda14cbcSMatt Macy 		dsl_dataset_t *ds;
700eda14cbcSMatt Macy 		VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
701eda14cbcSMatt Macy 		dsl_deadlist_add_key(&dl,
702eda14cbcSMatt Macy 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
703eda14cbcSMatt Macy 		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
704eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
705eda14cbcSMatt Macy 	}
706eda14cbcSMatt Macy 	dsl_deadlist_close(&dl);
707eda14cbcSMatt Macy }
708eda14cbcSMatt Macy 
709eda14cbcSMatt Macy uint64_t
710eda14cbcSMatt Macy dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
711eda14cbcSMatt Macy     uint64_t mrs_obj, dmu_tx_t *tx)
712eda14cbcSMatt Macy {
713eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
714eda14cbcSMatt Macy 	uint64_t newobj;
715eda14cbcSMatt Macy 
716eda14cbcSMatt Macy 	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
717eda14cbcSMatt Macy 
718eda14cbcSMatt Macy 	if (dl->dl_oldfmt) {
719eda14cbcSMatt Macy 		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
720eda14cbcSMatt Macy 		return (newobj);
721eda14cbcSMatt Macy 	}
722eda14cbcSMatt Macy 
723eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
724eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
725eda14cbcSMatt Macy 
726eda14cbcSMatt Macy 	for (dle = avl_first(&dl->dl_tree); dle;
727eda14cbcSMatt Macy 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
728eda14cbcSMatt Macy 		uint64_t obj;
729eda14cbcSMatt Macy 
730eda14cbcSMatt Macy 		if (dle->dle_mintxg >= maxtxg)
731eda14cbcSMatt Macy 			break;
732eda14cbcSMatt Macy 
733eda14cbcSMatt Macy 		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
734eda14cbcSMatt Macy 		VERIFY0(zap_add_int_key(dl->dl_os, newobj,
735eda14cbcSMatt Macy 		    dle->dle_mintxg, obj, tx));
736eda14cbcSMatt Macy 	}
737eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
738eda14cbcSMatt Macy 	return (newobj);
739eda14cbcSMatt Macy }
740eda14cbcSMatt Macy 
741eda14cbcSMatt Macy void
742eda14cbcSMatt Macy dsl_deadlist_space(dsl_deadlist_t *dl,
743eda14cbcSMatt Macy     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
744eda14cbcSMatt Macy {
745eda14cbcSMatt Macy 	ASSERT(dsl_deadlist_is_open(dl));
746eda14cbcSMatt Macy 	if (dl->dl_oldfmt) {
747eda14cbcSMatt Macy 		VERIFY0(bpobj_space(&dl->dl_bpobj,
748eda14cbcSMatt Macy 		    usedp, compp, uncompp));
749eda14cbcSMatt Macy 		return;
750eda14cbcSMatt Macy 	}
751eda14cbcSMatt Macy 
752eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
753eda14cbcSMatt Macy 	*usedp = dl->dl_phys->dl_used;
754eda14cbcSMatt Macy 	*compp = dl->dl_phys->dl_comp;
755eda14cbcSMatt Macy 	*uncompp = dl->dl_phys->dl_uncomp;
756eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
757eda14cbcSMatt Macy }
758eda14cbcSMatt Macy 
759eda14cbcSMatt Macy /*
760eda14cbcSMatt Macy  * return space used in the range (mintxg, maxtxg].
761eda14cbcSMatt Macy  * Includes maxtxg, does not include mintxg.
762eda14cbcSMatt Macy  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
763eda14cbcSMatt Macy  * UINT64_MAX).
764eda14cbcSMatt Macy  */
765eda14cbcSMatt Macy void
766eda14cbcSMatt Macy dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
767eda14cbcSMatt Macy     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
768eda14cbcSMatt Macy {
769eda14cbcSMatt Macy 	dsl_deadlist_cache_entry_t *dlce;
770eda14cbcSMatt Macy 	dsl_deadlist_cache_entry_t dlce_tofind;
771eda14cbcSMatt Macy 	avl_index_t where;
772eda14cbcSMatt Macy 
773eda14cbcSMatt Macy 	if (dl->dl_oldfmt) {
774eda14cbcSMatt Macy 		VERIFY0(bpobj_space_range(&dl->dl_bpobj,
775eda14cbcSMatt Macy 		    mintxg, maxtxg, usedp, compp, uncompp));
776eda14cbcSMatt Macy 		return;
777eda14cbcSMatt Macy 	}
778eda14cbcSMatt Macy 
779eda14cbcSMatt Macy 	*usedp = *compp = *uncompp = 0;
780eda14cbcSMatt Macy 
781eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
782eda14cbcSMatt Macy 	dsl_deadlist_load_cache(dl);
783eda14cbcSMatt Macy 	dlce_tofind.dlce_mintxg = mintxg;
784eda14cbcSMatt Macy 	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
785eda14cbcSMatt Macy 
786eda14cbcSMatt Macy 	/*
787eda14cbcSMatt Macy 	 * If this mintxg doesn't exist, it may be an empty_bpobj which
788eda14cbcSMatt Macy 	 * is omitted from the sparse tree.  Start at the next non-empty
789eda14cbcSMatt Macy 	 * entry.
790eda14cbcSMatt Macy 	 */
791eda14cbcSMatt Macy 	if (dlce == NULL)
792eda14cbcSMatt Macy 		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
793eda14cbcSMatt Macy 
794eda14cbcSMatt Macy 	for (; dlce && dlce->dlce_mintxg < maxtxg;
795eda14cbcSMatt Macy 	    dlce = AVL_NEXT(&dl->dl_tree, dlce)) {
796eda14cbcSMatt Macy 		*usedp += dlce->dlce_bytes;
797eda14cbcSMatt Macy 		*compp += dlce->dlce_comp;
798eda14cbcSMatt Macy 		*uncompp += dlce->dlce_uncomp;
799eda14cbcSMatt Macy 	}
800eda14cbcSMatt Macy 
801eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
802eda14cbcSMatt Macy }
803eda14cbcSMatt Macy 
804eda14cbcSMatt Macy static void
805eda14cbcSMatt Macy dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
806eda14cbcSMatt Macy     dmu_tx_t *tx)
807eda14cbcSMatt Macy {
808eda14cbcSMatt Macy 	dsl_deadlist_entry_t dle_tofind;
809eda14cbcSMatt Macy 	dsl_deadlist_entry_t *dle;
810eda14cbcSMatt Macy 	avl_index_t where;
811eda14cbcSMatt Macy 	uint64_t used, comp, uncomp;
812eda14cbcSMatt Macy 	bpobj_t bpo;
813eda14cbcSMatt Macy 
814eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dl->dl_lock));
815eda14cbcSMatt Macy 
816eda14cbcSMatt Macy 	VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
817eda14cbcSMatt Macy 	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
818eda14cbcSMatt Macy 	bpobj_close(&bpo);
819eda14cbcSMatt Macy 
820eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
821eda14cbcSMatt Macy 
822eda14cbcSMatt Macy 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
823eda14cbcSMatt Macy 	dl->dl_phys->dl_used += used;
824eda14cbcSMatt Macy 	dl->dl_phys->dl_comp += comp;
825eda14cbcSMatt Macy 	dl->dl_phys->dl_uncomp += uncomp;
826eda14cbcSMatt Macy 
827eda14cbcSMatt Macy 	dle_tofind.dle_mintxg = birth;
828eda14cbcSMatt Macy 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
829eda14cbcSMatt Macy 	if (dle == NULL)
830eda14cbcSMatt Macy 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
831eda14cbcSMatt Macy 	dle_enqueue_subobj(dl, dle, obj, tx);
832eda14cbcSMatt Macy }
833eda14cbcSMatt Macy 
834c9539b89SMartin Matuska /*
835c9539b89SMartin Matuska  * Prefetch metadata required for dsl_deadlist_insert_bpobj().
836c9539b89SMartin Matuska  */
837c9539b89SMartin Matuska static void
838c9539b89SMartin Matuska dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
839c9539b89SMartin Matuska {
840c9539b89SMartin Matuska 	dsl_deadlist_entry_t dle_tofind;
841c9539b89SMartin Matuska 	dsl_deadlist_entry_t *dle;
842c9539b89SMartin Matuska 	avl_index_t where;
843c9539b89SMartin Matuska 
844c9539b89SMartin Matuska 	ASSERT(MUTEX_HELD(&dl->dl_lock));
845c9539b89SMartin Matuska 
846c9539b89SMartin Matuska 	dsl_deadlist_load_tree(dl);
847c9539b89SMartin Matuska 
848c9539b89SMartin Matuska 	dle_tofind.dle_mintxg = birth;
849c9539b89SMartin Matuska 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
850c9539b89SMartin Matuska 	if (dle == NULL)
851c9539b89SMartin Matuska 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
852c9539b89SMartin Matuska 	dle_prefetch_subobj(dl, dle, obj);
853c9539b89SMartin Matuska }
854c9539b89SMartin Matuska 
855eda14cbcSMatt Macy static int
856eda14cbcSMatt Macy dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
857eda14cbcSMatt Macy     dmu_tx_t *tx)
858eda14cbcSMatt Macy {
859eda14cbcSMatt Macy 	dsl_deadlist_t *dl = arg;
860eda14cbcSMatt Macy 	dsl_deadlist_insert(dl, bp, bp_freed, tx);
861eda14cbcSMatt Macy 	return (0);
862eda14cbcSMatt Macy }
863eda14cbcSMatt Macy 
864eda14cbcSMatt Macy /*
865eda14cbcSMatt Macy  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
866eda14cbcSMatt Macy  * an empty deadlist.
867eda14cbcSMatt Macy  */
868eda14cbcSMatt Macy void
869eda14cbcSMatt Macy dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
870eda14cbcSMatt Macy {
871c9539b89SMartin Matuska 	zap_cursor_t zc, pzc;
8722a58b312SMartin Matuska 	zap_attribute_t *za, *pza;
873eda14cbcSMatt Macy 	dmu_buf_t *bonus;
874eda14cbcSMatt Macy 	dsl_deadlist_phys_t *dlp;
875eda14cbcSMatt Macy 	dmu_object_info_t doi;
876c9539b89SMartin Matuska 	int error, perror, i;
877eda14cbcSMatt Macy 
878eda14cbcSMatt Macy 	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
879eda14cbcSMatt Macy 	if (doi.doi_type == DMU_OT_BPOBJ) {
880eda14cbcSMatt Macy 		bpobj_t bpo;
881eda14cbcSMatt Macy 		VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
882eda14cbcSMatt Macy 		VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
883eda14cbcSMatt Macy 		bpobj_close(&bpo);
884eda14cbcSMatt Macy 		return;
885eda14cbcSMatt Macy 	}
886eda14cbcSMatt Macy 
8877a7741afSMartin Matuska 	za = zap_attribute_alloc();
8887a7741afSMartin Matuska 	pza = zap_attribute_alloc();
8892a58b312SMartin Matuska 
890eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
891c9539b89SMartin Matuska 	/*
892c9539b89SMartin Matuska 	 * Prefetch up to 128 deadlists first and then more as we progress.
893c9539b89SMartin Matuska 	 * The limit is a balance between ARC use and diminishing returns.
894c9539b89SMartin Matuska 	 */
895c9539b89SMartin Matuska 	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
8962a58b312SMartin Matuska 	    (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
897c9539b89SMartin Matuska 	    zap_cursor_advance(&pzc), i++) {
8982a58b312SMartin Matuska 		dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
8992a58b312SMartin Matuska 		    zfs_strtonum(pza->za_name, NULL));
900c9539b89SMartin Matuska 	}
901eda14cbcSMatt Macy 	for (zap_cursor_init(&zc, dl->dl_os, obj);
9022a58b312SMartin Matuska 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
903eda14cbcSMatt Macy 	    zap_cursor_advance(&zc)) {
904315ee00fSMartin Matuska 		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
905315ee00fSMartin Matuska 		    zfs_strtonum(za->za_name, NULL), tx);
906315ee00fSMartin Matuska 		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
907c9539b89SMartin Matuska 		if (perror == 0) {
9082a58b312SMartin Matuska 			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
9092a58b312SMartin Matuska 			    zfs_strtonum(pza->za_name, NULL));
910c9539b89SMartin Matuska 			zap_cursor_advance(&pzc);
9112a58b312SMartin Matuska 			perror = zap_cursor_retrieve(&pzc, pza);
912c9539b89SMartin Matuska 		}
913eda14cbcSMatt Macy 	}
914eda14cbcSMatt Macy 	VERIFY3U(error, ==, ENOENT);
915eda14cbcSMatt Macy 	zap_cursor_fini(&zc);
916c9539b89SMartin Matuska 	zap_cursor_fini(&pzc);
917eda14cbcSMatt Macy 
918eda14cbcSMatt Macy 	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
919eda14cbcSMatt Macy 	dlp = bonus->db_data;
920eda14cbcSMatt Macy 	dmu_buf_will_dirty(bonus, tx);
921da5137abSMartin Matuska 	memset(dlp, 0, sizeof (*dlp));
922eda14cbcSMatt Macy 	dmu_buf_rele(bonus, FTAG);
923eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
9242a58b312SMartin Matuska 
9257a7741afSMartin Matuska 	zap_attribute_free(za);
9267a7741afSMartin Matuska 	zap_attribute_free(pza);
927eda14cbcSMatt Macy }
928eda14cbcSMatt Macy 
929eda14cbcSMatt Macy /*
930eda14cbcSMatt Macy  * Remove entries on dl that are born > mintxg, and put them on the bpobj.
931eda14cbcSMatt Macy  */
932eda14cbcSMatt Macy void
933eda14cbcSMatt Macy dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
934eda14cbcSMatt Macy     dmu_tx_t *tx)
935eda14cbcSMatt Macy {
936eda14cbcSMatt Macy 	dsl_deadlist_entry_t dle_tofind;
937c9539b89SMartin Matuska 	dsl_deadlist_entry_t *dle, *pdle;
938eda14cbcSMatt Macy 	avl_index_t where;
939c9539b89SMartin Matuska 	int i;
940eda14cbcSMatt Macy 
941eda14cbcSMatt Macy 	ASSERT(!dl->dl_oldfmt);
942eda14cbcSMatt Macy 
943eda14cbcSMatt Macy 	mutex_enter(&dl->dl_lock);
944eda14cbcSMatt Macy 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
945eda14cbcSMatt Macy 	dsl_deadlist_load_tree(dl);
946eda14cbcSMatt Macy 
947eda14cbcSMatt Macy 	dle_tofind.dle_mintxg = mintxg;
948eda14cbcSMatt Macy 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
949eda14cbcSMatt Macy 	if (dle == NULL)
950eda14cbcSMatt Macy 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
951c9539b89SMartin Matuska 	/*
952c9539b89SMartin Matuska 	 * Prefetch up to 128 deadlists first and then more as we progress.
953c9539b89SMartin Matuska 	 * The limit is a balance between ARC use and diminishing returns.
954c9539b89SMartin Matuska 	 */
9552a58b312SMartin Matuska 	for (pdle = dle, i = 0; pdle && i < 128; i++) {
956c9539b89SMartin Matuska 		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
957c9539b89SMartin Matuska 		pdle = AVL_NEXT(&dl->dl_tree, pdle);
958c9539b89SMartin Matuska 	}
959eda14cbcSMatt Macy 	while (dle) {
960eda14cbcSMatt Macy 		uint64_t used, comp, uncomp;
961eda14cbcSMatt Macy 		dsl_deadlist_entry_t *dle_next;
962eda14cbcSMatt Macy 
963eda14cbcSMatt Macy 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
964c9539b89SMartin Matuska 		if (pdle) {
965c9539b89SMartin Matuska 			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
966c9539b89SMartin Matuska 			pdle = AVL_NEXT(&dl->dl_tree, pdle);
967c9539b89SMartin Matuska 		}
968eda14cbcSMatt Macy 
969eda14cbcSMatt Macy 		VERIFY0(bpobj_space(&dle->dle_bpobj,
970eda14cbcSMatt Macy 		    &used, &comp, &uncomp));
971eda14cbcSMatt Macy 		ASSERT3U(dl->dl_phys->dl_used, >=, used);
972eda14cbcSMatt Macy 		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
973eda14cbcSMatt Macy 		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
974eda14cbcSMatt Macy 		dl->dl_phys->dl_used -= used;
975eda14cbcSMatt Macy 		dl->dl_phys->dl_comp -= comp;
976eda14cbcSMatt Macy 		dl->dl_phys->dl_uncomp -= uncomp;
977eda14cbcSMatt Macy 
978eda14cbcSMatt Macy 		VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
979eda14cbcSMatt Macy 		    dle->dle_mintxg, tx));
980eda14cbcSMatt Macy 
981eda14cbcSMatt Macy 		dle_next = AVL_NEXT(&dl->dl_tree, dle);
982eda14cbcSMatt Macy 		avl_remove(&dl->dl_tree, dle);
983eda14cbcSMatt Macy 		bpobj_close(&dle->dle_bpobj);
984eda14cbcSMatt Macy 		kmem_free(dle, sizeof (*dle));
985eda14cbcSMatt Macy 		dle = dle_next;
986eda14cbcSMatt Macy 	}
987eda14cbcSMatt Macy 	mutex_exit(&dl->dl_lock);
988eda14cbcSMatt Macy }
989eda14cbcSMatt Macy 
990eda14cbcSMatt Macy typedef struct livelist_entry {
99116038816SMartin Matuska 	blkptr_t le_bp;
99216038816SMartin Matuska 	uint32_t le_refcnt;
993eda14cbcSMatt Macy 	avl_node_t le_node;
994eda14cbcSMatt Macy } livelist_entry_t;
995eda14cbcSMatt Macy 
996eda14cbcSMatt Macy static int
997eda14cbcSMatt Macy livelist_compare(const void *larg, const void *rarg)
998eda14cbcSMatt Macy {
99916038816SMartin Matuska 	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
100016038816SMartin Matuska 	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;
1001eda14cbcSMatt Macy 
1002eda14cbcSMatt Macy 	/* Sort them according to dva[0] */
1003eda14cbcSMatt Macy 	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
1004eda14cbcSMatt Macy 	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
1005eda14cbcSMatt Macy 
1006eda14cbcSMatt Macy 	if (l_dva0_vdev != r_dva0_vdev)
1007eda14cbcSMatt Macy 		return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
1008eda14cbcSMatt Macy 
1009eda14cbcSMatt Macy 	/* if vdevs are equal, sort by offsets. */
1010eda14cbcSMatt Macy 	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
1011eda14cbcSMatt Macy 	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
1012eda14cbcSMatt Macy 	return (TREE_CMP(l_dva0_offset, r_dva0_offset));
1013eda14cbcSMatt Macy }
1014eda14cbcSMatt Macy 
1015eda14cbcSMatt Macy struct livelist_iter_arg {
1016eda14cbcSMatt Macy 	avl_tree_t *avl;
1017eda14cbcSMatt Macy 	bplist_t *to_free;
1018eda14cbcSMatt Macy 	zthr_t *t;
1019eda14cbcSMatt Macy };
1020eda14cbcSMatt Macy 
1021eda14cbcSMatt Macy /*
1022eda14cbcSMatt Macy  * Expects an AVL tree which is incrementally filled will FREE blkptrs
1023eda14cbcSMatt Macy  * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
1024eda14cbcSMatt Macy  * corresponding FREE are stored in the supplied bplist.
102516038816SMartin Matuska  *
1026f552d7adSMartin Matuska  * Note that multiple FREE and ALLOC entries for the same blkptr may be
1027f552d7adSMartin Matuska  * encountered when dedup or block cloning is involved.  For this reason we
1028f552d7adSMartin Matuska  * keep a refcount for all the FREE entries of each blkptr and ensure that
102916038816SMartin Matuska  * each of those FREE entries has a corresponding ALLOC preceding it.
1030eda14cbcSMatt Macy  */
1031eda14cbcSMatt Macy static int
1032eda14cbcSMatt Macy dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
1033eda14cbcSMatt Macy     dmu_tx_t *tx)
1034eda14cbcSMatt Macy {
1035eda14cbcSMatt Macy 	struct livelist_iter_arg *lia = arg;
1036eda14cbcSMatt Macy 	avl_tree_t *avl = lia->avl;
1037eda14cbcSMatt Macy 	bplist_t *to_free = lia->to_free;
1038eda14cbcSMatt Macy 	zthr_t *t = lia->t;
1039eda14cbcSMatt Macy 	ASSERT(tx == NULL);
1040eda14cbcSMatt Macy 
1041eda14cbcSMatt Macy 	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
1042eda14cbcSMatt Macy 		return (SET_ERROR(EINTR));
104316038816SMartin Matuska 
1044eda14cbcSMatt Macy 	livelist_entry_t node;
104516038816SMartin Matuska 	node.le_bp = *bp;
1046eda14cbcSMatt Macy 	livelist_entry_t *found = avl_find(avl, &node, NULL);
1047f552d7adSMartin Matuska 	if (found) {
1048f552d7adSMartin Matuska 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
1049f552d7adSMartin Matuska 		ASSERT3U(BP_GET_CHECKSUM(bp), ==,
1050f552d7adSMartin Matuska 		    BP_GET_CHECKSUM(&found->le_bp));
1051783d3ff6SMartin Matuska 		ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
1052f552d7adSMartin Matuska 	}
105316038816SMartin Matuska 	if (bp_freed) {
105416038816SMartin Matuska 		if (found == NULL) {
105516038816SMartin Matuska 			/* first free entry for this blkptr */
105616038816SMartin Matuska 			livelist_entry_t *e =
105716038816SMartin Matuska 			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
105816038816SMartin Matuska 			e->le_bp = *bp;
105916038816SMartin Matuska 			e->le_refcnt = 1;
106016038816SMartin Matuska 			avl_add(avl, e);
106116038816SMartin Matuska 		} else {
1062f552d7adSMartin Matuska 			/*
1063f552d7adSMartin Matuska 			 * Deduped or cloned block free.  We could assert D bit
1064f552d7adSMartin Matuska 			 * for dedup, but there is no such one for cloning.
1065f552d7adSMartin Matuska 			 */
106616038816SMartin Matuska 			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
106716038816SMartin Matuska 			found->le_refcnt++;
106816038816SMartin Matuska 		}
106916038816SMartin Matuska 	} else {
107016038816SMartin Matuska 		if (found == NULL) {
107116038816SMartin Matuska 			/* block is currently marked as allocated */
107216038816SMartin Matuska 			bplist_append(to_free, bp);
107316038816SMartin Matuska 		} else {
107416038816SMartin Matuska 			/* alloc matches a free entry */
107516038816SMartin Matuska 			ASSERT3U(found->le_refcnt, !=, 0);
107616038816SMartin Matuska 			found->le_refcnt--;
107716038816SMartin Matuska 			if (found->le_refcnt == 0) {
107816038816SMartin Matuska 				/* all tracked free pairs have been matched */
1079eda14cbcSMatt Macy 				avl_remove(avl, found);
1080eda14cbcSMatt Macy 				kmem_free(found, sizeof (livelist_entry_t));
108116038816SMartin Matuska 			}
1082eda14cbcSMatt Macy 		}
1083eda14cbcSMatt Macy 	}
1084eda14cbcSMatt Macy 	return (0);
1085eda14cbcSMatt Macy }
1086eda14cbcSMatt Macy 
1087eda14cbcSMatt Macy /*
1088eda14cbcSMatt Macy  * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
1089eda14cbcSMatt Macy  * which have an ALLOC entry but no matching FREE
1090eda14cbcSMatt Macy  */
1091eda14cbcSMatt Macy int
1092eda14cbcSMatt Macy dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
1093eda14cbcSMatt Macy     uint64_t *size)
1094eda14cbcSMatt Macy {
1095eda14cbcSMatt Macy 	avl_tree_t avl;
1096eda14cbcSMatt Macy 	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
1097eda14cbcSMatt Macy 	    offsetof(livelist_entry_t, le_node));
1098eda14cbcSMatt Macy 
1099eda14cbcSMatt Macy 	/* process the sublist */
1100eda14cbcSMatt Macy 	struct livelist_iter_arg arg = {
1101eda14cbcSMatt Macy 	    .avl = &avl,
1102eda14cbcSMatt Macy 	    .to_free = to_free,
1103eda14cbcSMatt Macy 	    .t = t
1104eda14cbcSMatt Macy 	};
1105eda14cbcSMatt Macy 	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
1106be181ee2SMartin Matuska 	VERIFY(err != 0 || avl_numnodes(&avl) == 0);
1107eda14cbcSMatt Macy 
1108be181ee2SMartin Matuska 	void *cookie = NULL;
1109be181ee2SMartin Matuska 	livelist_entry_t *le = NULL;
1110be181ee2SMartin Matuska 	while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
1111be181ee2SMartin Matuska 		kmem_free(le, sizeof (livelist_entry_t));
1112be181ee2SMartin Matuska 	}
1113eda14cbcSMatt Macy 	avl_destroy(&avl);
1114eda14cbcSMatt Macy 	return (err);
1115eda14cbcSMatt Macy }
1116eda14cbcSMatt Macy 
1117dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
1118eda14cbcSMatt Macy 	"Size to start the next sub-livelist in a livelist");
1119eda14cbcSMatt Macy 
1120eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
1121eda14cbcSMatt Macy 	"Threshold at which livelist is disabled");
1122