1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy * CDDL HEADER START
3eda14cbcSMatt Macy *
4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy *
8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy * See the License for the specific language governing permissions
11eda14cbcSMatt Macy * and limitations under the License.
12eda14cbcSMatt Macy *
13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy *
19eda14cbcSMatt Macy * CDDL HEADER END
20eda14cbcSMatt Macy */
21eda14cbcSMatt Macy
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24eda14cbcSMatt Macy */
25eda14cbcSMatt Macy
26eda14cbcSMatt Macy #include <sys/arc.h>
27eda14cbcSMatt Macy #include <sys/bptree.h>
28eda14cbcSMatt Macy #include <sys/dmu.h>
29eda14cbcSMatt Macy #include <sys/dmu_objset.h>
30eda14cbcSMatt Macy #include <sys/dmu_tx.h>
31eda14cbcSMatt Macy #include <sys/dmu_traverse.h>
32eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
33eda14cbcSMatt Macy #include <sys/dsl_dir.h>
34eda14cbcSMatt Macy #include <sys/dsl_pool.h>
35eda14cbcSMatt Macy #include <sys/dnode.h>
36eda14cbcSMatt Macy #include <sys/spa.h>
37eda14cbcSMatt Macy
38eda14cbcSMatt Macy /*
39eda14cbcSMatt Macy * A bptree is a queue of root block pointers from destroyed datasets. When a
40eda14cbcSMatt Macy * dataset is destroyed its root block pointer is put on the end of the pool's
41eda14cbcSMatt Macy * bptree queue so the dataset's blocks can be freed asynchronously by
42eda14cbcSMatt Macy * dsl_scan_sync. This allows the delete operation to finish without traversing
43eda14cbcSMatt Macy * all the dataset's blocks.
44eda14cbcSMatt Macy *
45eda14cbcSMatt Macy * Note that while bt_begin and bt_end are only ever incremented in this code,
46eda14cbcSMatt Macy * they are effectively reset to 0 every time the entire bptree is freed because
47eda14cbcSMatt Macy * the bptree's object is destroyed and re-created.
48eda14cbcSMatt Macy */
49eda14cbcSMatt Macy
50eda14cbcSMatt Macy struct bptree_args {
51eda14cbcSMatt Macy bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
52eda14cbcSMatt Macy boolean_t ba_free; /* true if freeing during traversal */
53eda14cbcSMatt Macy
54eda14cbcSMatt Macy bptree_itor_t *ba_func; /* function to call for each blockpointer */
55eda14cbcSMatt Macy void *ba_arg; /* caller supplied argument to ba_func */
56eda14cbcSMatt Macy dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
57eda14cbcSMatt Macy } bptree_args_t;
58eda14cbcSMatt Macy
59eda14cbcSMatt Macy uint64_t
bptree_alloc(objset_t * os,dmu_tx_t * tx)60eda14cbcSMatt Macy bptree_alloc(objset_t *os, dmu_tx_t *tx)
61eda14cbcSMatt Macy {
62eda14cbcSMatt Macy uint64_t obj;
63eda14cbcSMatt Macy dmu_buf_t *db;
64eda14cbcSMatt Macy bptree_phys_t *bt;
65eda14cbcSMatt Macy
66eda14cbcSMatt Macy obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
67eda14cbcSMatt Macy SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
68eda14cbcSMatt Macy sizeof (bptree_phys_t), tx);
69eda14cbcSMatt Macy
70eda14cbcSMatt Macy /*
71eda14cbcSMatt Macy * Bonus buffer contents are already initialized to 0, but for
72eda14cbcSMatt Macy * readability we make it explicit.
73eda14cbcSMatt Macy */
74eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
75eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
76eda14cbcSMatt Macy bt = db->db_data;
77eda14cbcSMatt Macy bt->bt_begin = 0;
78eda14cbcSMatt Macy bt->bt_end = 0;
79eda14cbcSMatt Macy bt->bt_bytes = 0;
80eda14cbcSMatt Macy bt->bt_comp = 0;
81eda14cbcSMatt Macy bt->bt_uncomp = 0;
82eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
83eda14cbcSMatt Macy
84eda14cbcSMatt Macy return (obj);
85eda14cbcSMatt Macy }
86eda14cbcSMatt Macy
87eda14cbcSMatt Macy int
bptree_free(objset_t * os,uint64_t obj,dmu_tx_t * tx)88eda14cbcSMatt Macy bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
89eda14cbcSMatt Macy {
90eda14cbcSMatt Macy dmu_buf_t *db;
91eda14cbcSMatt Macy bptree_phys_t *bt;
92eda14cbcSMatt Macy
93eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
94eda14cbcSMatt Macy bt = db->db_data;
95eda14cbcSMatt Macy ASSERT3U(bt->bt_begin, ==, bt->bt_end);
96eda14cbcSMatt Macy ASSERT0(bt->bt_bytes);
97eda14cbcSMatt Macy ASSERT0(bt->bt_comp);
98eda14cbcSMatt Macy ASSERT0(bt->bt_uncomp);
99eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
100eda14cbcSMatt Macy
101eda14cbcSMatt Macy return (dmu_object_free(os, obj, tx));
102eda14cbcSMatt Macy }
103eda14cbcSMatt Macy
104eda14cbcSMatt Macy boolean_t
bptree_is_empty(objset_t * os,uint64_t obj)105eda14cbcSMatt Macy bptree_is_empty(objset_t *os, uint64_t obj)
106eda14cbcSMatt Macy {
107eda14cbcSMatt Macy dmu_buf_t *db;
108eda14cbcSMatt Macy bptree_phys_t *bt;
109eda14cbcSMatt Macy boolean_t rv;
110eda14cbcSMatt Macy
111eda14cbcSMatt Macy VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
112eda14cbcSMatt Macy bt = db->db_data;
113eda14cbcSMatt Macy rv = (bt->bt_begin == bt->bt_end);
114eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
115eda14cbcSMatt Macy return (rv);
116eda14cbcSMatt Macy }
117eda14cbcSMatt Macy
118eda14cbcSMatt Macy void
bptree_add(objset_t * os,uint64_t obj,blkptr_t * bp,uint64_t birth_txg,uint64_t bytes,uint64_t comp,uint64_t uncomp,dmu_tx_t * tx)119eda14cbcSMatt Macy bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
120eda14cbcSMatt Macy uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
121eda14cbcSMatt Macy {
122eda14cbcSMatt Macy dmu_buf_t *db;
123eda14cbcSMatt Macy bptree_phys_t *bt;
124eda14cbcSMatt Macy bptree_entry_phys_t *bte;
125eda14cbcSMatt Macy
126eda14cbcSMatt Macy /*
127eda14cbcSMatt Macy * bptree objects are in the pool mos, therefore they can only be
128eda14cbcSMatt Macy * modified in syncing context. Furthermore, this is only modified
129eda14cbcSMatt Macy * by the sync thread, so no locking is necessary.
130eda14cbcSMatt Macy */
131eda14cbcSMatt Macy ASSERT(dmu_tx_is_syncing(tx));
132eda14cbcSMatt Macy
133eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
134eda14cbcSMatt Macy bt = db->db_data;
135eda14cbcSMatt Macy
136eda14cbcSMatt Macy bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
137eda14cbcSMatt Macy bte->be_birth_txg = birth_txg;
138eda14cbcSMatt Macy bte->be_bp = *bp;
139eda14cbcSMatt Macy dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
140eda14cbcSMatt Macy kmem_free(bte, sizeof (*bte));
141eda14cbcSMatt Macy
142eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
143eda14cbcSMatt Macy bt->bt_end++;
144eda14cbcSMatt Macy bt->bt_bytes += bytes;
145eda14cbcSMatt Macy bt->bt_comp += comp;
146eda14cbcSMatt Macy bt->bt_uncomp += uncomp;
147eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
148eda14cbcSMatt Macy }
149eda14cbcSMatt Macy
150eda14cbcSMatt Macy static int
bptree_visit_cb(spa_t * spa,zilog_t * zilog,const blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp,void * arg)151eda14cbcSMatt Macy bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
152eda14cbcSMatt Macy const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
153eda14cbcSMatt Macy {
154e92ffd9bSMartin Matuska (void) zilog, (void) dnp;
155eda14cbcSMatt Macy int err;
156eda14cbcSMatt Macy struct bptree_args *ba = arg;
157eda14cbcSMatt Macy
158eda14cbcSMatt Macy if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
159eda14cbcSMatt Macy BP_IS_REDACTED(bp))
160eda14cbcSMatt Macy return (0);
161eda14cbcSMatt Macy
162eda14cbcSMatt Macy err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
163eda14cbcSMatt Macy if (err == 0 && ba->ba_free) {
164eda14cbcSMatt Macy ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
165eda14cbcSMatt Macy ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
166eda14cbcSMatt Macy ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
167eda14cbcSMatt Macy }
168eda14cbcSMatt Macy return (err);
169eda14cbcSMatt Macy }
170eda14cbcSMatt Macy
171eda14cbcSMatt Macy /*
172eda14cbcSMatt Macy * If "free" is set:
173eda14cbcSMatt Macy * - It is assumed that "func" will be freeing the block pointers.
174eda14cbcSMatt Macy * - If "func" returns nonzero, the bookmark will be remembered and
175eda14cbcSMatt Macy * iteration will be restarted from this point on next invocation.
176eda14cbcSMatt Macy * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
177eda14cbcSMatt Macy * bptree_iterate will remember the bookmark, continue traversing
178eda14cbcSMatt Macy * any additional entries, and return 0.
179eda14cbcSMatt Macy *
180eda14cbcSMatt Macy * If "free" is not set, traversal will stop and return an error if
181eda14cbcSMatt Macy * an i/o error is encountered.
182eda14cbcSMatt Macy *
183eda14cbcSMatt Macy * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
184eda14cbcSMatt Macy * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
185eda14cbcSMatt Macy * traverse_dataset_destroyed()).
186eda14cbcSMatt Macy */
187eda14cbcSMatt Macy int
bptree_iterate(objset_t * os,uint64_t obj,boolean_t free,bptree_itor_t func,void * arg,dmu_tx_t * tx)188eda14cbcSMatt Macy bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
189eda14cbcSMatt Macy void *arg, dmu_tx_t *tx)
190eda14cbcSMatt Macy {
191eda14cbcSMatt Macy boolean_t ioerr = B_FALSE;
192eda14cbcSMatt Macy int err;
193eda14cbcSMatt Macy uint64_t i;
194eda14cbcSMatt Macy dmu_buf_t *db;
195eda14cbcSMatt Macy struct bptree_args ba;
196eda14cbcSMatt Macy
197eda14cbcSMatt Macy ASSERT(!free || dmu_tx_is_syncing(tx));
198eda14cbcSMatt Macy
199eda14cbcSMatt Macy err = dmu_bonus_hold(os, obj, FTAG, &db);
200eda14cbcSMatt Macy if (err != 0)
201eda14cbcSMatt Macy return (err);
202eda14cbcSMatt Macy
203eda14cbcSMatt Macy if (free)
204eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
205eda14cbcSMatt Macy
206eda14cbcSMatt Macy ba.ba_phys = db->db_data;
207eda14cbcSMatt Macy ba.ba_free = free;
208eda14cbcSMatt Macy ba.ba_func = func;
209eda14cbcSMatt Macy ba.ba_arg = arg;
210eda14cbcSMatt Macy ba.ba_tx = tx;
211eda14cbcSMatt Macy
212eda14cbcSMatt Macy err = 0;
213eda14cbcSMatt Macy for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
214eda14cbcSMatt Macy bptree_entry_phys_t bte;
215eda14cbcSMatt Macy int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
216eda14cbcSMatt Macy TRAVERSE_NO_DECRYPT;
217eda14cbcSMatt Macy
218eda14cbcSMatt Macy err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
219eda14cbcSMatt Macy &bte, DMU_READ_NO_PREFETCH);
220eda14cbcSMatt Macy if (err != 0)
221eda14cbcSMatt Macy break;
222eda14cbcSMatt Macy
223eda14cbcSMatt Macy if (zfs_free_leak_on_eio)
224eda14cbcSMatt Macy flags |= TRAVERSE_HARD;
225eda14cbcSMatt Macy zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
226eda14cbcSMatt Macy "bookmark %lld/%lld/%lld/%lld",
227eda14cbcSMatt Macy (longlong_t)i,
228eda14cbcSMatt Macy (longlong_t)bte.be_birth_txg,
229eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_objset,
230eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_object,
231eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_level,
232eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_blkid);
233eda14cbcSMatt Macy err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
234eda14cbcSMatt Macy bte.be_birth_txg, &bte.be_zb, flags,
235eda14cbcSMatt Macy bptree_visit_cb, &ba);
236eda14cbcSMatt Macy if (free) {
237eda14cbcSMatt Macy /*
238eda14cbcSMatt Macy * The callback has freed the visited block pointers.
239eda14cbcSMatt Macy * Record our traversal progress on disk, either by
240eda14cbcSMatt Macy * updating this record's bookmark, or by logically
241eda14cbcSMatt Macy * removing this record by advancing bt_begin.
242eda14cbcSMatt Macy */
243eda14cbcSMatt Macy if (err != 0) {
244eda14cbcSMatt Macy /* save bookmark for future resume */
245eda14cbcSMatt Macy ASSERT3U(bte.be_zb.zb_objset, ==,
246eda14cbcSMatt Macy ZB_DESTROYED_OBJSET);
247eda14cbcSMatt Macy ASSERT0(bte.be_zb.zb_level);
248eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte),
249eda14cbcSMatt Macy sizeof (bte), &bte, tx);
250eda14cbcSMatt Macy if (err == EIO || err == ECKSUM ||
251eda14cbcSMatt Macy err == ENXIO) {
252eda14cbcSMatt Macy /*
253eda14cbcSMatt Macy * Skip the rest of this tree and
254eda14cbcSMatt Macy * continue on to the next entry.
255eda14cbcSMatt Macy */
256eda14cbcSMatt Macy err = 0;
257eda14cbcSMatt Macy ioerr = B_TRUE;
258eda14cbcSMatt Macy } else {
259eda14cbcSMatt Macy break;
260eda14cbcSMatt Macy }
261eda14cbcSMatt Macy } else if (ioerr) {
262eda14cbcSMatt Macy /*
263eda14cbcSMatt Macy * This entry is finished, but there were
264eda14cbcSMatt Macy * i/o errors on previous entries, so we
265eda14cbcSMatt Macy * can't adjust bt_begin. Set this entry's
266eda14cbcSMatt Macy * be_birth_txg such that it will be
267eda14cbcSMatt Macy * treated as a no-op in future traversals.
268eda14cbcSMatt Macy */
269eda14cbcSMatt Macy bte.be_birth_txg = UINT64_MAX;
270eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte),
271eda14cbcSMatt Macy sizeof (bte), &bte, tx);
272eda14cbcSMatt Macy }
273eda14cbcSMatt Macy
274eda14cbcSMatt Macy if (!ioerr) {
275eda14cbcSMatt Macy ba.ba_phys->bt_begin++;
276eda14cbcSMatt Macy (void) dmu_free_range(os, obj,
277eda14cbcSMatt Macy i * sizeof (bte), sizeof (bte), tx);
278eda14cbcSMatt Macy }
279eda14cbcSMatt Macy } else if (err != 0) {
280eda14cbcSMatt Macy break;
281eda14cbcSMatt Macy }
282eda14cbcSMatt Macy }
283eda14cbcSMatt Macy
284eda14cbcSMatt Macy ASSERT(!free || err != 0 || ioerr ||
285eda14cbcSMatt Macy ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
286eda14cbcSMatt Macy
287eda14cbcSMatt Macy /* if all blocks are free there should be no used space */
288eda14cbcSMatt Macy if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
289eda14cbcSMatt Macy if (zfs_free_leak_on_eio) {
290eda14cbcSMatt Macy ba.ba_phys->bt_bytes = 0;
291eda14cbcSMatt Macy ba.ba_phys->bt_comp = 0;
292eda14cbcSMatt Macy ba.ba_phys->bt_uncomp = 0;
293eda14cbcSMatt Macy }
294eda14cbcSMatt Macy
295eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_bytes);
296eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_comp);
297eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_uncomp);
298eda14cbcSMatt Macy }
299eda14cbcSMatt Macy
300eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
301eda14cbcSMatt Macy
302eda14cbcSMatt Macy return (err);
303eda14cbcSMatt Macy }
304