1*eda14cbcSMatt Macy /* 2*eda14cbcSMatt Macy * CDDL HEADER START 3*eda14cbcSMatt Macy * 4*eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5*eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6*eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7*eda14cbcSMatt Macy * 8*eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10*eda14cbcSMatt Macy * See the License for the specific language governing permissions 11*eda14cbcSMatt Macy * and limitations under the License. 12*eda14cbcSMatt Macy * 13*eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14*eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16*eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17*eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18*eda14cbcSMatt Macy * 19*eda14cbcSMatt Macy * CDDL HEADER END 20*eda14cbcSMatt Macy */ 21*eda14cbcSMatt Macy 22*eda14cbcSMatt Macy /* 23*eda14cbcSMatt Macy * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24*eda14cbcSMatt Macy */ 25*eda14cbcSMatt Macy 26*eda14cbcSMatt Macy #include <sys/arc.h> 27*eda14cbcSMatt Macy #include <sys/bptree.h> 28*eda14cbcSMatt Macy #include <sys/dmu.h> 29*eda14cbcSMatt Macy #include <sys/dmu_objset.h> 30*eda14cbcSMatt Macy #include <sys/dmu_tx.h> 31*eda14cbcSMatt Macy #include <sys/dmu_traverse.h> 32*eda14cbcSMatt Macy #include <sys/dsl_dataset.h> 33*eda14cbcSMatt Macy #include <sys/dsl_dir.h> 34*eda14cbcSMatt Macy #include <sys/dsl_pool.h> 35*eda14cbcSMatt Macy #include <sys/dnode.h> 36*eda14cbcSMatt Macy #include <sys/spa.h> 37*eda14cbcSMatt Macy 38*eda14cbcSMatt Macy /* 39*eda14cbcSMatt Macy * A bptree is a queue of root block pointers from destroyed datasets. When a 40*eda14cbcSMatt Macy * dataset is destroyed its root block pointer is put on the end of the pool's 41*eda14cbcSMatt Macy * bptree queue so the dataset's blocks can be freed asynchronously by 42*eda14cbcSMatt Macy * dsl_scan_sync. This allows the delete operation to finish without traversing 43*eda14cbcSMatt Macy * all the dataset's blocks. 44*eda14cbcSMatt Macy * 45*eda14cbcSMatt Macy * Note that while bt_begin and bt_end are only ever incremented in this code, 46*eda14cbcSMatt Macy * they are effectively reset to 0 every time the entire bptree is freed because 47*eda14cbcSMatt Macy * the bptree's object is destroyed and re-created. 48*eda14cbcSMatt Macy */ 49*eda14cbcSMatt Macy 50*eda14cbcSMatt Macy struct bptree_args { 51*eda14cbcSMatt Macy bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ 52*eda14cbcSMatt Macy boolean_t ba_free; /* true if freeing during traversal */ 53*eda14cbcSMatt Macy 54*eda14cbcSMatt Macy bptree_itor_t *ba_func; /* function to call for each blockpointer */ 55*eda14cbcSMatt Macy void *ba_arg; /* caller supplied argument to ba_func */ 56*eda14cbcSMatt Macy dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ 57*eda14cbcSMatt Macy } bptree_args_t; 58*eda14cbcSMatt Macy 59*eda14cbcSMatt Macy uint64_t 60*eda14cbcSMatt Macy bptree_alloc(objset_t *os, dmu_tx_t *tx) 61*eda14cbcSMatt Macy { 62*eda14cbcSMatt Macy uint64_t obj; 63*eda14cbcSMatt Macy dmu_buf_t *db; 64*eda14cbcSMatt Macy bptree_phys_t *bt; 65*eda14cbcSMatt Macy 66*eda14cbcSMatt Macy obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, 67*eda14cbcSMatt Macy SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, 68*eda14cbcSMatt Macy sizeof (bptree_phys_t), tx); 69*eda14cbcSMatt Macy 70*eda14cbcSMatt Macy /* 71*eda14cbcSMatt Macy * Bonus buffer contents are already initialized to 0, but for 72*eda14cbcSMatt Macy * readability we make it explicit. 73*eda14cbcSMatt Macy */ 74*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 75*eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx); 76*eda14cbcSMatt Macy bt = db->db_data; 77*eda14cbcSMatt Macy bt->bt_begin = 0; 78*eda14cbcSMatt Macy bt->bt_end = 0; 79*eda14cbcSMatt Macy bt->bt_bytes = 0; 80*eda14cbcSMatt Macy bt->bt_comp = 0; 81*eda14cbcSMatt Macy bt->bt_uncomp = 0; 82*eda14cbcSMatt Macy dmu_buf_rele(db, FTAG); 83*eda14cbcSMatt Macy 84*eda14cbcSMatt Macy return (obj); 85*eda14cbcSMatt Macy } 86*eda14cbcSMatt Macy 87*eda14cbcSMatt Macy int 88*eda14cbcSMatt Macy bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 89*eda14cbcSMatt Macy { 90*eda14cbcSMatt Macy dmu_buf_t *db; 91*eda14cbcSMatt Macy bptree_phys_t *bt; 92*eda14cbcSMatt Macy 93*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 94*eda14cbcSMatt Macy bt = db->db_data; 95*eda14cbcSMatt Macy ASSERT3U(bt->bt_begin, ==, bt->bt_end); 96*eda14cbcSMatt Macy ASSERT0(bt->bt_bytes); 97*eda14cbcSMatt Macy ASSERT0(bt->bt_comp); 98*eda14cbcSMatt Macy ASSERT0(bt->bt_uncomp); 99*eda14cbcSMatt Macy dmu_buf_rele(db, FTAG); 100*eda14cbcSMatt Macy 101*eda14cbcSMatt Macy return (dmu_object_free(os, obj, tx)); 102*eda14cbcSMatt Macy } 103*eda14cbcSMatt Macy 104*eda14cbcSMatt Macy boolean_t 105*eda14cbcSMatt Macy bptree_is_empty(objset_t *os, uint64_t obj) 106*eda14cbcSMatt Macy { 107*eda14cbcSMatt Macy dmu_buf_t *db; 108*eda14cbcSMatt Macy bptree_phys_t *bt; 109*eda14cbcSMatt Macy boolean_t rv; 110*eda14cbcSMatt Macy 111*eda14cbcSMatt Macy VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); 112*eda14cbcSMatt Macy bt = db->db_data; 113*eda14cbcSMatt Macy rv = (bt->bt_begin == bt->bt_end); 114*eda14cbcSMatt Macy dmu_buf_rele(db, FTAG); 115*eda14cbcSMatt Macy return (rv); 116*eda14cbcSMatt Macy } 117*eda14cbcSMatt Macy 118*eda14cbcSMatt Macy void 119*eda14cbcSMatt Macy bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, 120*eda14cbcSMatt Macy uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) 121*eda14cbcSMatt Macy { 122*eda14cbcSMatt Macy dmu_buf_t *db; 123*eda14cbcSMatt Macy bptree_phys_t *bt; 124*eda14cbcSMatt Macy bptree_entry_phys_t *bte; 125*eda14cbcSMatt Macy 126*eda14cbcSMatt Macy /* 127*eda14cbcSMatt Macy * bptree objects are in the pool mos, therefore they can only be 128*eda14cbcSMatt Macy * modified in syncing context. Furthermore, this is only modified 129*eda14cbcSMatt Macy * by the sync thread, so no locking is necessary. 130*eda14cbcSMatt Macy */ 131*eda14cbcSMatt Macy ASSERT(dmu_tx_is_syncing(tx)); 132*eda14cbcSMatt Macy 133*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 134*eda14cbcSMatt Macy bt = db->db_data; 135*eda14cbcSMatt Macy 136*eda14cbcSMatt Macy bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); 137*eda14cbcSMatt Macy bte->be_birth_txg = birth_txg; 138*eda14cbcSMatt Macy bte->be_bp = *bp; 139*eda14cbcSMatt Macy dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); 140*eda14cbcSMatt Macy kmem_free(bte, sizeof (*bte)); 141*eda14cbcSMatt Macy 142*eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx); 143*eda14cbcSMatt Macy bt->bt_end++; 144*eda14cbcSMatt Macy bt->bt_bytes += bytes; 145*eda14cbcSMatt Macy bt->bt_comp += comp; 146*eda14cbcSMatt Macy bt->bt_uncomp += uncomp; 147*eda14cbcSMatt Macy dmu_buf_rele(db, FTAG); 148*eda14cbcSMatt Macy } 149*eda14cbcSMatt Macy 150*eda14cbcSMatt Macy /* ARGSUSED */ 151*eda14cbcSMatt Macy static int 152*eda14cbcSMatt Macy bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 153*eda14cbcSMatt Macy const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 154*eda14cbcSMatt Macy { 155*eda14cbcSMatt Macy int err; 156*eda14cbcSMatt Macy struct bptree_args *ba = arg; 157*eda14cbcSMatt Macy 158*eda14cbcSMatt Macy if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 159*eda14cbcSMatt Macy BP_IS_REDACTED(bp)) 160*eda14cbcSMatt Macy return (0); 161*eda14cbcSMatt Macy 162*eda14cbcSMatt Macy err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); 163*eda14cbcSMatt Macy if (err == 0 && ba->ba_free) { 164*eda14cbcSMatt Macy ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); 165*eda14cbcSMatt Macy ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); 166*eda14cbcSMatt Macy ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); 167*eda14cbcSMatt Macy } 168*eda14cbcSMatt Macy return (err); 169*eda14cbcSMatt Macy } 170*eda14cbcSMatt Macy 171*eda14cbcSMatt Macy /* 172*eda14cbcSMatt Macy * If "free" is set: 173*eda14cbcSMatt Macy * - It is assumed that "func" will be freeing the block pointers. 174*eda14cbcSMatt Macy * - If "func" returns nonzero, the bookmark will be remembered and 175*eda14cbcSMatt Macy * iteration will be restarted from this point on next invocation. 176*eda14cbcSMatt Macy * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), 177*eda14cbcSMatt Macy * bptree_iterate will remember the bookmark, continue traversing 178*eda14cbcSMatt Macy * any additional entries, and return 0. 179*eda14cbcSMatt Macy * 180*eda14cbcSMatt Macy * If "free" is not set, traversal will stop and return an error if 181*eda14cbcSMatt Macy * an i/o error is encountered. 182*eda14cbcSMatt Macy * 183*eda14cbcSMatt Macy * In either case, if zfs_free_leak_on_eio is set, i/o errors will be 184*eda14cbcSMatt Macy * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to 185*eda14cbcSMatt Macy * traverse_dataset_destroyed()). 186*eda14cbcSMatt Macy */ 187*eda14cbcSMatt Macy int 188*eda14cbcSMatt Macy bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, 189*eda14cbcSMatt Macy void *arg, dmu_tx_t *tx) 190*eda14cbcSMatt Macy { 191*eda14cbcSMatt Macy boolean_t ioerr = B_FALSE; 192*eda14cbcSMatt Macy int err; 193*eda14cbcSMatt Macy uint64_t i; 194*eda14cbcSMatt Macy dmu_buf_t *db; 195*eda14cbcSMatt Macy struct bptree_args ba; 196*eda14cbcSMatt Macy 197*eda14cbcSMatt Macy ASSERT(!free || dmu_tx_is_syncing(tx)); 198*eda14cbcSMatt Macy 199*eda14cbcSMatt Macy err = dmu_bonus_hold(os, obj, FTAG, &db); 200*eda14cbcSMatt Macy if (err != 0) 201*eda14cbcSMatt Macy return (err); 202*eda14cbcSMatt Macy 203*eda14cbcSMatt Macy if (free) 204*eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx); 205*eda14cbcSMatt Macy 206*eda14cbcSMatt Macy ba.ba_phys = db->db_data; 207*eda14cbcSMatt Macy ba.ba_free = free; 208*eda14cbcSMatt Macy ba.ba_func = func; 209*eda14cbcSMatt Macy ba.ba_arg = arg; 210*eda14cbcSMatt Macy ba.ba_tx = tx; 211*eda14cbcSMatt Macy 212*eda14cbcSMatt Macy err = 0; 213*eda14cbcSMatt Macy for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { 214*eda14cbcSMatt Macy bptree_entry_phys_t bte; 215*eda14cbcSMatt Macy int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST | 216*eda14cbcSMatt Macy TRAVERSE_NO_DECRYPT; 217*eda14cbcSMatt Macy 218*eda14cbcSMatt Macy err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), 219*eda14cbcSMatt Macy &bte, DMU_READ_NO_PREFETCH); 220*eda14cbcSMatt Macy if (err != 0) 221*eda14cbcSMatt Macy break; 222*eda14cbcSMatt Macy 223*eda14cbcSMatt Macy if (zfs_free_leak_on_eio) 224*eda14cbcSMatt Macy flags |= TRAVERSE_HARD; 225*eda14cbcSMatt Macy zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " 226*eda14cbcSMatt Macy "bookmark %lld/%lld/%lld/%lld", 227*eda14cbcSMatt Macy (longlong_t)i, 228*eda14cbcSMatt Macy (longlong_t)bte.be_birth_txg, 229*eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_objset, 230*eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_object, 231*eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_level, 232*eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_blkid); 233*eda14cbcSMatt Macy err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, 234*eda14cbcSMatt Macy bte.be_birth_txg, &bte.be_zb, flags, 235*eda14cbcSMatt Macy bptree_visit_cb, &ba); 236*eda14cbcSMatt Macy if (free) { 237*eda14cbcSMatt Macy /* 238*eda14cbcSMatt Macy * The callback has freed the visited block pointers. 239*eda14cbcSMatt Macy * Record our traversal progress on disk, either by 240*eda14cbcSMatt Macy * updating this record's bookmark, or by logically 241*eda14cbcSMatt Macy * removing this record by advancing bt_begin. 242*eda14cbcSMatt Macy */ 243*eda14cbcSMatt Macy if (err != 0) { 244*eda14cbcSMatt Macy /* save bookmark for future resume */ 245*eda14cbcSMatt Macy ASSERT3U(bte.be_zb.zb_objset, ==, 246*eda14cbcSMatt Macy ZB_DESTROYED_OBJSET); 247*eda14cbcSMatt Macy ASSERT0(bte.be_zb.zb_level); 248*eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte), 249*eda14cbcSMatt Macy sizeof (bte), &bte, tx); 250*eda14cbcSMatt Macy if (err == EIO || err == ECKSUM || 251*eda14cbcSMatt Macy err == ENXIO) { 252*eda14cbcSMatt Macy /* 253*eda14cbcSMatt Macy * Skip the rest of this tree and 254*eda14cbcSMatt Macy * continue on to the next entry. 255*eda14cbcSMatt Macy */ 256*eda14cbcSMatt Macy err = 0; 257*eda14cbcSMatt Macy ioerr = B_TRUE; 258*eda14cbcSMatt Macy } else { 259*eda14cbcSMatt Macy break; 260*eda14cbcSMatt Macy } 261*eda14cbcSMatt Macy } else if (ioerr) { 262*eda14cbcSMatt Macy /* 263*eda14cbcSMatt Macy * This entry is finished, but there were 264*eda14cbcSMatt Macy * i/o errors on previous entries, so we 265*eda14cbcSMatt Macy * can't adjust bt_begin. Set this entry's 266*eda14cbcSMatt Macy * be_birth_txg such that it will be 267*eda14cbcSMatt Macy * treated as a no-op in future traversals. 268*eda14cbcSMatt Macy */ 269*eda14cbcSMatt Macy bte.be_birth_txg = UINT64_MAX; 270*eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte), 271*eda14cbcSMatt Macy sizeof (bte), &bte, tx); 272*eda14cbcSMatt Macy } 273*eda14cbcSMatt Macy 274*eda14cbcSMatt Macy if (!ioerr) { 275*eda14cbcSMatt Macy ba.ba_phys->bt_begin++; 276*eda14cbcSMatt Macy (void) dmu_free_range(os, obj, 277*eda14cbcSMatt Macy i * sizeof (bte), sizeof (bte), tx); 278*eda14cbcSMatt Macy } 279*eda14cbcSMatt Macy } else if (err != 0) { 280*eda14cbcSMatt Macy break; 281*eda14cbcSMatt Macy } 282*eda14cbcSMatt Macy } 283*eda14cbcSMatt Macy 284*eda14cbcSMatt Macy ASSERT(!free || err != 0 || ioerr || 285*eda14cbcSMatt Macy ba.ba_phys->bt_begin == ba.ba_phys->bt_end); 286*eda14cbcSMatt Macy 287*eda14cbcSMatt Macy /* if all blocks are free there should be no used space */ 288*eda14cbcSMatt Macy if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { 289*eda14cbcSMatt Macy if (zfs_free_leak_on_eio) { 290*eda14cbcSMatt Macy ba.ba_phys->bt_bytes = 0; 291*eda14cbcSMatt Macy ba.ba_phys->bt_comp = 0; 292*eda14cbcSMatt Macy ba.ba_phys->bt_uncomp = 0; 293*eda14cbcSMatt Macy } 294*eda14cbcSMatt Macy 295*eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_bytes); 296*eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_comp); 297*eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_uncomp); 298*eda14cbcSMatt Macy } 299*eda14cbcSMatt Macy 300*eda14cbcSMatt Macy dmu_buf_rele(db, FTAG); 301*eda14cbcSMatt Macy 302*eda14cbcSMatt Macy return (err); 303*eda14cbcSMatt Macy } 304