/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
 * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>
#include <sys/fm/fs/zfs.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - I/Os to this vdev use the callback to determine where the
 *    data is now located, and issue child I/Os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.
 *    This is a list of blocks in the pool that need to be marked
 *    obsolete.  When a snapshot is destroyed, we move some of the
 *    ds_remap_deadlist to the obsolete bpobj (see
 *    dsl_destroy_snapshot_handle_remaps()).  We then asynchronously
 *    process the obsolete bpobj, moving its entries to the specific
 *    vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 *  - When freeing a block: if any DVA is on an indirect vdev, append to
 *    vic_obsolete_sm_object.
 *  - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *    references; otherwise append to vic_obsolete_sm_object).
 *  - When freeing a snapshot: move parts of ds_remap_deadlist to
 *    dp_obsolete_bpobj (same algorithm as ds_deadlist).
 *  - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *    individual vdev's vic_obsolete_sm_object.
 */
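
/*
 * Illustrative walk-through of the structures above (the numbers below
 * are made up for the example, not taken from the on-disk format):
 * suppose a mapping entry covers a 1MB segment of the removed vdev.  If
 * a 256KB block inside that segment is freed, a 256KB alloc entry is
 * appended to the vdev's obsolete space map.  At the next condense that
 * entry is folded into the obsolete counts, so the entry's count becomes
 * 262144; only once the count reaches the entry's full mapped length can
 * the entry be dropped from the newly written mapping.
 */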

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */

static int zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses.  Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
static uint_t zfs_condense_indirect_obsolete_pct = 25;
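
/*
 * A short derivation of the "1% after at most 16 condenses" claim above:
 * a condense is only triggered once at least 25% of the mapped bytes are
 * obsolete, so each condense retains at most 75% of the previous mapping,
 * and 0.75^16 is roughly 0.01.
 */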

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all.  Instead, try at most 100 randomly-selected
 * combinations each time the block is accessed.  This allows all segment
 * copies to participate fairly in the reconstruction when all combinations
 * cannot be checked and prevents repeated use of one bad copy.
 */
uint_t zfs_reconstruct_indirect_combinations_max = 4096;

/*
 * Enable to simulate damaged segments and validate reconstruction.  This
 * is intentionally not exposed as a module parameter.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	abd_t *ic_data;
	vdev_t *ic_vdev;

	/*
	 * ic_duplicate is NULL when the ic_data contents are unique; when
	 * it is determined to be a duplicate, it references the primary
	 * child.
	 */
	struct indirect_child *ic_duplicate;
	list_node_t ic_node; /* node on is_unique_child */
	int ic_error; /* set when a child does not contain the data */
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev.  For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */
	int is_unique_children; /* number of entries in is_unique_child */
	list_t is_unique_child;

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	indirect_child_t *is_good_child;

	indirect_child_t is_child[];
} indirect_split_t;
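
/*
 * For example (hypothetical numbers), a 128KB read whose DVA was mapped
 * to two non-contiguous destinations would carry two indirect_split_t's
 * on iv_splits: the first with is_split_offset == 0 and is_size == 64KB,
 * the second with is_split_offset == 64KB and is_size == 64KB, each with
 * its own is_vdev and is_target_offset.
 */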

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;
	uint64_t iv_unique_combinations;
	uint64_t iv_attempts;
	uint64_t iv_attempts_max;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	indirect_split_t *is;
	while ((is = list_remove_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			if (ic->ic_data != NULL)
				abd_free(ic->ic_data);
		}

		indirect_child_t *ic;
		while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
			;

		list_destroy(&is->is_unique_child);

		kmem_free(is,
		    offsetof(indirect_split_t, is_child[is->is_children]));
	}
	kmem_free(iv, sizeof (*iv));
}

static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
	.vsd_free = vdev_indirect_map_free,
};

/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx.  This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	uint64_t obsolete_sm_obj __maybe_unused;
	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(obsolete_sm_obj);
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_condense_indirect_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

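/*
 * Illustrative application of the two tests above (made-up numbers):
 * with the default tunables, a vdev with 8GB mapped of which 2.4GB (30%)
 * is obsolete and a 20MB in-core mapping passes the percentage test and
 * would be condensed; a vdev whose obsolete space map has grown past 1GB
 * on disk would be condensed by the second test regardless of the
 * percentage.
 */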

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)vic->vic_mapping_object,
	    (u_longlong_t)new_count, (u_longlong_t)old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			hrtime_t now = gethrtime();
			hrtime_t sleep_until = now + MSEC2NSEC(
			    zfs_condense_indirect_commit_entry_delay_ms);
			zfs_sleep_until(sleep_until);
		}

		mapi++;
	}
}

static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return;

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
	ASSERT3U(obsolete_sm_obj, !=, 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	zthr_wakeup(spa->spa_condense_zthr);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object == 0) {
		obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
		    zfs_vdev_standard_sm_blksz, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
		ASSERT3U(obsolete_sm_object, !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(obsolete_sm_object, ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

void
spa_start_indirect_condensing_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
	spa->spa_condense_zthr = zthr_create("z_indirect_condense",
	    spa_condense_indirect_thread_check,
	    spa_condense_indirect_thread, spa, minclsyspri);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.  On success sm_obj
 * will contain either the obsolete spacemap object or zero if none exists.
 * All other errors are returned to the caller.
 */
int
vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_top_zap == 0) {
		*sm_obj = 0;
		return (0);
	}

	int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj);
	if (error == ENOENT) {
		*sm_obj = 0;
		error = 0;
	}

	return (error);
}

/*
 * Gets the "obsolete counts are precise" value from the vdev's ZAP.
 * On success are_precise will be set to reflect whether the counts are
 * precise.  All other errors are returned to the caller.
 */
int
vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_top_zap == 0) {
		*are_precise = B_FALSE;
		return (0);
	}

	uint64_t val = 0;
	int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
	if (error == 0) {
		*are_precise = (val != 0);
	} else if (error == ENOENT) {
		*are_precise = B_FALSE;
		error = 0;
	}

	return (error);
}

static void
vdev_indirect_close(vdev_t *vd)
{
	(void) vd;
}

static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*logical_ashift = vd->vdev_ashift;
	*physical_ashift = vd->vdev_physical_ashift;
	return (0);
}

typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Given an indirect vdev and an extent on that vdev, this function
 * duplicates the physical entries of the indirect mapping that correspond
 * to the extent to a new array and returns a pointer to it.  In addition,
 * copied_entries is populated with the number of mapping entries that were
 * duplicated.
 *
 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
 * This ensures that the mapping won't change due to condensing as we
 * copy over its contents.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t entries = 0;

	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);

		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);

		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
	memcpy(duplicate_mappings, first_mapping, copy_length);
	*copied_entries = entries;

	return (duplicate_mappings);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback.  On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs.  For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack.
 * This way we can call the callback for each of the new sections
 * created by a single section of the indirect device.  Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		/*
		 * Note: As this function can be called from open context
		 * (e.g. zio_read()), we need the following rwlock to
		 * prevent the mapping from being changed by condensing.
		 *
		 * So we grab the lock and we make a copy of the entries
		 * that are relevant to the extent that we are working on.
		 * Once that is done, we drop the lock and iterate over
		 * our copy of the mapping.  Once we are done with the
		 * remap segment and we free it, we also free our copy
		 * of the indirect mapping entries that are relevant to it.
		 *
		 * This way we don't need to wait until the function is
		 * finished with a segment, to condense it.  In addition, we
		 * don't need a recursive rwlock for the case that a call to
		 * vdev_indirect_remap() needs to call itself (through the
		 * codepath of its callback) for the same vdev in the middle
		 * of its execution.
1093eda14cbcSMatt Macy */ 1094eda14cbcSMatt Macy rw_enter(&v->vdev_indirect_rwlock, RW_READER); 1095eda14cbcSMatt Macy ASSERT3P(v->vdev_indirect_mapping, !=, NULL); 1096eda14cbcSMatt Macy 1097eda14cbcSMatt Macy vdev_indirect_mapping_entry_phys_t *mapping = 1098eda14cbcSMatt Macy vdev_indirect_mapping_duplicate_adjacent_entries(v, 1099eda14cbcSMatt Macy rs->rs_offset, rs->rs_asize, &num_entries); 1100eda14cbcSMatt Macy ASSERT3P(mapping, !=, NULL); 1101eda14cbcSMatt Macy ASSERT3U(num_entries, >, 0); 1102eda14cbcSMatt Macy rw_exit(&v->vdev_indirect_rwlock); 1103eda14cbcSMatt Macy 1104eda14cbcSMatt Macy for (uint64_t i = 0; i < num_entries; i++) { 1105eda14cbcSMatt Macy /* 1106eda14cbcSMatt Macy * Note: the vdev_indirect_mapping cannot change 1107eda14cbcSMatt Macy * while we are running. It only changes while the 1108eda14cbcSMatt Macy * removal is in progress, and then only from syncing 1109eda14cbcSMatt Macy * context. While a removal is in progress, this 1110eda14cbcSMatt Macy * function is only called for frees, which also only 1111eda14cbcSMatt Macy * happen from syncing context. 1112eda14cbcSMatt Macy */ 1113eda14cbcSMatt Macy vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; 1114eda14cbcSMatt Macy 1115eda14cbcSMatt Macy ASSERT3P(m, !=, NULL); 1116eda14cbcSMatt Macy ASSERT3U(rs->rs_asize, >, 0); 1117eda14cbcSMatt Macy 1118eda14cbcSMatt Macy uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 1119eda14cbcSMatt Macy uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); 1120eda14cbcSMatt Macy uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); 1121eda14cbcSMatt Macy 1122eda14cbcSMatt Macy ASSERT3U(rs->rs_offset, >=, 1123eda14cbcSMatt Macy DVA_MAPPING_GET_SRC_OFFSET(m)); 1124eda14cbcSMatt Macy ASSERT3U(rs->rs_offset, <, 1125eda14cbcSMatt Macy DVA_MAPPING_GET_SRC_OFFSET(m) + size); 1126eda14cbcSMatt Macy ASSERT3U(dst_vdev, !=, v->vdev_id); 1127eda14cbcSMatt Macy 1128eda14cbcSMatt Macy uint64_t inner_offset = rs->rs_offset - 1129eda14cbcSMatt Macy DVA_MAPPING_GET_SRC_OFFSET(m); 1130eda14cbcSMatt Macy uint64_t inner_size = 1131eda14cbcSMatt Macy MIN(rs->rs_asize, size - inner_offset); 1132eda14cbcSMatt Macy 1133eda14cbcSMatt Macy vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); 1134eda14cbcSMatt Macy ASSERT3P(dst_v, !=, NULL); 1135eda14cbcSMatt Macy 1136eda14cbcSMatt Macy if (dst_v->vdev_ops == &vdev_indirect_ops) { 1137eda14cbcSMatt Macy list_insert_head(&stack, 1138eda14cbcSMatt Macy rs_alloc(dst_v, dst_offset + inner_offset, 1139eda14cbcSMatt Macy inner_size, rs->rs_split_offset)); 1140eda14cbcSMatt Macy 1141eda14cbcSMatt Macy } 1142eda14cbcSMatt Macy 1143eda14cbcSMatt Macy if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && 1144eda14cbcSMatt Macy IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { 1145eda14cbcSMatt Macy /* 1146eda14cbcSMatt Macy * Note: This clause exists solely for 1147eda14cbcSMatt Macy * testing purposes. We use it to ensure that 1148eda14cbcSMatt Macy * split blocks work and that the callbacks 1149eda14cbcSMatt Macy * using them yield the same result if issued 1150eda14cbcSMatt Macy * in reverse order.
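 * For example, under this debug clause a 128K segment that maps to a
 * single destination would be issued as two 64K callbacks, with the
 * second (higher-offset) half issued first.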
1151eda14cbcSMatt Macy */ 1152eda14cbcSMatt Macy uint64_t inner_half = inner_size / 2; 1153eda14cbcSMatt Macy 1154eda14cbcSMatt Macy func(rs->rs_split_offset + inner_half, dst_v, 1155eda14cbcSMatt Macy dst_offset + inner_offset + inner_half, 1156eda14cbcSMatt Macy inner_half, arg); 1157eda14cbcSMatt Macy 1158eda14cbcSMatt Macy func(rs->rs_split_offset, dst_v, 1159eda14cbcSMatt Macy dst_offset + inner_offset, 1160eda14cbcSMatt Macy inner_half, arg); 1161eda14cbcSMatt Macy } else { 1162eda14cbcSMatt Macy func(rs->rs_split_offset, dst_v, 1163eda14cbcSMatt Macy dst_offset + inner_offset, 1164eda14cbcSMatt Macy inner_size, arg); 1165eda14cbcSMatt Macy } 1166eda14cbcSMatt Macy 1167eda14cbcSMatt Macy rs->rs_offset += inner_size; 1168eda14cbcSMatt Macy rs->rs_asize -= inner_size; 1169eda14cbcSMatt Macy rs->rs_split_offset += inner_size; 1170eda14cbcSMatt Macy } 1171eda14cbcSMatt Macy VERIFY0(rs->rs_asize); 1172eda14cbcSMatt Macy 1173eda14cbcSMatt Macy kmem_free(mapping, num_entries * sizeof (*mapping)); 1174eda14cbcSMatt Macy kmem_free(rs, sizeof (remap_segment_t)); 1175eda14cbcSMatt Macy } 1176eda14cbcSMatt Macy list_destroy(&stack); 1177eda14cbcSMatt Macy } 1178eda14cbcSMatt Macy 1179eda14cbcSMatt Macy static void 1180eda14cbcSMatt Macy vdev_indirect_child_io_done(zio_t *zio) 1181eda14cbcSMatt Macy { 1182eda14cbcSMatt Macy zio_t *pio = zio->io_private; 1183eda14cbcSMatt Macy 1184eda14cbcSMatt Macy mutex_enter(&pio->io_lock); 1185eda14cbcSMatt Macy pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 1186eda14cbcSMatt Macy mutex_exit(&pio->io_lock); 1187eda14cbcSMatt Macy 1188184c1b94SMartin Matuska abd_free(zio->io_abd); 1189eda14cbcSMatt Macy } 1190eda14cbcSMatt Macy 1191eda14cbcSMatt Macy /* 1192eda14cbcSMatt Macy * This is a callback for vdev_indirect_remap() which allocates an 1193eda14cbcSMatt Macy * indirect_split_t for each split segment and adds it to iv_splits. 1194eda14cbcSMatt Macy */ 1195eda14cbcSMatt Macy static void 1196eda14cbcSMatt Macy vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, 1197eda14cbcSMatt Macy uint64_t size, void *arg) 1198eda14cbcSMatt Macy { 1199eda14cbcSMatt Macy zio_t *zio = arg; 1200eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1201eda14cbcSMatt Macy 1202eda14cbcSMatt Macy ASSERT3P(vd, !=, NULL); 1203eda14cbcSMatt Macy 1204eda14cbcSMatt Macy if (vd->vdev_ops == &vdev_indirect_ops) 1205eda14cbcSMatt Macy return; 1206eda14cbcSMatt Macy 1207eda14cbcSMatt Macy int n = 1; 1208eda14cbcSMatt Macy if (vd->vdev_ops == &vdev_mirror_ops) 1209eda14cbcSMatt Macy n = vd->vdev_children; 1210eda14cbcSMatt Macy 1211eda14cbcSMatt Macy indirect_split_t *is = 1212eda14cbcSMatt Macy kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); 1213eda14cbcSMatt Macy 1214eda14cbcSMatt Macy is->is_children = n; 1215eda14cbcSMatt Macy is->is_size = size; 1216eda14cbcSMatt Macy is->is_split_offset = split_offset; 1217eda14cbcSMatt Macy is->is_target_offset = offset; 1218eda14cbcSMatt Macy is->is_vdev = vd; 1219eda14cbcSMatt Macy list_create(&is->is_unique_child, sizeof (indirect_child_t), 1220eda14cbcSMatt Macy offsetof(indirect_child_t, ic_node)); 1221eda14cbcSMatt Macy 1222eda14cbcSMatt Macy /* 1223eda14cbcSMatt Macy * Note that we only consider multiple copies of the data for 1224eda14cbcSMatt Macy * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even 1225eda14cbcSMatt Macy * though they use the same ops as mirror, because there's only one 1226eda14cbcSMatt Macy * "good" copy under the replacing/spare. 
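 * For any other vdev type (including replacing and spare) is_children
 * stays at 1 and the single child entry below simply points back at
 * the vdev itself.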
1227eda14cbcSMatt Macy */ 1228eda14cbcSMatt Macy if (vd->vdev_ops == &vdev_mirror_ops) { 1229eda14cbcSMatt Macy for (int i = 0; i < n; i++) { 1230eda14cbcSMatt Macy is->is_child[i].ic_vdev = vd->vdev_child[i]; 1231eda14cbcSMatt Macy list_link_init(&is->is_child[i].ic_node); 1232eda14cbcSMatt Macy } 1233eda14cbcSMatt Macy } else { 1234eda14cbcSMatt Macy is->is_child[0].ic_vdev = vd; 1235eda14cbcSMatt Macy } 1236eda14cbcSMatt Macy 1237eda14cbcSMatt Macy list_insert_tail(&iv->iv_splits, is); 1238eda14cbcSMatt Macy } 1239eda14cbcSMatt Macy 1240eda14cbcSMatt Macy static void 1241eda14cbcSMatt Macy vdev_indirect_read_split_done(zio_t *zio) 1242eda14cbcSMatt Macy { 1243eda14cbcSMatt Macy indirect_child_t *ic = zio->io_private; 1244eda14cbcSMatt Macy 1245eda14cbcSMatt Macy if (zio->io_error != 0) { 1246eda14cbcSMatt Macy /* 1247eda14cbcSMatt Macy * Clear ic_data to indicate that we do not have data for this 1248eda14cbcSMatt Macy * child. 1249eda14cbcSMatt Macy */ 1250eda14cbcSMatt Macy abd_free(ic->ic_data); 1251eda14cbcSMatt Macy ic->ic_data = NULL; 1252eda14cbcSMatt Macy } 1253eda14cbcSMatt Macy } 1254eda14cbcSMatt Macy 1255eda14cbcSMatt Macy /* 1256eda14cbcSMatt Macy * Issue reads for all copies (mirror children) of all splits. 1257eda14cbcSMatt Macy */ 1258eda14cbcSMatt Macy static void 1259eda14cbcSMatt Macy vdev_indirect_read_all(zio_t *zio) 1260eda14cbcSMatt Macy { 1261eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1262eda14cbcSMatt Macy 1263eda14cbcSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 1264eda14cbcSMatt Macy 1265eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1266eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1267eda14cbcSMatt Macy for (int i = 0; i < is->is_children; i++) { 1268eda14cbcSMatt Macy indirect_child_t *ic = &is->is_child[i]; 1269eda14cbcSMatt Macy 1270eda14cbcSMatt Macy if (!vdev_readable(ic->ic_vdev)) 1271eda14cbcSMatt Macy continue; 1272eda14cbcSMatt Macy 1273eda14cbcSMatt Macy /* 12747877fdebSMatt Macy * If a child is missing the data, set ic_error. Used 12757877fdebSMatt Macy * in vdev_indirect_repair(). We perform the read 12767877fdebSMatt Macy * nevertheless which provides the opportunity to 12777877fdebSMatt Macy * reconstruct the split block if at all possible. 
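 * (The ic_error set here is consulted later by vdev_indirect_repair(),
 * which skips the checksum-error accounting for a child that was known
 * to be missing the data.)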
1278eda14cbcSMatt Macy */ 12797877fdebSMatt Macy if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, 12807877fdebSMatt Macy zio->io_txg, 1)) 12817877fdebSMatt Macy ic->ic_error = SET_ERROR(ESTALE); 1282eda14cbcSMatt Macy 1283eda14cbcSMatt Macy ic->ic_data = abd_alloc_sametype(zio->io_abd, 1284eda14cbcSMatt Macy is->is_size); 1285eda14cbcSMatt Macy ic->ic_duplicate = NULL; 1286eda14cbcSMatt Macy 1287eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 1288eda14cbcSMatt Macy ic->ic_vdev, is->is_target_offset, ic->ic_data, 1289eda14cbcSMatt Macy is->is_size, zio->io_type, zio->io_priority, 0, 1290eda14cbcSMatt Macy vdev_indirect_read_split_done, ic)); 1291eda14cbcSMatt Macy } 1292eda14cbcSMatt Macy } 1293eda14cbcSMatt Macy iv->iv_reconstruct = B_TRUE; 1294eda14cbcSMatt Macy } 1295eda14cbcSMatt Macy 1296eda14cbcSMatt Macy static void 1297eda14cbcSMatt Macy vdev_indirect_io_start(zio_t *zio) 1298eda14cbcSMatt Macy { 1299eda14cbcSMatt Macy spa_t *spa __maybe_unused = zio->io_spa; 1300eda14cbcSMatt Macy indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); 1301eda14cbcSMatt Macy list_create(&iv->iv_splits, 1302eda14cbcSMatt Macy sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); 1303eda14cbcSMatt Macy 1304eda14cbcSMatt Macy zio->io_vsd = iv; 1305eda14cbcSMatt Macy zio->io_vsd_ops = &vdev_indirect_vsd_ops; 1306eda14cbcSMatt Macy 1307eda14cbcSMatt Macy ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1308eda14cbcSMatt Macy if (zio->io_type != ZIO_TYPE_READ) { 1309eda14cbcSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 1310eda14cbcSMatt Macy /* 1311eda14cbcSMatt Macy * Note: this code can handle other kinds of writes, 1312eda14cbcSMatt Macy * but we don't expect them. 1313eda14cbcSMatt Macy */ 1314eda14cbcSMatt Macy ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | 1315eda14cbcSMatt Macy ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); 1316eda14cbcSMatt Macy } 1317eda14cbcSMatt Macy 1318eda14cbcSMatt Macy vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, 1319eda14cbcSMatt Macy vdev_indirect_gather_splits, zio); 1320eda14cbcSMatt Macy 1321eda14cbcSMatt Macy indirect_split_t *first = list_head(&iv->iv_splits); 1322dbd5678dSMartin Matuska ASSERT3P(first, !=, NULL); 1323eda14cbcSMatt Macy if (first->is_size == zio->io_size) { 1324eda14cbcSMatt Macy /* 1325eda14cbcSMatt Macy * This is not a split block; we are pointing to the entire 1326eda14cbcSMatt Macy * data, which will checksum the same as the original data. 1327eda14cbcSMatt Macy * Pass the BP down so that the child i/o can verify the 1328eda14cbcSMatt Macy * checksum, and try a different location if available 1329eda14cbcSMatt Macy * (e.g. on a mirror). 1330eda14cbcSMatt Macy * 1331eda14cbcSMatt Macy * While this special case could be handled the same as the 1332eda14cbcSMatt Macy * general (split block) case, doing it this way ensures 1333eda14cbcSMatt Macy * that the vast majority of blocks on indirect vdevs 1334eda14cbcSMatt Macy * (which are not split) are handled identically to blocks 1335eda14cbcSMatt Macy * on non-indirect vdevs. This allows us to be less strict 1336eda14cbcSMatt Macy * about performance in the general (but rare) case. 
1337eda14cbcSMatt Macy */ 1338eda14cbcSMatt Macy ASSERT0(first->is_split_offset); 1339eda14cbcSMatt Macy ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); 1340eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 1341eda14cbcSMatt Macy first->is_vdev, first->is_target_offset, 1342eda14cbcSMatt Macy abd_get_offset(zio->io_abd, 0), 1343eda14cbcSMatt Macy zio->io_size, zio->io_type, zio->io_priority, 0, 1344eda14cbcSMatt Macy vdev_indirect_child_io_done, zio)); 1345eda14cbcSMatt Macy } else { 1346eda14cbcSMatt Macy iv->iv_split_block = B_TRUE; 1347eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_READ && 1348eda14cbcSMatt Macy zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { 1349eda14cbcSMatt Macy /* 1350eda14cbcSMatt Macy * Read all copies. Note that for simplicity, 1351eda14cbcSMatt Macy * we don't bother consulting the DTL in the 1352eda14cbcSMatt Macy * resilver case. 1353eda14cbcSMatt Macy */ 1354eda14cbcSMatt Macy vdev_indirect_read_all(zio); 1355eda14cbcSMatt Macy } else { 1356eda14cbcSMatt Macy /* 1357eda14cbcSMatt Macy * If this is a read zio, we read one copy of each 1358eda14cbcSMatt Macy * split segment, from the top-level vdev. Since 1359eda14cbcSMatt Macy * we don't know the checksum of each split 1360eda14cbcSMatt Macy * individually, the child zio can't ensure that 1361eda14cbcSMatt Macy * we get the right data. E.g. if it's a mirror, 1362eda14cbcSMatt Macy * it will just read from a random (healthy) leaf 1363eda14cbcSMatt Macy * vdev. We have to verify the checksum in 1364eda14cbcSMatt Macy * vdev_indirect_io_done(). 1365eda14cbcSMatt Macy * 1366eda14cbcSMatt Macy * For write zios, the vdev code will ensure we write 1367eda14cbcSMatt Macy * to all children. 1368eda14cbcSMatt Macy */ 1369eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1370eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1371eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 1372eda14cbcSMatt Macy is->is_vdev, is->is_target_offset, 1373d411c1d6SMartin Matuska abd_get_offset_size(zio->io_abd, 1374d411c1d6SMartin Matuska is->is_split_offset, is->is_size), 1375d411c1d6SMartin Matuska is->is_size, zio->io_type, 1376d411c1d6SMartin Matuska zio->io_priority, 0, 1377eda14cbcSMatt Macy vdev_indirect_child_io_done, zio)); 1378eda14cbcSMatt Macy } 1379eda14cbcSMatt Macy 1380eda14cbcSMatt Macy } 1381eda14cbcSMatt Macy } 1382eda14cbcSMatt Macy 1383eda14cbcSMatt Macy zio_execute(zio); 1384eda14cbcSMatt Macy } 1385eda14cbcSMatt Macy 1386eda14cbcSMatt Macy /* 1387eda14cbcSMatt Macy * Report a checksum error for a child. 
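 * Speculative I/Os are ignored; otherwise the child vdev's checksum
 * error count is bumped and a checksum ereport is posted that includes
 * both the known-good data and the bad copy read from this child.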
1388eda14cbcSMatt Macy */ 1389eda14cbcSMatt Macy static void 1390eda14cbcSMatt Macy vdev_indirect_checksum_error(zio_t *zio, 1391eda14cbcSMatt Macy indirect_split_t *is, indirect_child_t *ic) 1392eda14cbcSMatt Macy { 1393eda14cbcSMatt Macy vdev_t *vd = ic->ic_vdev; 1394eda14cbcSMatt Macy 1395eda14cbcSMatt Macy if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 1396eda14cbcSMatt Macy return; 1397eda14cbcSMatt Macy 1398eda14cbcSMatt Macy mutex_enter(&vd->vdev_stat_lock); 1399eda14cbcSMatt Macy vd->vdev_stat.vs_checksum_errors++; 1400eda14cbcSMatt Macy mutex_exit(&vd->vdev_stat_lock); 1401eda14cbcSMatt Macy 1402315ee00fSMartin Matuska zio_bad_cksum_t zbc = { 0 }; 1403eda14cbcSMatt Macy abd_t *bad_abd = ic->ic_data; 1404eda14cbcSMatt Macy abd_t *good_abd = is->is_good_child->ic_data; 1405eac7052fSMatt Macy (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, 1406eda14cbcSMatt Macy is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); 1407eda14cbcSMatt Macy } 1408eda14cbcSMatt Macy 1409eda14cbcSMatt Macy /* 1410eda14cbcSMatt Macy * Issue repair i/os for any incorrect copies. We do this by comparing 1411eda14cbcSMatt Macy * each split segment's correct data (is_good_child's ic_data) with each 1412eda14cbcSMatt Macy * other copy of the data. If they differ, then we overwrite the bad data 14137877fdebSMatt Macy * with the good copy. The DTL is checked in vdev_indirect_read_all() and 14147877fdebSMatt Macy * if a vdev is missing a copy of the data we set ic_error and the read is 14157877fdebSMatt Macy * performed. This provides the opportunity to reconstruct the split block 14167877fdebSMatt Macy * if at all possible. ic_error is checked here and if set it suppresses 14177877fdebSMatt Macy * incrementing the checksum counter. Aside from this DTLs are not checked, 1418eda14cbcSMatt Macy * which simplifies this code and also issues the optimal number of writes 1419eda14cbcSMatt Macy * (based on which copies actually read bad data, as opposed to which we 1420eda14cbcSMatt Macy * think might be wrong). For the same reason, we always use 1421eda14cbcSMatt Macy * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). 
1422eda14cbcSMatt Macy */ 1423eda14cbcSMatt Macy static void 1424eda14cbcSMatt Macy vdev_indirect_repair(zio_t *zio) 1425eda14cbcSMatt Macy { 1426eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1427eda14cbcSMatt Macy 1428eda14cbcSMatt Macy if (!spa_writeable(zio->io_spa)) 1429eda14cbcSMatt Macy return; 1430eda14cbcSMatt Macy 1431eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1432eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1433eda14cbcSMatt Macy for (int c = 0; c < is->is_children; c++) { 1434eda14cbcSMatt Macy indirect_child_t *ic = &is->is_child[c]; 1435eda14cbcSMatt Macy if (ic == is->is_good_child) 1436eda14cbcSMatt Macy continue; 1437eda14cbcSMatt Macy if (ic->ic_data == NULL) 1438eda14cbcSMatt Macy continue; 1439eda14cbcSMatt Macy if (ic->ic_duplicate == is->is_good_child) 1440eda14cbcSMatt Macy continue; 1441eda14cbcSMatt Macy 1442eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 1443eda14cbcSMatt Macy ic->ic_vdev, is->is_target_offset, 1444eda14cbcSMatt Macy is->is_good_child->ic_data, is->is_size, 1445eda14cbcSMatt Macy ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 1446eda14cbcSMatt Macy ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 1447eda14cbcSMatt Macy NULL, NULL)); 1448eda14cbcSMatt Macy 14497877fdebSMatt Macy /* 14507877fdebSMatt Macy * If ic_error is set the current child does not have 14517877fdebSMatt Macy * a copy of the data, so suppress incrementing the 14527877fdebSMatt Macy * checksum counter. 14537877fdebSMatt Macy */ 14547877fdebSMatt Macy if (ic->ic_error == ESTALE) 14557877fdebSMatt Macy continue; 14567877fdebSMatt Macy 1457eda14cbcSMatt Macy vdev_indirect_checksum_error(zio, is, ic); 1458eda14cbcSMatt Macy } 1459eda14cbcSMatt Macy } 1460eda14cbcSMatt Macy } 1461eda14cbcSMatt Macy 1462eda14cbcSMatt Macy /* 1463eda14cbcSMatt Macy * Report checksum errors on all children that we read from. 1464eda14cbcSMatt Macy */ 1465eda14cbcSMatt Macy static void 1466eda14cbcSMatt Macy vdev_indirect_all_checksum_errors(zio_t *zio) 1467eda14cbcSMatt Macy { 1468eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1469eda14cbcSMatt Macy 1470eda14cbcSMatt Macy if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 1471eda14cbcSMatt Macy return; 1472eda14cbcSMatt Macy 1473eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1474eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1475eda14cbcSMatt Macy for (int c = 0; c < is->is_children; c++) { 1476eda14cbcSMatt Macy indirect_child_t *ic = &is->is_child[c]; 1477eda14cbcSMatt Macy 1478eda14cbcSMatt Macy if (ic->ic_data == NULL) 1479eda14cbcSMatt Macy continue; 1480eda14cbcSMatt Macy 1481eda14cbcSMatt Macy vdev_t *vd = ic->ic_vdev; 1482eda14cbcSMatt Macy 1483eda14cbcSMatt Macy mutex_enter(&vd->vdev_stat_lock); 1484eda14cbcSMatt Macy vd->vdev_stat.vs_checksum_errors++; 1485eda14cbcSMatt Macy mutex_exit(&vd->vdev_stat_lock); 1486bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1487bb2d13b6SMartin Matuska NULL, zio, is->is_target_offset, is->is_size, 1488bb2d13b6SMartin Matuska NULL, NULL, NULL); 14892c48331dSMatt Macy } 1490eda14cbcSMatt Macy } 1491eda14cbcSMatt Macy } 1492eda14cbcSMatt Macy 1493eda14cbcSMatt Macy /* 1494eda14cbcSMatt Macy * Copy data from all the splits to the main zio, then validate the checksum. 1495eda14cbcSMatt Macy * If the checksum is successfully validated, return success.
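 * Each split's chosen good child is copied back into the parent zio's
 * abd at its is_split_offset, reassembling the original logical block,
 * before zio_checksum_error() verifies it against the block pointer.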
1496eda14cbcSMatt Macy */ 1497eda14cbcSMatt Macy static int 1498eda14cbcSMatt Macy vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) 1499eda14cbcSMatt Macy { 1500eda14cbcSMatt Macy zio_bad_cksum_t zbc; 1501eda14cbcSMatt Macy 1502eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1503eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1504eda14cbcSMatt Macy 1505eda14cbcSMatt Macy ASSERT3P(is->is_good_child->ic_data, !=, NULL); 1506eda14cbcSMatt Macy ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); 1507eda14cbcSMatt Macy 1508eda14cbcSMatt Macy abd_copy_off(zio->io_abd, is->is_good_child->ic_data, 1509eda14cbcSMatt Macy is->is_split_offset, 0, is->is_size); 1510eda14cbcSMatt Macy } 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy return (zio_checksum_error(zio, &zbc)); 1513eda14cbcSMatt Macy } 1514eda14cbcSMatt Macy 1515eda14cbcSMatt Macy /* 1516eda14cbcSMatt Macy * There are relatively few possible combinations making it feasible to 1517eda14cbcSMatt Macy * deterministically check them all. We do this by setting the good_child 1518eda14cbcSMatt Macy * to the next unique split version. If we reach the end of the list then 1519eda14cbcSMatt Macy * "carry over" to the next unique split version (like counting in base 1520eda14cbcSMatt Macy * is_unique_children, but each digit can have a different base). 1521eda14cbcSMatt Macy */ 1522eda14cbcSMatt Macy static int 1523eda14cbcSMatt Macy vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) 1524eda14cbcSMatt Macy { 1525eda14cbcSMatt Macy boolean_t more = B_TRUE; 1526eda14cbcSMatt Macy 1527eda14cbcSMatt Macy iv->iv_attempts = 0; 1528eda14cbcSMatt Macy 1529eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1530eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) 1531eda14cbcSMatt Macy is->is_good_child = list_head(&is->is_unique_child); 1532eda14cbcSMatt Macy 1533eda14cbcSMatt Macy while (more == B_TRUE) { 1534eda14cbcSMatt Macy iv->iv_attempts++; 1535eda14cbcSMatt Macy more = B_FALSE; 1536eda14cbcSMatt Macy 1537eda14cbcSMatt Macy if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) 1538eda14cbcSMatt Macy return (0); 1539eda14cbcSMatt Macy 1540eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1541eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1542eda14cbcSMatt Macy is->is_good_child = list_next(&is->is_unique_child, 1543eda14cbcSMatt Macy is->is_good_child); 1544eda14cbcSMatt Macy if (is->is_good_child != NULL) { 1545eda14cbcSMatt Macy more = B_TRUE; 1546eda14cbcSMatt Macy break; 1547eda14cbcSMatt Macy } 1548eda14cbcSMatt Macy 1549eda14cbcSMatt Macy is->is_good_child = list_head(&is->is_unique_child); 1550eda14cbcSMatt Macy } 1551eda14cbcSMatt Macy } 1552eda14cbcSMatt Macy 1553eda14cbcSMatt Macy ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); 1554eda14cbcSMatt Macy 1555eda14cbcSMatt Macy return (SET_ERROR(ECKSUM)); 1556eda14cbcSMatt Macy } 1557eda14cbcSMatt Macy 1558eda14cbcSMatt Macy /* 1559eda14cbcSMatt Macy * There are too many combinations to try all of them in a reasonable amount 1560eda14cbcSMatt Macy * of time. So try a fixed number of random combinations from the unique 1561eda14cbcSMatt Macy * split versions, after which we'll consider the block unrecoverable. 
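 * The number of attempts is capped by iv_attempts_max, which
 * vdev_indirect_reconstruct_io_done() seeds from the
 * zfs_reconstruct_indirect_combinations_max tunable.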
1562eda14cbcSMatt Macy */ 1563eda14cbcSMatt Macy static int 1564eda14cbcSMatt Macy vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) 1565eda14cbcSMatt Macy { 1566eda14cbcSMatt Macy iv->iv_attempts = 0; 1567eda14cbcSMatt Macy 1568eda14cbcSMatt Macy while (iv->iv_attempts < iv->iv_attempts_max) { 1569eda14cbcSMatt Macy iv->iv_attempts++; 1570eda14cbcSMatt Macy 1571eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1572eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1573eda14cbcSMatt Macy indirect_child_t *ic = list_head(&is->is_unique_child); 1574eda14cbcSMatt Macy int children = is->is_unique_children; 1575eda14cbcSMatt Macy 157633b8c039SMartin Matuska for (int i = random_in_range(children); i > 0; i--) 1577eda14cbcSMatt Macy ic = list_next(&is->is_unique_child, ic); 1578eda14cbcSMatt Macy 1579eda14cbcSMatt Macy ASSERT3P(ic, !=, NULL); 1580eda14cbcSMatt Macy is->is_good_child = ic; 1581eda14cbcSMatt Macy } 1582eda14cbcSMatt Macy 1583eda14cbcSMatt Macy if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) 1584eda14cbcSMatt Macy return (0); 1585eda14cbcSMatt Macy } 1586eda14cbcSMatt Macy 1587eda14cbcSMatt Macy return (SET_ERROR(ECKSUM)); 1588eda14cbcSMatt Macy } 1589eda14cbcSMatt Macy 1590eda14cbcSMatt Macy /* 1591eda14cbcSMatt Macy * This is a validation function for reconstruction. It randomly selects 1592eda14cbcSMatt Macy * a good combination, if one can be found, and then it intentionally 1593eda14cbcSMatt Macy * damages all other segment copies by zeroing them. This forces the 1594eda14cbcSMatt Macy * reconstruction algorithm to locate the one remaining known good copy. 1595eda14cbcSMatt Macy */ 1596eda14cbcSMatt Macy static int 1597eda14cbcSMatt Macy vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) 1598eda14cbcSMatt Macy { 1599eda14cbcSMatt Macy int error; 1600eda14cbcSMatt Macy 1601eda14cbcSMatt Macy /* Presume all the copies are unique for initial selection. */ 1602eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1603eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1604eda14cbcSMatt Macy is->is_unique_children = 0; 1605eda14cbcSMatt Macy 1606eda14cbcSMatt Macy for (int i = 0; i < is->is_children; i++) { 1607eda14cbcSMatt Macy indirect_child_t *ic = &is->is_child[i]; 1608eda14cbcSMatt Macy if (ic->ic_data != NULL) { 1609eda14cbcSMatt Macy is->is_unique_children++; 1610eda14cbcSMatt Macy list_insert_tail(&is->is_unique_child, ic); 1611eda14cbcSMatt Macy } 1612eda14cbcSMatt Macy } 1613eda14cbcSMatt Macy 1614eda14cbcSMatt Macy if (list_is_empty(&is->is_unique_child)) { 1615eda14cbcSMatt Macy error = SET_ERROR(EIO); 1616eda14cbcSMatt Macy goto out; 1617eda14cbcSMatt Macy } 1618eda14cbcSMatt Macy } 1619eda14cbcSMatt Macy 1620eda14cbcSMatt Macy /* 1621eda14cbcSMatt Macy * Set each is_good_child to a randomly-selected child which 1622eda14cbcSMatt Macy * is known to contain validated data. 1623eda14cbcSMatt Macy */ 1624eda14cbcSMatt Macy error = vdev_indirect_splits_enumerate_randomly(iv, zio); 1625eda14cbcSMatt Macy if (error) 1626eda14cbcSMatt Macy goto out; 1627eda14cbcSMatt Macy 1628eda14cbcSMatt Macy /* 1629eda14cbcSMatt Macy * Damage all but the known good copy by zeroing it. This will 1630eda14cbcSMatt Macy * result in two or fewer unique copies per indirect_child_t. 1631eda14cbcSMatt Macy * Both may need to be checked in order to reconstruct the block.
1632eda14cbcSMatt Macy * Set iv->iv_attempts_max such that all unique combinations will 1633eda14cbcSMatt Macy * be enumerated, but limit the damage to at most 12 indirect splits. 1634eda14cbcSMatt Macy */ 1635eda14cbcSMatt Macy iv->iv_attempts_max = 1; 1636eda14cbcSMatt Macy 1637eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1638eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1639eda14cbcSMatt Macy for (int c = 0; c < is->is_children; c++) { 1640eda14cbcSMatt Macy indirect_child_t *ic = &is->is_child[c]; 1641eda14cbcSMatt Macy 1642eda14cbcSMatt Macy if (ic == is->is_good_child) 1643eda14cbcSMatt Macy continue; 1644eda14cbcSMatt Macy if (ic->ic_data == NULL) 1645eda14cbcSMatt Macy continue; 1646eda14cbcSMatt Macy 1647eda14cbcSMatt Macy abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); 1648eda14cbcSMatt Macy } 1649eda14cbcSMatt Macy 1650eda14cbcSMatt Macy iv->iv_attempts_max *= 2; 1651eda14cbcSMatt Macy if (iv->iv_attempts_max >= (1ULL << 12)) { 1652eda14cbcSMatt Macy iv->iv_attempts_max = UINT64_MAX; 1653eda14cbcSMatt Macy break; 1654eda14cbcSMatt Macy } 1655eda14cbcSMatt Macy } 1656eda14cbcSMatt Macy 1657eda14cbcSMatt Macy out: 1658eda14cbcSMatt Macy /* Empty the unique children lists so they can be reconstructed. */ 1659eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1660eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1661eda14cbcSMatt Macy indirect_child_t *ic; 16624e8d558cSMartin Matuska while ((ic = list_remove_head(&is->is_unique_child)) != NULL) 16634e8d558cSMartin Matuska ; 1664eda14cbcSMatt Macy 1665eda14cbcSMatt Macy is->is_unique_children = 0; 1666eda14cbcSMatt Macy } 1667eda14cbcSMatt Macy 1668eda14cbcSMatt Macy return (error); 1669eda14cbcSMatt Macy } 1670eda14cbcSMatt Macy 1671eda14cbcSMatt Macy /* 1672eda14cbcSMatt Macy * This function is called when we have read all copies of the data and need 1673eda14cbcSMatt Macy * to try to find a combination of copies that gives us the right checksum. 1674eda14cbcSMatt Macy * 1675eda14cbcSMatt Macy * If we pointed to any mirror vdevs, this effectively does the job of the 1676eda14cbcSMatt Macy * mirror. The mirror vdev code can't do its own job because we don't know 1677eda14cbcSMatt Macy * the checksum of each split segment individually. 1678eda14cbcSMatt Macy * 1679eda14cbcSMatt Macy * We have to try every unique combination of copies of split segments, until 1680eda14cbcSMatt Macy * we find one that checksums correctly. Duplicate segment copies are first 1681eda14cbcSMatt Macy * identified and later skipped during reconstruction. This optimization 1682eda14cbcSMatt Macy * reduces the search space and ensures that of the remaining combinations 1683eda14cbcSMatt Macy * at most one is correct. 1684eda14cbcSMatt Macy * 1685eda14cbcSMatt Macy * When the total number of combinations is small they can all be checked.
1686eda14cbcSMatt Macy * For example, if we have 3 segments in the split, and each points to a 1687eda14cbcSMatt Macy * 2-way mirror with unique copies, we will have the following pieces of data: 1688eda14cbcSMatt Macy * 1689eda14cbcSMatt Macy * | mirror child 1690eda14cbcSMatt Macy * split | [0] [1] 1691eda14cbcSMatt Macy * ======|===================== 1692eda14cbcSMatt Macy * A | data_A_0 data_A_1 1693eda14cbcSMatt Macy * B | data_B_0 data_B_1 1694eda14cbcSMatt Macy * C | data_C_0 data_C_1 1695eda14cbcSMatt Macy * 1696eda14cbcSMatt Macy * We will try the following (mirror children)^(number of splits) (2^3=8) 1697eda14cbcSMatt Macy * combinations, which is similar to bitwise-little-endian counting in 1698eda14cbcSMatt Macy * binary. In general each "digit" corresponds to a split segment, and the 1699eda14cbcSMatt Macy * base of each digit is is_children, which can be different for each 1700eda14cbcSMatt Macy * digit. 1701eda14cbcSMatt Macy * 1702eda14cbcSMatt Macy * "low bit" "high bit" 1703eda14cbcSMatt Macy * v v 1704eda14cbcSMatt Macy * data_A_0 data_B_0 data_C_0 1705eda14cbcSMatt Macy * data_A_1 data_B_0 data_C_0 1706eda14cbcSMatt Macy * data_A_0 data_B_1 data_C_0 1707eda14cbcSMatt Macy * data_A_1 data_B_1 data_C_0 1708eda14cbcSMatt Macy * data_A_0 data_B_0 data_C_1 1709eda14cbcSMatt Macy * data_A_1 data_B_0 data_C_1 1710eda14cbcSMatt Macy * data_A_0 data_B_1 data_C_1 1711eda14cbcSMatt Macy * data_A_1 data_B_1 data_C_1 1712eda14cbcSMatt Macy * 1713eda14cbcSMatt Macy * Note that the split segments may be on the same or different top-level 1714eda14cbcSMatt Macy * vdevs. In either case, we may need to try lots of combinations (see 1715eda14cbcSMatt Macy * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror 1716eda14cbcSMatt Macy * has small silent errors on all of its children, we can still reconstruct 1717eda14cbcSMatt Macy * the correct data, as long as those errors are at sufficiently-separated 1718eda14cbcSMatt Macy * offsets (specifically, separated by the largest block size - default of 1719eda14cbcSMatt Macy * 128KB, but up to 16MB). 1720eda14cbcSMatt Macy */ 1721eda14cbcSMatt Macy static void 1722eda14cbcSMatt Macy vdev_indirect_reconstruct_io_done(zio_t *zio) 1723eda14cbcSMatt Macy { 1724eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1725eda14cbcSMatt Macy boolean_t known_good = B_FALSE; 1726eda14cbcSMatt Macy int error; 1727eda14cbcSMatt Macy 1728eda14cbcSMatt Macy iv->iv_unique_combinations = 1; 1729eda14cbcSMatt Macy iv->iv_attempts_max = UINT64_MAX; 1730eda14cbcSMatt Macy 1731eda14cbcSMatt Macy if (zfs_reconstruct_indirect_combinations_max > 0) 1732eda14cbcSMatt Macy iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; 1733eda14cbcSMatt Macy 1734eda14cbcSMatt Macy /* 1735eda14cbcSMatt Macy * If nonzero, every 1/x blocks will be damaged, in order to validate 1736eda14cbcSMatt Macy * reconstruction when there are split segments with damaged copies. 1737eda14cbcSMatt Macy * Known_good will be TRUE when reconstruction is known to be possible. 1738eda14cbcSMatt Macy */ 1739eda14cbcSMatt Macy if (zfs_reconstruct_indirect_damage_fraction != 0 && 174033b8c039SMartin Matuska random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0) 1741eda14cbcSMatt Macy known_good = (vdev_indirect_splits_damage(iv, zio) == 0); 1742eda14cbcSMatt Macy 1743eda14cbcSMatt Macy /* 1744eda14cbcSMatt Macy * Determine the unique children for a split segment and add them 1745eda14cbcSMatt Macy * to the is_unique_child list. 
By restricting reconstruction 1746eda14cbcSMatt Macy * to these children, only unique combinations will be considered. 1747eda14cbcSMatt Macy * This can vastly reduce the search space when there are a large 1748eda14cbcSMatt Macy * number of indirect splits. 1749eda14cbcSMatt Macy */ 1750eda14cbcSMatt Macy for (indirect_split_t *is = list_head(&iv->iv_splits); 1751eda14cbcSMatt Macy is != NULL; is = list_next(&iv->iv_splits, is)) { 1752eda14cbcSMatt Macy is->is_unique_children = 0; 1753eda14cbcSMatt Macy 1754eda14cbcSMatt Macy for (int i = 0; i < is->is_children; i++) { 1755eda14cbcSMatt Macy indirect_child_t *ic_i = &is->is_child[i]; 1756eda14cbcSMatt Macy 1757eda14cbcSMatt Macy if (ic_i->ic_data == NULL || 1758eda14cbcSMatt Macy ic_i->ic_duplicate != NULL) 1759eda14cbcSMatt Macy continue; 1760eda14cbcSMatt Macy 1761eda14cbcSMatt Macy for (int j = i + 1; j < is->is_children; j++) { 1762eda14cbcSMatt Macy indirect_child_t *ic_j = &is->is_child[j]; 1763eda14cbcSMatt Macy 1764eda14cbcSMatt Macy if (ic_j->ic_data == NULL || 1765eda14cbcSMatt Macy ic_j->ic_duplicate != NULL) 1766eda14cbcSMatt Macy continue; 1767eda14cbcSMatt Macy 1768eda14cbcSMatt Macy if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) 1769eda14cbcSMatt Macy ic_j->ic_duplicate = ic_i; 1770eda14cbcSMatt Macy } 1771eda14cbcSMatt Macy 1772eda14cbcSMatt Macy is->is_unique_children++; 1773eda14cbcSMatt Macy list_insert_tail(&is->is_unique_child, ic_i); 1774eda14cbcSMatt Macy } 1775eda14cbcSMatt Macy 1776eda14cbcSMatt Macy /* Reconstruction is impossible, no valid children */ 1777eda14cbcSMatt Macy EQUIV(list_is_empty(&is->is_unique_child), 1778eda14cbcSMatt Macy is->is_unique_children == 0); 1779eda14cbcSMatt Macy if (list_is_empty(&is->is_unique_child)) { 1780eda14cbcSMatt Macy zio->io_error = EIO; 1781eda14cbcSMatt Macy vdev_indirect_all_checksum_errors(zio); 1782eda14cbcSMatt Macy zio_checksum_verified(zio); 1783eda14cbcSMatt Macy return; 1784eda14cbcSMatt Macy } 1785eda14cbcSMatt Macy 1786eda14cbcSMatt Macy iv->iv_unique_combinations *= is->is_unique_children; 1787eda14cbcSMatt Macy } 1788eda14cbcSMatt Macy 1789eda14cbcSMatt Macy if (iv->iv_unique_combinations <= iv->iv_attempts_max) 1790eda14cbcSMatt Macy error = vdev_indirect_splits_enumerate_all(iv, zio); 1791eda14cbcSMatt Macy else 1792eda14cbcSMatt Macy error = vdev_indirect_splits_enumerate_randomly(iv, zio); 1793eda14cbcSMatt Macy 1794eda14cbcSMatt Macy if (error != 0) { 1795eda14cbcSMatt Macy /* All attempted combinations failed. */ 1796eda14cbcSMatt Macy ASSERT3B(known_good, ==, B_FALSE); 1797eda14cbcSMatt Macy zio->io_error = error; 1798eda14cbcSMatt Macy vdev_indirect_all_checksum_errors(zio); 1799eda14cbcSMatt Macy } else { 1800eda14cbcSMatt Macy /* 1801eda14cbcSMatt Macy * The checksum has been successfully validated. Issue 1802eda14cbcSMatt Macy * repair I/Os to any copies of splits which don't match 1803eda14cbcSMatt Macy * the validated version. 1804eda14cbcSMatt Macy */ 1805eda14cbcSMatt Macy ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); 1806eda14cbcSMatt Macy vdev_indirect_repair(zio); 1807eda14cbcSMatt Macy zio_checksum_verified(zio); 1808eda14cbcSMatt Macy } 1809eda14cbcSMatt Macy } 1810eda14cbcSMatt Macy 1811eda14cbcSMatt Macy static void 1812eda14cbcSMatt Macy vdev_indirect_io_done(zio_t *zio) 1813eda14cbcSMatt Macy { 1814eda14cbcSMatt Macy indirect_vsd_t *iv = zio->io_vsd; 1815eda14cbcSMatt Macy 1816eda14cbcSMatt Macy if (iv->iv_reconstruct) { 1817eda14cbcSMatt Macy /* 1818eda14cbcSMatt Macy * We have read all copies of the data (e.g. 
from mirrors), 1819eda14cbcSMatt Macy * either because this was a scrub/resilver, or because the 1820eda14cbcSMatt Macy * one-copy read didn't checksum correctly. 1821eda14cbcSMatt Macy */ 1822eda14cbcSMatt Macy vdev_indirect_reconstruct_io_done(zio); 1823eda14cbcSMatt Macy return; 1824eda14cbcSMatt Macy } 1825eda14cbcSMatt Macy 1826eda14cbcSMatt Macy if (!iv->iv_split_block) { 1827eda14cbcSMatt Macy /* 1828eda14cbcSMatt Macy * This was not a split block, so we passed the BP down, 1829eda14cbcSMatt Macy * and the checksum was handled by the (one) child zio. 1830eda14cbcSMatt Macy */ 1831eda14cbcSMatt Macy return; 1832eda14cbcSMatt Macy } 1833eda14cbcSMatt Macy 1834eda14cbcSMatt Macy zio_bad_cksum_t zbc; 1835eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 1836*87bf66d4SMartin Matuska /* 1837*87bf66d4SMartin Matuska * Any Direct I/O read that has a checksum error must be treated as 1838*87bf66d4SMartin Matuska * suspicious as the contents of the buffer could be getting 1839*87bf66d4SMartin Matuska * manipulated while the I/O is taking place. The checksum verify error 1840*87bf66d4SMartin Matuska * will be reported to the top-level VDEV. 1841*87bf66d4SMartin Matuska */ 1842*87bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 1843*87bf66d4SMartin Matuska zio->io_error = ret; 1844*87bf66d4SMartin Matuska zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 1845*87bf66d4SMartin Matuska zio_dio_chksum_verify_error_report(zio); 1846*87bf66d4SMartin Matuska ret = 0; 1847*87bf66d4SMartin Matuska } 1848*87bf66d4SMartin Matuska 1849eda14cbcSMatt Macy if (ret == 0) { 1850eda14cbcSMatt Macy zio_checksum_verified(zio); 1851eda14cbcSMatt Macy return; 1852eda14cbcSMatt Macy } 1853eda14cbcSMatt Macy 1854eda14cbcSMatt Macy /* 1855eda14cbcSMatt Macy * The checksum didn't match. Read all copies of all splits, and 1856eda14cbcSMatt Macy * then we will try to reconstruct. The next time 1857eda14cbcSMatt Macy * vdev_indirect_io_done() is called, iv_reconstruct will be set. 
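 * (vdev_indirect_read_all() sets iv_reconstruct, and zio_vdev_io_redone()
 * below re-queues the zio so this function runs again once the additional
 * child reads have completed.)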
1858eda14cbcSMatt Macy */ 1859eda14cbcSMatt Macy vdev_indirect_read_all(zio); 1860eda14cbcSMatt Macy 1861eda14cbcSMatt Macy zio_vdev_io_redone(zio); 1862eda14cbcSMatt Macy } 1863eda14cbcSMatt Macy 1864eda14cbcSMatt Macy vdev_ops_t vdev_indirect_ops = { 18657877fdebSMatt Macy .vdev_op_init = NULL, 18667877fdebSMatt Macy .vdev_op_fini = NULL, 1867eda14cbcSMatt Macy .vdev_op_open = vdev_indirect_open, 1868eda14cbcSMatt Macy .vdev_op_close = vdev_indirect_close, 1869eda14cbcSMatt Macy .vdev_op_asize = vdev_default_asize, 18707877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 18717877fdebSMatt Macy .vdev_op_min_alloc = NULL, 1872eda14cbcSMatt Macy .vdev_op_io_start = vdev_indirect_io_start, 1873eda14cbcSMatt Macy .vdev_op_io_done = vdev_indirect_io_done, 1874eda14cbcSMatt Macy .vdev_op_state_change = NULL, 1875eda14cbcSMatt Macy .vdev_op_need_resilver = NULL, 1876eda14cbcSMatt Macy .vdev_op_hold = NULL, 1877eda14cbcSMatt Macy .vdev_op_rele = NULL, 1878eda14cbcSMatt Macy .vdev_op_remap = vdev_indirect_remap, 1879eda14cbcSMatt Macy .vdev_op_xlate = NULL, 18807877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 18817877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 18827877fdebSMatt Macy .vdev_op_config_generate = NULL, 18837877fdebSMatt Macy .vdev_op_nparity = NULL, 18847877fdebSMatt Macy .vdev_op_ndisks = NULL, 1885eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ 1886eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* leaf vdev */ 1887eda14cbcSMatt Macy }; 1888eda14cbcSMatt Macy 1889eda14cbcSMatt Macy EXPORT_SYMBOL(spa_condense_fini); 1890eda14cbcSMatt Macy EXPORT_SYMBOL(spa_start_indirect_condensing_thread); 1891eda14cbcSMatt Macy EXPORT_SYMBOL(spa_condense_indirect_start_sync); 1892eda14cbcSMatt Macy EXPORT_SYMBOL(spa_condense_init); 1893eda14cbcSMatt Macy EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); 1894eda14cbcSMatt Macy EXPORT_SYMBOL(vdev_indirect_mark_obsolete); 1895eda14cbcSMatt Macy EXPORT_SYMBOL(vdev_indirect_should_condense); 1896eda14cbcSMatt Macy EXPORT_SYMBOL(vdev_indirect_sync_obsolete); 1897eda14cbcSMatt Macy EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); 1898eda14cbcSMatt Macy EXPORT_SYMBOL(vdev_obsolete_sm_object); 1899eda14cbcSMatt Macy 1900c03c5b1cSMartin Matuska ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, 1901c03c5b1cSMartin Matuska ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); 1902eda14cbcSMatt Macy 1903be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, 1904c03c5b1cSMartin Matuska ZMOD_RW, 1905c03c5b1cSMartin Matuska "Minimum obsolete percent of bytes in the mapping " 1906c03c5b1cSMartin Matuska "to attempt condensing"); 190716038816SMartin Matuska 1908dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, 1909eda14cbcSMatt Macy "Don't bother condensing if the mapping uses less than this amount of " 1910eda14cbcSMatt Macy "memory"); 1911eda14cbcSMatt Macy 1912dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, 1913c03c5b1cSMartin Matuska ZMOD_RW, 1914eda14cbcSMatt Macy "Minimum size obsolete spacemap to attempt condensing"); 1915eda14cbcSMatt Macy 1916c03c5b1cSMartin Matuska ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, 1917be181ee2SMartin Matuska UINT, ZMOD_RW, 1918eda14cbcSMatt Macy "Used by tests to ensure certain actions happen in the middle of a " 1919eda14cbcSMatt Macy "condense. 
A maximum value of 1 should be sufficient."); 1920eda14cbcSMatt Macy 1921c03c5b1cSMartin Matuska ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, 1922be181ee2SMartin Matuska UINT, ZMOD_RW, 1923eda14cbcSMatt Macy "Maximum number of combinations when reconstructing split segments"); 1924
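/*
 * These tunables surface through each platform's usual mechanism; for
 * example, zfs_reconstruct_indirect_combinations_max is typically visible
 * under /sys/module/zfs/parameters/ on Linux and under the
 * vfs.zfs.reconstruct sysctl tree on FreeBSD (exact names may vary by
 * platform and release).
 */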