Lines Matching +full:data +full:- +full:mapping

42 * mapping from old location on the removed device to the new location
43 * on another device in the pool and use this mapping whenever we need
44 * to access the DVA. Unfortunately, this mapping did not respect
52 * - I/Os to this vdev use the callback to determine where the
53 * data is now located, and issue child I/Os for each segment's new
56 * - frees and claims to this vdev use the callback to free or claim
67 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
68 * keep track of how much of each mapping entry is obsolete. When
70 * the memory used by the mapping. The complete picture of obsolescence
71 * is given by the following data structures, described below:
72 * - the entry-specific obsolete count
73 * - the vdev-specific obsolete spacemap
74 * - the pool-specific obsolete bpobj
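
The callback mechanism described above (source lines 52-56) is the heart of the scheme: an I/O aimed at the removed vdev is decomposed into one child I/O (or free/claim) per mapped segment. Below is a minimal, hypothetical sketch of that shape; the types and names are illustrative stand-ins, not the real vdev_indirect_mapping API, and all locking is omitted.

#include <stdint.h>

/* Hypothetical: where one piece of the original extent lives now. */
typedef struct remapped_segment {
	uint64_t seg_dst_vdev;    /* destination vdev id */
	uint64_t seg_dst_offset;  /* offset on the destination vdev */
	uint64_t seg_size;        /* length of this piece */
} remapped_segment_t;

/* Hypothetical callback: invoked once per remapped segment. */
typedef void (*remap_cb_t)(uint64_t split_offset,
    const remapped_segment_t *seg, void *arg);

/*
 * Sketch: walk the mapping covering the extent being accessed on the
 * removed vdev and invoke the callback for each segment.  The real code
 * does this against vdev_indirect_mapping entries, under
 * vdev_indirect_rwlock.
 */
static void
sketch_remap(const remapped_segment_t *map, int nseg, uint64_t size,
    remap_cb_t func, void *arg)
{
	uint64_t split_offset = 0;

	for (int i = 0; i < nseg && size > 0; i++) {
		uint64_t len = map[i].seg_size < size ?
		    map[i].seg_size : size;

		func(split_offset, &map[i], arg);
		split_offset += len;
		size -= len;
	}
}
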
76 * == On disk data structures used ==
83 * - Each vic_mapping_object (associated with an indirect vdev) can
86 * the mapping is condensed, entries from the vic_obsolete_sm_object
89 * corresponding mapping entry that were not referenced when the
90 * mapping was last condensed.
92 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
103 * - Each dataset can have a ds_remap_deadlist object. This is a
110 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
119 * - When freeing a block: if any DVA is on an indirect vdev, append to
121 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
123 * - When freeing a snapshot: move parts of ds_remap_deadlist to
125 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
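
Read as a pipeline, the bookkeeping above moves a freed range through three homes: it is appended to the pool-wide dp_obsolete_bpobj when the block is freed, migrated into the owning vdev's obsolete spacemap during spa sync, and finally folded into precise per-entry obsolete counts when that vdev's mapping is condensed (the snapshot ds_remap_deadlist path is left out here). The enum below is purely illustrative, not a ZFS type; it just names those stages.

/* Illustrative stages a freed range passes through (not a ZFS type). */
typedef enum obsolete_stage {
	OBS_POOL_BPOBJ,      /* appended to dp_obsolete_bpobj on free */
	OBS_VDEV_SPACEMAP,   /* moved to vic_obsolete_sm_object at spa sync */
	OBS_ENTRY_COUNT      /* folded into per-entry counts at condense */
} obsolete_stage_t;

static obsolete_stage_t
obsolete_next_stage(obsolete_stage_t s)
{
	return (s == OBS_POOL_BPOBJ ? OBS_VDEV_SPACEMAP : OBS_ENTRY_COUNT);
}
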
132 * Condensing an indirect vdev's mapping is the process of determining
133 * the precise counts of obsolete space for each mapping entry (by
135 * writing out a new mapping that contains only referenced entries.
137 * We condense a vdev when we expect the mapping to shrink (see
140 * open-context thread (spa_condense_indirect_thread) to incrementally
141 * create the new mapping object in a way that minimizes the impact on
144 * == Generating a new mapping ==
146 * To generate a new mapping, we follow these steps:
148 * 1. Save the old obsolete space map and create a new mapping object
156 * mapping entry, by incorporating the obsolete space map into the
159 * 3. Iterate through each mapping entry, writing to the new mapping any
161 * obsolete count == mapping length). (See
164 * 4. Destroy the old mapping object and switch over to the new one
172 * iterating where we left off: at vimp_max_offset of the new mapping
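
Steps 1-4 above reduce to a single pass over the old mapping. The sketch below uses hypothetical helpers and a simplified entry (only a length and an obsolete count); the real code works on vdev_indirect_mapping entries and commits batches per txg, but the shape is the same, including resuming from the index that corresponds to the new mapping's vimp_max_offset after a crash.

#include <stdint.h>

/* Hypothetical, simplified mapping entry. */
typedef struct sk_entry {
	uint64_t e_length;     /* bytes this entry maps */
	uint64_t e_obsolete;   /* bytes of it known to be freed */
} sk_entry_t;

/*
 * Sketch of the condense pass: copy every entry that is still (partly)
 * referenced into the new mapping and drop fully obsolete ones.
 * resume_at models restarting where a previous attempt left off.
 */
static uint64_t
sketch_condense(const sk_entry_t *old, uint64_t nentries, uint64_t resume_at,
    void (*append_to_new)(const sk_entry_t *))
{
	uint64_t copied = 0;

	for (uint64_t i = resume_at; i < nentries; i++) {
		if (old[i].e_obsolete == old[i].e_length)
			continue;	/* fully obsolete: not copied */
		append_to_new(&old[i]);
		copied++;
	}
	return (copied);
}
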
179 * Condense if at least this percent of the bytes in the mapping is
183 * i/o); lower values will reduce the mapping size more quickly.
196 * Don't bother condensing if the mapping uses less than this amount of
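
Taken together, these two tunables gate the decision made in vdev_indirect_should_condense() (fragments of which appear further down): condense only when the obsolete fraction is high enough and the mapping itself is large enough to be worth rewriting. The predicate below is a hedged sketch under those assumptions; the threshold values and the exact set of conditions (for example, the obsolete-spacemap size check) are illustrative rather than the verbatim logic.

#include <stdint.h>

/* Illustrative thresholds mirroring the tunables described above. */
static uint64_t sk_condense_obsolete_pct = 25;
static uint64_t sk_condense_min_mapping_bytes = 128ULL * 1024;

/*
 * Sketch only: condense when enough of the mapped space is known to be
 * obsolete and the mapping is big enough that rewriting it pays off.
 */
static int
sketch_should_condense(uint64_t bytes_mapped, uint64_t bytes_obsolete)
{
	if (bytes_mapped < sk_condense_min_mapping_bytes)
		return (0);
	return (bytes_obsolete * 100 / bytes_mapped >=
	    sk_condense_obsolete_pct);
}
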
213 * expensive to check them all. Instead, try at most 100 randomly-selected
228 * need to read all copies of the data (e.g. for scrub or reconstruction).
229 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
230 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
243 int ic_error; /* set when a child does not contain the data */
248 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
261 vdev_t *is_vdev; /* top-level vdev */
279 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
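
The structure comments above describe a three-level per-read layout: the zio's "Vdev-Specific Data" holds a list of splits, each split covers one contiguous piece of the original block, and each split holds one child per possible copy of that piece (one for a plain top-level vdev, one per mirror child otherwise). The sketch below shows only those relationships; it is a simplified stand-in, not the real indirect_vsd_t/indirect_split_t/indirect_child_t definitions, and it keeps only fields named somewhere in this listing.

#include <stdint.h>

struct abd;   /* opaque data buffer */
struct vdev;  /* opaque vdev */

/* One possible source of a split segment's data (simplified). */
typedef struct sk_indirect_child {
	struct abd *ic_data;    /* NULL until that copy has been read */
	struct vdev *ic_vdev;   /* vdev to read this copy from */
	int ic_error;           /* set when this copy is known missing */
} sk_indirect_child_t;

/* One contiguous piece of the original block (simplified). */
typedef struct sk_indirect_split {
	struct vdev *is_vdev;        /* top-level vdev the piece maps to */
	uint64_t is_target_offset;   /* offset on is_vdev */
	uint64_t is_split_offset;    /* offset within the original block */
	uint64_t is_size;
	int is_children;             /* 1, or the mirror child count */
	sk_indirect_child_t is_child[];
} sk_indirect_split_t;

/* Per-zio state: list of splits plus reconstruction bookkeeping. */
typedef struct sk_indirect_vsd {
	int iv_split_block;              /* block maps to more than one split */
	int iv_reconstruct;              /* all copies read; try combinations */
	int iv_nsplits;
	sk_indirect_split_t **iv_splits; /* a list_t in the real code */
} sk_indirect_vsd_t;
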
294 indirect_vsd_t *iv = zio->io_vsd;
297 while ((is = list_remove_head(&iv->iv_splits)) != NULL) {
298 for (int c = 0; c < is->is_children; c++) {
299 indirect_child_t *ic = &is->is_child[c];
300 if (ic->ic_data != NULL)
301 abd_free(ic->ic_data);
305 while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
308 list_destroy(&is->is_unique_child);
311 offsetof(indirect_split_t, is_child[is->is_children]));
326 spa_t *spa = vd->vdev_spa;
328 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
329 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
332 vd->vdev_indirect_mapping, offset) != NULL);
335 mutex_enter(&vd->vdev_obsolete_lock);
336 range_tree_add(vd->vdev_obsolete_segments, offset, size);
337 mutex_exit(&vd->vdev_obsolete_lock);
355 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
363 &spa->spa_condensing_indirect_phys;
365 objset_t *mos = spa->spa_meta_objset;
368 list_create(&sci->sci_new_mapping_entries[i],
373 sci->sci_new_mapping =
374 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
383 list_destroy(&sci->sci_new_mapping_entries[i]);
385 if (sci->sci_new_mapping != NULL)
386 vdev_indirect_mapping_close(sci->sci_new_mapping);
394 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
395 spa_t *spa = vd->vdev_spa;
397 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
405 if (spa->spa_condensing_indirect != NULL)
412 * The mapping object size must not change while we are
416 if (vd->vdev_ops != &vdev_indirect_ops)
425 if (vd->vdev_obsolete_sm == NULL) {
430 ASSERT(vd->vdev_obsolete_sm != NULL);
432 ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));
435 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
437 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
443 * obsolete, condense (unless the mapping is already small enough).
445 * by the mapping.
451 "spacemap covers %d%% of %lluMB mapping",
452 (u_longlong_t)vd->vdev_id,
465 (u_longlong_t)vd->vdev_id,
477 * mapping and replacing it with the new one.
483 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
485 &spa->spa_condensing_indirect_phys;
486 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
487 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
488 objset_t *mos = spa->spa_meta_objset;
489 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
492 vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
495 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
496 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
498 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
500 ASSERT(vic->vic_mapping_object != 0);
501 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
502 ASSERT(scip->scip_next_mapping_object != 0);
503 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
508 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
509 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
510 vd->vdev_indirect_mapping = sci->sci_new_mapping;
511 rw_exit(&vd->vdev_indirect_rwlock);
513 sci->sci_new_mapping = NULL;
514 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
515 vic->vic_mapping_object = scip->scip_next_mapping_object;
516 scip->scip_next_mapping_object = 0;
518 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
520 scip->scip_prev_obsolete_sm_object = 0;
522 scip->scip_vdev = 0;
526 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
527 spa->spa_condensing_indirect = NULL;
530 "new mapping object %llu has %llu entries "
532 (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
533 (u_longlong_t)vic->vic_mapping_object,
536 vdev_config_dirty(spa->spa_root_vdev);
540 * This sync task appends entries to the new mapping object.
547 spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;
550 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
552 vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
553 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
554 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
558 * Open-context function to add one entry to the new mapping. The new
565 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
567 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
569 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
578 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
585 vime->vime_mapping = *vimep;
586 vime->vime_obsolete_count = count;
587 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
596 spa_t *spa = vd->vdev_spa;
598 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
602 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
603 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
606 (u_longlong_t)vd->vdev_id,
613 "at index %llu", (u_longlong_t)vd->vdev_id,
619 &old_mapping->vim_entries[mapi];
620 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
646 return (spa->spa_condensing_indirect != NULL);
655 ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
657 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
661 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
663 &spa->spa_condensing_indirect_phys;
666 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
669 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
670 ASSERT(scip->scip_next_mapping_object != 0);
671 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
672 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
682 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
685 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
686 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
695 * Generate new mapping. Determine what index to continue from
697 * new mapping.
700 vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
702 /* We haven't written anything to the new mapping yet. */
720 * We've already written the whole new mapping.
727 start_index = entry - old_mapping->vim_entries;
758 spa_t *spa = vd->vdev_spa;
760 &spa->spa_condensing_indirect_phys;
762 ASSERT0(scip->scip_next_mapping_object);
763 ASSERT0(scip->scip_prev_obsolete_sm_object);
764 ASSERT0(scip->scip_vdev);
766 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
768 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
774 scip->scip_vdev = vd->vdev_id;
775 scip->scip_next_mapping_object =
776 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
778 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
784 space_map_close(vd->vdev_obsolete_sm);
785 vd->vdev_obsolete_sm = NULL;
786 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
789 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
794 ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
795 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
799 (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
800 (u_longlong_t)scip->scip_prev_obsolete_sm_object,
801 (u_longlong_t)scip->scip_next_mapping_object);
803 zthr_wakeup(spa->spa_condense_zthr);
815 spa_t *spa = vd->vdev_spa;
816 vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
818 ASSERT3U(vic->vic_mapping_object, !=, 0);
819 ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
820 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
826 obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
829 ASSERT(vd->vdev_top_zap != 0);
830 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
837 VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
838 spa->spa_meta_objset, obsolete_sm_object,
839 0, vd->vdev_asize, 0));
842 ASSERT(vd->vdev_obsolete_sm != NULL);
844 space_map_object(vd->vdev_obsolete_sm));
846 space_map_write(vd->vdev_obsolete_sm,
847 vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
848 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
854 int error = zap_lookup(spa->spa_meta_objset,
857 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
858 &spa->spa_condensing_indirect_phys);
861 spa->spa_condensing_indirect =
875 if (spa->spa_condensing_indirect != NULL) {
876 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
877 spa->spa_condensing_indirect = NULL;
884 ASSERT3P(spa->spa_condense_zthr, ==, NULL);
885 spa->spa_condense_zthr = zthr_create("z_indirect_condense",
898 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
900 if (vd->vdev_top_zap == 0) {
905 int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
923 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
925 if (vd->vdev_top_zap == 0) {
931 int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
953 *psize = *max_psize = vd->vdev_asize +
955 *logical_ashift = vd->vdev_ashift;
956 *physical_ashift = vd->vdev_physical_ashift;
972 rs->rs_vd = vd;
973 rs->rs_offset = offset;
974 rs->rs_asize = asize;
975 rs->rs_split_offset = split_offset;
981 * physical entries of the indirect mapping that correspond to the extent
983 * is populated with the number of mapping entries that were duplicated.
986 * This ensures that the mapping won't change due to condensing as we
997 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1000 ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
1008 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
1013 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
1014 uint64_t inner_size = MIN(asize, size - inner_offset);
1017 asize -= inner_size;
1045 * has been split into multiple sections in our mapping, we keep track
1049 * this scenario the callbacks in each split block won't occur in-order in
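
The rs_* fields shown earlier (rs_vd, rs_offset, rs_asize, rs_split_offset) together with the comment above suggest a worklist of segments rather than recursion: when a destination turns out to be another indirect vdev, a fresh segment is queued, which is also why the per-split callbacks are not guaranteed to arrive in offset order. Below is a hedged sketch of that worklist shape, with hypothetical names.

#include <stdint.h>
#include <stdlib.h>

/* Simplified stand-in for the rs_* fields shown above. */
typedef struct sk_remap_segment {
	uint64_t rs_offset;        /* offset on the (indirect) source vdev */
	uint64_t rs_asize;         /* bytes left to remap in this segment */
	uint64_t rs_split_offset;  /* offset within the original block */
	struct sk_remap_segment *rs_next;
} sk_remap_segment_t;

/*
 * Sketch: drain a worklist of segments.  The handler may push follow-up
 * segments (e.g. when a destination is itself indirect), so completion
 * order follows the worklist, not the original block offsets.
 */
static void
sketch_remap_worklist(sk_remap_segment_t *head,
    void (*handle)(sk_remap_segment_t *rs, sk_remap_segment_t **worklist))
{
	while (head != NULL) {
		sk_remap_segment_t *rs = head;

		head = rs->rs_next;
		handle(rs, &head);
		free(rs);
	}
}
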
1062 spa_t *spa = vd->vdev_spa;
1069 vdev_t *v = rs->rs_vd;
1073 ASSERT(rs->rs_asize > 0);
1078 * prevent the mapping from being changed by condensing.
1083 * our copy of the mapping. Once we are done with the
1085 * of the indirect mapping entries that are relevant to it.
1094 rw_enter(&v->vdev_indirect_rwlock, RW_READER);
1095 ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
1097 vdev_indirect_mapping_entry_phys_t *mapping =
1099 rs->rs_offset, rs->rs_asize, &num_entries);
1100 ASSERT3P(mapping, !=, NULL);
1102 rw_exit(&v->vdev_indirect_rwlock);
1113 vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
1116 ASSERT3U(rs->rs_asize, >, 0);
1118 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
1119 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
1120 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
1122 ASSERT3U(rs->rs_offset, >=,
1124 ASSERT3U(rs->rs_offset, <,
1126 ASSERT3U(dst_vdev, !=, v->vdev_id);
1128 uint64_t inner_offset = rs->rs_offset -
1131 MIN(rs->rs_asize, size - inner_offset);
1136 if (dst_v->vdev_ops == &vdev_indirect_ops) {
1139 inner_size, rs->rs_split_offset));
1154 func(rs->rs_split_offset + inner_half, dst_v,
1158 func(rs->rs_split_offset, dst_v,
1162 func(rs->rs_split_offset, dst_v,
1167 rs->rs_offset += inner_size;
1168 rs->rs_asize -= inner_size;
1169 rs->rs_split_offset += inner_size;
1171 VERIFY0(rs->rs_asize);
1173 kmem_free(mapping, num_entries * sizeof (*mapping));
1182 zio_t *pio = zio->io_private;
1184 mutex_enter(&pio->io_lock);
1185 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
1186 mutex_exit(&pio->io_lock);
1188 abd_free(zio->io_abd);
1200 indirect_vsd_t *iv = zio->io_vsd;
1204 if (vd->vdev_ops == &vdev_indirect_ops)
1208 if (vd->vdev_ops == &vdev_mirror_ops)
1209 n = vd->vdev_children;
1214 is->is_children = n;
1215 is->is_size = size;
1216 is->is_split_offset = split_offset;
1217 is->is_target_offset = offset;
1218 is->is_vdev = vd;
1219 list_create(&is->is_unique_child, sizeof (indirect_child_t),
1223 * Note that we only consider multiple copies of the data for
1228 if (vd->vdev_ops == &vdev_mirror_ops) {
1230 is->is_child[i].ic_vdev = vd->vdev_child[i];
1231 list_link_init(&is->is_child[i].ic_node);
1234 is->is_child[0].ic_vdev = vd;
1237 list_insert_tail(&iv->iv_splits, is);
1243 indirect_child_t *ic = zio->io_private;
1245 if (zio->io_error != 0) {
1247 * Clear ic_data to indicate that we do not have data for this
1250 abd_free(ic->ic_data);
1251 ic->ic_data = NULL;
1261 indirect_vsd_t *iv = zio->io_vsd;
1263 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1265 for (indirect_split_t *is = list_head(&iv->iv_splits);
1266 is != NULL; is = list_next(&iv->iv_splits, is)) {
1267 for (int i = 0; i < is->is_children; i++) {
1268 indirect_child_t *ic = &is->is_child[i];
1270 if (!vdev_readable(ic->ic_vdev))
1274 * If a child is missing the data, set ic_error. Used
1279 if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING,
1280 zio->io_txg, 1))
1281 ic->ic_error = SET_ERROR(ESTALE);
1283 ic->ic_data = abd_alloc_sametype(zio->io_abd,
1284 is->is_size);
1285 ic->ic_duplicate = NULL;
1288 ic->ic_vdev, is->is_target_offset, ic->ic_data,
1289 is->is_size, zio->io_type, zio->io_priority, 0,
1293 iv->iv_reconstruct = B_TRUE;
1299 spa_t *spa __maybe_unused = zio->io_spa;
1301 list_create(&iv->iv_splits,
1304 zio->io_vsd = iv;
1305 zio->io_vsd_ops = &vdev_indirect_vsd_ops;
1308 if (zio->io_type != ZIO_TYPE_READ) {
1309 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
1314 ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
1318 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
1321 indirect_split_t *first = list_head(&iv->iv_splits);
1323 if (first->is_size == zio->io_size) {
1326 * data, which will checksum the same as the original data.
1335 * on non-indirect vdevs. This allows us to be less strict
1338 ASSERT0(first->is_split_offset);
1339 ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
1340 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
1341 first->is_vdev, first->is_target_offset,
1342 abd_get_offset(zio->io_abd, 0),
1343 zio->io_size, zio->io_type, zio->io_priority, 0,
1346 iv->iv_split_block = B_TRUE;
1347 if (zio->io_type == ZIO_TYPE_READ &&
1348 zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1358 * split segment, from the top-level vdev. Since
1361 * we get the right data. E.g. if it's a mirror,
1369 for (indirect_split_t *is = list_head(&iv->iv_splits);
1370 is != NULL; is = list_next(&iv->iv_splits, is)) {
1372 is->is_vdev, is->is_target_offset,
1373 abd_get_offset_size(zio->io_abd,
1374 is->is_split_offset, is->is_size),
1375 is->is_size, zio->io_type,
1376 zio->io_priority, 0,
1393 vdev_t *vd = ic->ic_vdev;
1395 if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
1398 mutex_enter(&vd->vdev_stat_lock);
1399 vd->vdev_stat.vs_checksum_errors++;
1400 mutex_exit(&vd->vdev_stat_lock);
1403 abd_t *bad_abd = ic->ic_data;
1404 abd_t *good_abd = is->is_good_child->ic_data;
1405 (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
1406 is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
1411 * each split segment's correct data (is_good_child's ic_data) with each
1412 * other copy of the data. If they differ, then we overwrite the bad data
1414 * if a vdev is missing a copy of the data we set ic_error and the read is
1419 * (based on which copies actually read bad data, as opposed to which we
1426 indirect_vsd_t *iv = zio->io_vsd;
1428 if (!spa_writeable(zio->io_spa))
1431 for (indirect_split_t *is = list_head(&iv->iv_splits);
1432 is != NULL; is = list_next(&iv->iv_splits, is)) {
1433 for (int c = 0; c < is->is_children; c++) {
1434 indirect_child_t *ic = &is->is_child[c];
1435 if (ic == is->is_good_child)
1437 if (ic->ic_data == NULL)
1439 if (ic->ic_duplicate == is->is_good_child)
1443 ic->ic_vdev, is->is_target_offset,
1444 is->is_good_child->ic_data, is->is_size,
1451 * a copy of the data, so suppress incrementing the
1454 if (ic->ic_error == ESTALE)
1468 indirect_vsd_t *iv = zio->io_vsd;
1470 if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
1473 for (indirect_split_t *is = list_head(&iv->iv_splits);
1474 is != NULL; is = list_next(&iv->iv_splits, is)) {
1475 for (int c = 0; c < is->is_children; c++) {
1476 indirect_child_t *ic = &is->is_child[c];
1478 if (ic->ic_data == NULL)
1481 vdev_t *vd = ic->ic_vdev;
1483 mutex_enter(&vd->vdev_stat_lock);
1484 vd->vdev_stat.vs_checksum_errors++;
1485 mutex_exit(&vd->vdev_stat_lock);
1486 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
1487 NULL, zio, is->is_target_offset, is->is_size,
1494 * Copy data from all the splits to a main zio then validate the checksum.
1502 for (indirect_split_t *is = list_head(&iv->iv_splits);
1503 is != NULL; is = list_next(&iv->iv_splits, is)) {
1505 ASSERT3P(is->is_good_child->ic_data, !=, NULL);
1506 ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
1508 abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
1509 is->is_split_offset, 0, is->is_size);
1527 iv->iv_attempts = 0;
1529 for (indirect_split_t *is = list_head(&iv->iv_splits);
1530 is != NULL; is = list_next(&iv->iv_splits, is))
1531 is->is_good_child = list_head(&is->is_unique_child);
1534 iv->iv_attempts++;
1540 for (indirect_split_t *is = list_head(&iv->iv_splits);
1541 is != NULL; is = list_next(&iv->iv_splits, is)) {
1542 is->is_good_child = list_next(&is->is_unique_child,
1543 is->is_good_child);
1544 if (is->is_good_child != NULL) {
1549 is->is_good_child = list_head(&is->is_unique_child);
1553 ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
1566 iv->iv_attempts = 0;
1568 while (iv->iv_attempts < iv->iv_attempts_max) {
1569 iv->iv_attempts++;
1571 for (indirect_split_t *is = list_head(&iv->iv_splits);
1572 is != NULL; is = list_next(&iv->iv_splits, is)) {
1573 indirect_child_t *ic = list_head(&is->is_unique_child);
1574 int children = is->is_unique_children;
1576 for (int i = random_in_range(children); i > 0; i--)
1577 ic = list_next(&is->is_unique_child, ic);
1580 is->is_good_child = ic;
1602 for (indirect_split_t *is = list_head(&iv->iv_splits);
1603 is != NULL; is = list_next(&iv->iv_splits, is)) {
1604 is->is_unique_children = 0;
1606 for (int i = 0; i < is->is_children; i++) {
1607 indirect_child_t *ic = &is->is_child[i];
1608 if (ic->ic_data != NULL) {
1609 is->is_unique_children++;
1610 list_insert_tail(&is->is_unique_child, ic);
1614 if (list_is_empty(&is->is_unique_child)) {
1621 * Set each is_good_child to a randomly-selected child which
1622 * is known to contain validated data.
1632 * Set iv->iv_attempts_max such that all unique combinations will
1635 iv->iv_attempts_max = 1;
1637 for (indirect_split_t *is = list_head(&iv->iv_splits);
1638 is != NULL; is = list_next(&iv->iv_splits, is)) {
1639 for (int c = 0; c < is->is_children; c++) {
1640 indirect_child_t *ic = &is->is_child[c];
1642 if (ic == is->is_good_child)
1644 if (ic->ic_data == NULL)
1647 abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
1650 iv->iv_attempts_max *= 2;
1651 if (iv->iv_attempts_max >= (1ULL << 12)) {
1652 iv->iv_attempts_max = UINT64_MAX;
1659 for (indirect_split_t *is = list_head(&iv->iv_splits);
1660 is != NULL; is = list_next(&iv->iv_splits, is)) {
1662 while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
1665 is->is_unique_children = 0;
1672 * This function is called when we have read all copies of the data and need
1687 * 2-way mirror with unique copies, we will have the following pieces of data:
1697 * combinations, which is similar to bitwise-little-endian counting in
1713 * Note that the split segments may be on the same or different top-level
1717 * the correct data, as long as those errors are at sufficiently-separated
1718 * offsets (specifically, separated by the largest block size - default of
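
The counting described above is just an odometer across the splits: advance the first split to its next unique copy and carry into the next split on wrap-around, so three splits each backed by a 2-way mirror give 2*2*2 = 8 combinations. The standalone sketch below demonstrates that enumeration with plain indices standing in for is_good_child choices; it is illustrative, not the reconstruction code itself.

#include <stdio.h>

/*
 * Enumerate every combination of "good child" choices for nsplits split
 * segments, where split s has nuniq[s] unique copies.  This mirrors the
 * bitwise-little-endian counting described above: increment the first
 * split and carry into the next one on wrap-around.
 */
static void
enumerate_combinations(const int *nuniq, int nsplits)
{
	int choice[16] = { 0 };	/* assumes nsplits <= 16 for this sketch */

	for (;;) {
		for (int s = 0; s < nsplits; s++)
			printf("%d%c", choice[s],
			    s == nsplits - 1 ? '\n' : ' ');

		int s = 0;
		while (s < nsplits && ++choice[s] == nuniq[s]) {
			choice[s] = 0;	/* wrap this split, carry onward */
			s++;
		}
		if (s == nsplits)
			break;	/* carried off the end: all done */
	}
}

int
main(void)
{
	/* Three splits, each backed by a 2-way mirror: 8 combinations. */
	int nuniq[] = { 2, 2, 2 };

	enumerate_combinations(nuniq, 3);
	return (0);
}

Running it prints the eight choice vectors in the same little-endian order the comment describes (0 0 0, 1 0 0, 0 1 0, 1 1 0, and so on).
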
1724 indirect_vsd_t *iv = zio->io_vsd;
1728 iv->iv_unique_combinations = 1;
1729 iv->iv_attempts_max = UINT64_MAX;
1732 iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
1750 for (indirect_split_t *is = list_head(&iv->iv_splits);
1751 is != NULL; is = list_next(&iv->iv_splits, is)) {
1752 is->is_unique_children = 0;
1754 for (int i = 0; i < is->is_children; i++) {
1755 indirect_child_t *ic_i = &is->is_child[i];
1757 if (ic_i->ic_data == NULL ||
1758 ic_i->ic_duplicate != NULL)
1761 for (int j = i + 1; j < is->is_children; j++) {
1762 indirect_child_t *ic_j = &is->is_child[j];
1764 if (ic_j->ic_data == NULL ||
1765 ic_j->ic_duplicate != NULL)
1768 if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
1769 ic_j->ic_duplicate = ic_i;
1772 is->is_unique_children++;
1773 list_insert_tail(&is->is_unique_child, ic_i);
1777 EQUIV(list_is_empty(&is->is_unique_child),
1778 is->is_unique_children == 0);
1779 if (list_is_empty(&is->is_unique_child)) {
1780 zio->io_error = EIO;
1786 iv->iv_unique_combinations *= is->is_unique_children;
1789 if (iv->iv_unique_combinations <= iv->iv_attempts_max)
1797 zio->io_error = error;
1814 indirect_vsd_t *iv = zio->io_vsd;
1816 if (iv->iv_reconstruct) {
1818 * We have read all copies of the data (e.g. from mirrors),
1820 * one-copy read didn't checksum correctly.
1826 if (!iv->iv_split_block) {
1840 * will be reported to the top-level VDEV.
1842 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
1843 zio->io_error = ret;
1844 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
1905 "Minimum obsolete percent of bytes in the mapping "
1909 "Don't bother condensing if the mapping uses less than this amount of "