Lines Matching +full:segment +full:- +full:no +full:- +full:remap

9  * or https://opensource.org/licenses/CDDL-1.0.
50 * moving on to the next top-level vdev.
82 * The in-core space map representation is more compact than its on-disk form.
83 * The zfs_condense_pct determines how much more compact the in-core
84 * space map representation must be before we compact it on-disk.
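
A minimal sketch of the check this percentage drives, assuming object_size is the current on-disk space map length and optimal_size the estimated minimal representation (both in bytes); the real decision in metaslab_should_condense(), excerpted further below, also folds in record-size handling:

    /* Sketch only: condense once the on-disk form is at least
     * zfs_condense_pct/100 times the optimal representation. */
    static boolean_t
    condense_wanted(uint64_t object_size, uint64_t optimal_size)
    {
            return (object_size >= (optimal_size * zfs_condense_pct) / 100);
    }
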
115 * no metaslab group will be excluded based on this criterion.
146 * active metaslab that exceeds this threshold will no longer keep its active
171 * in a space map to continue allocations in a first-fit fashion.
173 * switch to using best-fit allocations.
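
A sketch of how this switch plays out in the DF allocator (compare the metaslab_df_alloc() excerpt further down); metaslab_df_alloc_threshold, metaslab_df_free_pct, metaslab_df_max_search, and the first-fit helper named metaslab_block_picker() here are assumed names not shown in this listing:

    /* Sketch: stay first-fit from the cursor while the metaslab is healthy,
     * otherwise fall through to the best-fit (size-sorted tree) path. */
    if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
        free_pct < metaslab_df_free_pct) {
            offset = -1;            /* triggers the best-fit branch */
    } else {
            offset = metaslab_block_picker(rt, cursor, size,
                metaslab_df_max_search);
    }
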
181 * high-performance storage.
194 * segments forwards until giving up on finding a segment that the allocation
202 * controls what segment is used. If it is set, we will use the largest free
203 * segment. If it is not set, we will use a segment of exactly the requested
213 * unloaded sooner. These settings are intended to be generous -- to keep
250 * Enable/disable segment-based metaslab selection.
255 * When using segment-based metaslab selection, we will continue
269 * in a given list when running in non-debug mode. We limit the number
270 * of entries in non-debug mode to prevent us from using up too much memory.
285 * To avoid 64-bit overflow, don't set above UINT32_MAX.
297 * Force the per-metaslab range trees to use 64-bit integers to store
303 * By default we only store segments over a certain size in the size-sorted
314 * gang allocation. If that fails then we will have a multi-layer gang
320 * that fails then we will have a multi-layer gang block.
332 * bucketization. E.g. we are looking for a 60K segment, and the best
333 * metaslabs all have free segments in the 32-63K bucket, but the best
384 metaslab_ksp->ks_data = &metaslab_stats;
412 mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
414 mc->mc_spa = spa;
415 mc->mc_ops = ops;
416 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
417 multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
419 for (int i = 0; i < spa->spa_alloc_count; i++) {
420 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
421 mca->mca_rotor = NULL;
422 zfs_refcount_create_tracked(&mca->mca_alloc_slots);
431 spa_t *spa = mc->mc_spa;
433 ASSERT(mc->mc_alloc == 0);
434 ASSERT(mc->mc_deferred == 0);
435 ASSERT(mc->mc_space == 0);
436 ASSERT(mc->mc_dspace == 0);
438 for (int i = 0; i < spa->spa_alloc_count; i++) {
439 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
440 ASSERT(mca->mca_rotor == NULL);
441 zfs_refcount_destroy(&mca->mca_alloc_slots);
443 mutex_destroy(&mc->mc_lock);
444 multilist_destroy(&mc->mc_metaslab_txg_list);
446 mc_allocator[spa->spa_alloc_count]));
458 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
459 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
461 if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
465 vd = mg->mg_vd;
466 ASSERT(vd->vdev_mg != NULL);
467 ASSERT3P(vd->vdev_top, ==, vd);
468 ASSERT3P(mg->mg_class, ==, mc);
469 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
470 } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
479 atomic_add_64(&mc->mc_alloc, alloc_delta);
480 atomic_add_64(&mc->mc_deferred, defer_delta);
481 atomic_add_64(&mc->mc_space, space_delta);
482 atomic_add_64(&mc->mc_dspace, dspace_delta);
488 return (mc->mc_alloc);
494 return (mc->mc_deferred);
500 return (mc->mc_space);
506 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
512 spa_t *spa = mc->mc_spa;
513 vdev_t *rvd = spa->spa_root_vdev;
523 mutex_enter(&mc->mc_lock);
524 for (int c = 0; c < rvd->vdev_children; c++) {
525 vdev_t *tvd = rvd->vdev_child[c];
529 * Skip any holes, uninitialized top-levels, or
532 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
533 mg->mg_class != mc) {
537 IMPLY(mg == mg->mg_vd->vdev_log_mg,
538 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
541 mc_hist[i] += mg->mg_histogram[i];
545 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
548 mutex_exit(&mc->mc_lock);
562 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
565 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
567 for (int c = 0; c < rvd->vdev_children; c++) {
568 vdev_t *tvd = rvd->vdev_child[c];
569 metaslab_group_t *mg = tvd->vdev_mg;
572 * Skip any holes, uninitialized top-levels,
575 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
576 mg->mg_class != mc) {
584 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
585 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
593 fragmentation += mg->mg_fragmentation *
599 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
612 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
615 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
616 for (int c = 0; c < rvd->vdev_children; c++) {
617 vdev_t *tvd = rvd->vdev_child[c];
618 metaslab_group_t *mg = tvd->vdev_mg;
620 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
621 mg->mg_class != mc) {
630 space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
631 1ULL << tvd->vdev_ms_shift, uint64_t);
633 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
640 multilist_t *ml = &mc->mc_metaslab_txg_list;
647 mutex_enter(&msp->ms_lock);
655 if (!multilist_link_active(&msp->ms_class_txg_node)) {
656 mutex_exit(&msp->ms_lock);
657 i--;
664 msp->ms_selected_txg + metaslab_unload_delay &&
665 now > msp->ms_selected_time +
667 (msp->ms_allocator == -1 ||
676 mutex_exit(&msp->ms_lock);
679 mutex_exit(&msp->ms_lock);
693 if (m1->ms_allocator != -1 && m1->ms_primary)
695 else if (m1->ms_allocator != -1 && !m1->ms_primary)
697 if (m2->ms_allocator != -1 && m2->ms_primary)
699 else if (m2->ms_allocator != -1 && !m2->ms_primary)
711 return (-1);
715 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
719 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
721 return (TREE_CMP(m1->ms_start, m2->ms_start));
734 * transitions from allocatable to non-allocatable or vice versa then the
740 vdev_t *vd = mg->mg_vd;
741 metaslab_class_t *mc = mg->mg_class;
742 vdev_stat_t *vs = &vd->vdev_stat;
746 ASSERT(vd == vd->vdev_top);
747 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
750 mutex_enter(&mg->mg_lock);
751 was_allocatable = mg->mg_allocatable;
752 was_initialized = mg->mg_initialized;
754 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
755 (vs->vs_space + 1);
757 mutex_enter(&mc->mc_lock);
763 * for allocations. We also don't consider non-activated
767 mg->mg_initialized = metaslab_group_initialized(mg);
768 if (!was_initialized && mg->mg_initialized) {
769 mc->mc_groups++;
770 } else if (was_initialized && !mg->mg_initialized) {
771 ASSERT3U(mc->mc_groups, >, 0);
772 mc->mc_groups--;
774 if (mg->mg_initialized)
775 mg->mg_no_free_space = B_FALSE;
783 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
784 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
785 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
786 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
796 * When a group transitions from allocatable to non-allocatable or
803 if (was_allocatable && !mg->mg_allocatable)
804 mc->mc_alloc_groups--;
805 else if (!was_allocatable && mg->mg_allocatable)
806 mc->mc_alloc_groups++;
807 mutex_exit(&mc->mc_lock);
809 mutex_exit(&mg->mg_lock);
818 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
822 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
823 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
828 return (TREE_CMP(a->ms_id, b->ms_id));
838 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
839 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
840 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
841 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
843 mg->mg_vd = vd;
844 mg->mg_class = mc;
845 mg->mg_activation_count = 0;
846 mg->mg_initialized = B_FALSE;
847 mg->mg_no_free_space = B_TRUE;
848 mg->mg_allocators = allocators;
851 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
852 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
861 ASSERT(mg->mg_prev == NULL);
862 ASSERT(mg->mg_next == NULL);
868 ASSERT(mg->mg_activation_count <= 0);
870 avl_destroy(&mg->mg_metaslab_tree);
871 mutex_destroy(&mg->mg_lock);
872 mutex_destroy(&mg->mg_ms_disabled_lock);
873 cv_destroy(&mg->mg_ms_disabled_cv);
875 for (int i = 0; i < mg->mg_allocators; i++) {
876 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
877 zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
880 mg_allocator[mg->mg_allocators]));
886 metaslab_class_t *mc = mg->mg_class;
887 spa_t *spa = mc->mc_spa;
892 ASSERT(mg->mg_prev == NULL);
893 ASSERT(mg->mg_next == NULL);
894 ASSERT(mg->mg_activation_count <= 0);
896 if (++mg->mg_activation_count <= 0)
899 mg->mg_aliquot = metaslab_aliquot * MAX(1,
900 vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
903 if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
904 mg->mg_prev = mg;
905 mg->mg_next = mg;
907 mgnext = mgprev->mg_next;
908 mg->mg_prev = mgprev;
909 mg->mg_next = mgnext;
910 mgprev->mg_next = mg;
911 mgnext->mg_prev = mg;
913 for (int i = 0; i < spa->spa_alloc_count; i++) {
914 mc->mc_allocator[i].mca_rotor = mg;
915 mg = mg->mg_next;
928 metaslab_class_t *mc = mg->mg_class;
929 spa_t *spa = mc->mc_spa;
936 if (--mg->mg_activation_count != 0) {
937 for (int i = 0; i < spa->spa_alloc_count; i++)
938 ASSERT(mc->mc_allocator[i].mca_rotor != mg);
939 ASSERT(mg->mg_prev == NULL);
940 ASSERT(mg->mg_next == NULL);
941 ASSERT(mg->mg_activation_count < 0);
959 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
960 taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
961 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
963 for (int i = 0; i < mg->mg_allocators; i++) {
964 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
965 metaslab_t *msp = mga->mga_primary;
967 mutex_enter(&msp->ms_lock);
970 mutex_exit(&msp->ms_lock);
972 msp = mga->mga_secondary;
974 mutex_enter(&msp->ms_lock);
977 mutex_exit(&msp->ms_lock);
981 mgprev = mg->mg_prev;
982 mgnext = mg->mg_next;
987 mgprev->mg_next = mgnext;
988 mgnext->mg_prev = mgprev;
990 for (int i = 0; i < spa->spa_alloc_count; i++) {
991 if (mc->mc_allocator[i].mca_rotor == mg)
992 mc->mc_allocator[i].mca_rotor = mgnext;
995 mg->mg_prev = NULL;
996 mg->mg_next = NULL;
1002 vdev_t *vd = mg->mg_vd;
1003 vdev_stat_t *vs = &vd->vdev_stat;
1005 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
1015 mutex_enter(&mg->mg_lock);
1016 uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
1017 mutex_exit(&mg->mg_lock);
1018 return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
1025 avl_tree_t *t = &mg->mg_metaslab_tree;
1026 uint64_t ashift = mg->mg_vd->vdev_ashift;
1037 mutex_enter(&mg->mg_lock);
1040 VERIFY3P(msp->ms_group, ==, mg);
1042 if (msp->ms_sm == NULL)
1047 msp->ms_sm->sm_phys->smp_histogram[i];
1052 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
1054 mutex_exit(&mg->mg_lock);
1062 metaslab_class_t *mc = mg->mg_class;
1063 uint64_t ashift = mg->mg_vd->vdev_ashift;
1065 ASSERT(MUTEX_HELD(&msp->ms_lock));
1066 if (msp->ms_sm == NULL)
1069 mutex_enter(&mg->mg_lock);
1070 mutex_enter(&mc->mc_lock);
1072 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1073 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1074 mg->mg_histogram[i + ashift] +=
1075 msp->ms_sm->sm_phys->smp_histogram[i];
1076 mc->mc_histogram[i + ashift] +=
1077 msp->ms_sm->sm_phys->smp_histogram[i];
1079 mutex_exit(&mc->mc_lock);
1080 mutex_exit(&mg->mg_lock);
1086 metaslab_class_t *mc = mg->mg_class;
1087 uint64_t ashift = mg->mg_vd->vdev_ashift;
1089 ASSERT(MUTEX_HELD(&msp->ms_lock));
1090 if (msp->ms_sm == NULL)
1093 mutex_enter(&mg->mg_lock);
1094 mutex_enter(&mc->mc_lock);
1096 ASSERT3U(mg->mg_histogram[i + ashift], >=,
1097 msp->ms_sm->sm_phys->smp_histogram[i]);
1098 ASSERT3U(mc->mc_histogram[i + ashift], >=,
1099 msp->ms_sm->sm_phys->smp_histogram[i]);
1100 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1101 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1103 mg->mg_histogram[i + ashift] -=
1104 msp->ms_sm->sm_phys->smp_histogram[i];
1105 mc->mc_histogram[i + ashift] -=
1106 msp->ms_sm->sm_phys->smp_histogram[i];
1108 mutex_exit(&mc->mc_lock);
1109 mutex_exit(&mg->mg_lock);
1115 ASSERT(msp->ms_group == NULL);
1116 mutex_enter(&mg->mg_lock);
1117 msp->ms_group = mg;
1118 msp->ms_weight = 0;
1119 avl_add(&mg->mg_metaslab_tree, msp);
1120 mutex_exit(&mg->mg_lock);
1122 mutex_enter(&msp->ms_lock);
1124 mutex_exit(&msp->ms_lock);
1130 mutex_enter(&msp->ms_lock);
1132 mutex_exit(&msp->ms_lock);
1134 mutex_enter(&mg->mg_lock);
1135 ASSERT(msp->ms_group == mg);
1136 avl_remove(&mg->mg_metaslab_tree, msp);
1138 metaslab_class_t *mc = msp->ms_group->mg_class;
1140 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
1141 if (multilist_link_active(&msp->ms_class_txg_node))
1145 msp->ms_group = NULL;
1146 mutex_exit(&mg->mg_lock);
1152 ASSERT(MUTEX_HELD(&msp->ms_lock));
1153 ASSERT(MUTEX_HELD(&mg->mg_lock));
1154 ASSERT(msp->ms_group == mg);
1156 avl_remove(&mg->mg_metaslab_tree, msp);
1157 msp->ms_weight = weight;
1158 avl_add(&mg->mg_metaslab_tree, msp);
1170 ASSERT(MUTEX_HELD(&msp->ms_lock));
1172 mutex_enter(&mg->mg_lock);
1174 mutex_exit(&mg->mg_lock);
1187 vdev_t *vd = mg->mg_vd;
1191 for (int m = 0; m < vd->vdev_ms_count; m++) {
1192 metaslab_t *msp = vd->vdev_ms[m];
1194 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1196 if (msp->ms_group != mg)
1200 fragmentation += msp->ms_fragmentation;
1203 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1225 spa_t *spa = mg->mg_vd->vdev_spa;
1226 metaslab_class_t *mc = mg->mg_class;
1237 mc->mc_groups <= 1)
1247 * If all metaslab groups are no longer considered allocatable
1252 if (mg->mg_allocatable) {
1253 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
1255 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
1257 if (!mc->mc_alloc_throttle_enabled)
1262 * there is no point in looking further.
1264 if (mg->mg_no_free_space)
1283 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
1290 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1292 ASSERT3U(mc->mc_alloc_groups, >, 1);
1301 for (metaslab_group_t *mgp = mg->mg_next;
1302 mgp != rotor; mgp = mgp->mg_next) {
1304 &mgp->mg_allocator[allocator];
1305 qmax = mgap->mga_cur_max_alloc_queue_depth;
1308 zfs_refcount_count(&mgap->mga_alloc_queue_depth);
1315 if (qdepth < qmax && !mgp->mg_no_free_space)
1326 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1339 * Comparison function for the private size-ordered tree using 32-bit
1349 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1350 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1354 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1358 * Comparison function for the private size-ordered tree using 64-bit
1368 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1369 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1373 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
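
The closing expression in both comparators is a branch-free tie-break: when the size comparison is nonzero it is returned as-is; when it is zero, !cmp is 1 and the start offsets decide. Written out long-hand it is equivalent to:

    int cmp = TREE_CMP(rs_size1, rs_size2);
    if (cmp != 0)
            return (cmp);
    return (TREE_CMP(r1->rs_start, r2->rs_start));
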
1390 range_tree_t *rt = mssap->rt;
1391 metaslab_rt_arg_t *mrap = mssap->mra;
1401 metaslab_rt_arg_t *mrap = rt->rt_arg;
1403 ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
1404 mrap->mra_floor_shift = 0;
1420 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1426 zfs_btree_t *size_tree = mrap->mra_bt;
1431 switch (rt->rt_type) {
1443 panic("Invalid range seg type %d", rt->rt_type);
1446 mrap->mra_floor_shift = metaslab_by_size_min_shift;
1454 zfs_btree_t *size_tree = mrap->mra_bt;
1464 zfs_btree_t *size_tree = mrap->mra_bt;
1466 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
1467 (1ULL << mrap->mra_floor_shift))
1477 zfs_btree_t *size_tree = mrap->mra_bt;
1479 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
1480 mrap->mra_floor_shift))
1490 zfs_btree_t *size_tree = mrap->mra_bt;
1512 * Return the maximum contiguous segment within the metaslab.
1517 zfs_btree_t *t = &msp->ms_allocatable_by_size;
1523 metaslab_size_tree_full_load(msp->ms_allocatable);
1529 return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
1530 msp->ms_allocatable));
1534 * Return the maximum contiguous segment within the unflushed frees of this
1540 ASSERT(MUTEX_HELD(&msp->ms_lock));
1542 if (msp->ms_unflushed_frees == NULL)
1545 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
1546 metaslab_size_tree_full_load(msp->ms_unflushed_frees);
1547 range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
1558 * bound for the largest currently-usable free segment in the
1563 * briefly and should eventually self-correct as frees are no longer
1569 * only check the largest segment for overlaps. Smaller segments may
1572 * the largest segment; there may be other usable chunks in the
1573 * largest segment, but we ignore them.
1575 uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
1576 uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
1580 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1585 rsize = start - rstart;
1591 boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1594 rsize = start - rstart;
1619 * suitable block to allocate. This will search the specified B-tree looking
1627 *cursor = rt->rt_start;
1628 zfs_btree_t *bt = &rt->rt_root;
1637 while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
1649 return (-1ULL);
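
Those lines belong to the first-fit search helper; a simplified sketch of the loop they come from, treating the exact alignment and search-count handling as assumptions (rs, bt, where, cursor, size, and max_search are taken from the surrounding function):

    /* Sketch: walk segments starting at the cursor, giving up once we
     * have scanned max_search bytes past the first candidate. */
    uint64_t first_found = rs_get_start(rs, rt);
    while (rs != NULL &&
        rs_get_start(rs, rt) - first_found <= max_search) {
            uint64_t offset = rs_get_start(rs, rt);
            if (offset + size <= rs_get_end(rs, rt)) {
                    *cursor = offset + size;
                    return (offset);
            }
            rs = zfs_btree_next(bt, &where, &where);
    }

    *cursor = 0;
    return (-1ULL);
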
1660 { "new-dynamic", metaslab_ndf_alloc },
1666 int a = ARRAY_SIZE(metaslab_allocators) - 1;
1667 if (strcmp("new-dynamic", val) == 0)
1668 return (-1); /* remove when ndf is working */
1669 for (; a >= 0; a--) {
1673 return (-1);
1681 spa->spa_active_allocator = a;
1688 return (spa->spa_active_allocator);
1749 uint64_t align = size & -size;
1750 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1751 range_tree_t *rt = msp->ms_allocatable;
1752 uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1755 ASSERT(MUTEX_HELD(&msp->ms_lock));
1758 * If we're running low on space, find a segment based on size,
1763 offset = -1;
1769 if (offset == -1) {
1771 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
1772 metaslab_size_tree_full_load(msp->ms_allocatable);
1775 /* use largest free segment */
1776 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1779 /* use segment of this size, or next largest */
1780 rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1781 rt, msp->ms_start, size, &where);
1795 * Cursor fit block allocator -
1805 range_tree_t *rt = msp->ms_allocatable;
1806 zfs_btree_t *t = &msp->ms_allocatable_by_size;
1807 uint64_t *cursor = &msp->ms_lbas[0];
1808 uint64_t *cursor_end = &msp->ms_lbas[1];
1811 ASSERT(MUTEX_HELD(&msp->ms_lock));
1819 metaslab_size_tree_full_load(msp->ms_allocatable);
1821 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
1823 return (-1ULL);
1837 * New dynamic fit allocator -
1839 * contiguous blocks. If no region is found then just use the largest segment
1853 zfs_btree_t *t = &msp->ms_allocatable->rt_root;
1854 range_tree_t *rt = msp->ms_allocatable;
1859 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1862 ASSERT(MUTEX_HELD(&msp->ms_lock));
1865 return (-1ULL);
1871 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
1872 t = &msp->ms_allocatable_by_size;
1884 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
1888 return (-1ULL);
1898 * Wait for any in-progress metaslab loads to complete.
1903 ASSERT(MUTEX_HELD(&msp->ms_lock));
1905 while (msp->ms_loading) {
1906 ASSERT(!msp->ms_loaded);
1907 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1912 * Wait for any in-progress flushing to complete.
1917 ASSERT(MUTEX_HELD(&msp->ms_lock));
1919 while (msp->ms_flushing)
1920 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1932 return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
1938 return (msp->ms_allocated_space);
1942 * Verify that the space accounting on disk matches the in-core range_trees.
1947 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1951 ASSERT(MUTEX_HELD(&msp->ms_lock));
1952 ASSERT(!msp->ms_condensing);
1960 * allocated space map. Calling this in non-syncing context
1964 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1965 !msp->ms_loaded)
1973 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1975 ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1976 range_tree_space(msp->ms_unflushed_frees));
1979 space_map_allocated(msp->ms_sm) +
1980 range_tree_space(msp->ms_unflushed_allocs) -
1981 range_tree_space(msp->ms_unflushed_frees));
1983 sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1991 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1993 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1994 msp->ms_allocating_total);
1996 ASSERT3U(msp->ms_deferspace, ==,
1997 range_tree_space(msp->ms_defer[0]) +
1998 range_tree_space(msp->ms_defer[1]));
2000 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
2001 msp->ms_deferspace + range_tree_space(msp->ms_freed);
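
The function ends by checking that these two totals agree; the closing assertion (the exact macro used is an assumption) is roughly:

    /* The on-disk derived free space must equal the in-core view. */
    VERIFY3U(sm_free_space, ==, msp_free_space);
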
2013 ASSERT(msp->ms_loaded);
2015 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2017 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
2034 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
2036 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
2056 space_map_t *sm = msp->ms_sm;
2065 if (msp->ms_loaded) {
2068 metaslab_aux_histogram_add(msp->ms_synchist,
2069 sm->sm_shift, msp->ms_freed);
2072 metaslab_aux_histogram_add(msp->ms_deferhist[t],
2073 sm->sm_shift, msp->ms_defer[t]);
2077 metaslab_aux_histogram_add(msp->ms_synchist,
2078 sm->sm_shift, msp->ms_freeing);
2089 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2090 space_map_t *sm = msp->ms_sm;
2107 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
2108 sizeof (msp->ms_synchist));
2110 memset(msp->ms_deferhist[hist_index], 0,
2111 sizeof (msp->ms_deferhist[hist_index]));
2113 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2124 ASSERT(MUTEX_HELD(&msp->ms_lock));
2138 if (msp->ms_group == NULL)
2143 * fragmentation and ms_max_size as is - there is nothing for
2146 vdev_t *vd = msp->ms_group->mg_vd;
2147 if (vd->vdev_removing)
2156 if (txg_list_member(&vd->vdev_ms_list, msp, t))
2161 * This verification checks that our in-memory state is consistent
2162 * with what's on disk. If the pool is read-only then there aren't
2163 * any changes and we just have the initially-loaded state.
2165 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2168 /* some extra verification for in-core tree if you can */
2169 if (msp->ms_loaded) {
2170 range_tree_stat_verify(msp->ms_allocatable);
2171 VERIFY(space_map_histogram_verify(msp->ms_sm,
2172 msp->ms_allocatable));
2175 uint64_t weight = msp->ms_weight;
2176 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2177 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2178 uint64_t frag = msp->ms_fragmentation;
2179 uint64_t max_segsize = msp->ms_max_size;
2181 msp->ms_weight = 0;
2182 msp->ms_fragmentation = 0;
2186 * not introduce any side-effects/mutations on the system's state.
2197 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
2199 VERIFY3U(max_segsize, ==, msp->ms_max_size);
2202 * If the weight type changed then there is no point in doing
2205 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2206 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2207 msp->ms_fragmentation = frag;
2208 msp->ms_weight = weight;
2212 VERIFY3U(msp->ms_fragmentation, ==, frag);
2213 VERIFY3U(msp->ms_weight, ==, weight);
2233 tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
2236 &mc->mc_metaslab_txg_list);
2238 multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
2244 &mc->mc_metaslab_txg_list, idx));
2246 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
2248 if (!multilist_link_active(&msp->ms_class_txg_node)) {
2265 if (msp->ms_loading) {
2272 * We can't unload metaslabs with no spacemap because
2279 * currently active because they are high-weight
2283 mutex_enter(&msp->ms_lock);
2284 if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
2285 msp->ms_allocating_total == 0) {
2288 mutex_exit(&msp->ms_lock);
2303 ASSERT(MUTEX_HELD(&msp->ms_lock));
2304 ASSERT(msp->ms_loading);
2305 ASSERT(!msp->ms_condensing);
2324 * metaslab_sync_done() would try to re-add later.
2331 uint64_t length = msp->ms_synced_length;
2332 mutex_exit(&msp->ms_lock);
2336 if (msp->ms_allocatable->rt_arg == NULL) {
2339 mrap = msp->ms_allocatable->rt_arg;
2340 msp->ms_allocatable->rt_ops = NULL;
2341 msp->ms_allocatable->rt_arg = NULL;
2343 mrap->mra_bt = &msp->ms_allocatable_by_size;
2344 mrap->mra_floor_shift = metaslab_by_size_min_shift;
2346 if (msp->ms_sm != NULL) {
2347 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2350 /* Now, populate the size-sorted tree. */
2351 metaslab_rt_create(msp->ms_allocatable, mrap);
2352 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2353 msp->ms_allocatable->rt_arg = mrap;
2356 arg.rt = msp->ms_allocatable;
2358 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
2362 * Add the size-sorted tree first, since we don't need to load
2365 metaslab_rt_create(msp->ms_allocatable, mrap);
2366 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2367 msp->ms_allocatable->rt_arg = mrap;
2373 range_tree_add(msp->ms_allocatable,
2374 msp->ms_start, msp->ms_size);
2376 if (msp->ms_new) {
2384 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2385 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2396 mutex_enter(&msp->ms_sync_lock);
2397 mutex_enter(&msp->ms_lock);
2399 ASSERT(!msp->ms_condensing);
2400 ASSERT(!msp->ms_flushing);
2403 mutex_exit(&msp->ms_sync_lock);
2407 ASSERT3P(msp->ms_group, !=, NULL);
2408 msp->ms_loaded = B_TRUE;
2415 range_tree_walk(msp->ms_unflushed_allocs,
2416 range_tree_remove, msp->ms_allocatable);
2417 range_tree_walk(msp->ms_unflushed_frees,
2418 range_tree_add, msp->ms_allocatable);
2420 ASSERT3P(msp->ms_group, !=, NULL);
2421 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2442 * When there's no log space map, the ms_allocatable
2446 range_tree_walk(msp->ms_freed,
2447 range_tree_remove, msp->ms_allocatable);
2465 range_tree_walk(msp->ms_defer[t],
2466 range_tree_remove, msp->ms_allocatable);
2474 * has not yet been converted to use segment-based weight, we
2481 uint64_t weight = msp->ms_weight;
2482 uint64_t max_size = msp->ms_max_size;
2485 ASSERT3U(weight, <=, msp->ms_weight);
2486 msp->ms_max_size = metaslab_largest_allocatable(msp);
2487 ASSERT3U(max_size, <=, msp->ms_max_size);
2489 msp->ms_load_time = load_end;
2498 (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2499 (u_longlong_t)msp->ms_id,
2500 (u_longlong_t)space_map_length(msp->ms_sm),
2501 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
2502 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
2503 (u_longlong_t)range_tree_space(msp->ms_freed),
2504 (u_longlong_t)range_tree_space(msp->ms_defer[0]),
2505 (u_longlong_t)range_tree_space(msp->ms_defer[1]),
2506 (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
2507 (longlong_t)((load_end - load_start) / 1000000),
2508 (u_longlong_t)msp->ms_max_size,
2509 (u_longlong_t)msp->ms_max_size - max_size,
2510 (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
2513 mutex_exit(&msp->ms_sync_lock);
2520 ASSERT(MUTEX_HELD(&msp->ms_lock));
2527 if (msp->ms_loaded)
2529 VERIFY(!msp->ms_loading);
2530 ASSERT(!msp->ms_condensing);
2538 msp->ms_loading = B_TRUE;
2541 * Wait for any in-progress flushing to finish as we drop the ms_lock
2545 if (msp->ms_flushing)
2551 * no one else loaded the metaslab somehow.
2553 ASSERT(!msp->ms_loaded);
2560 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2561 msp->ms_group->mg_class) {
2562 metaslab_potentially_evict(msp->ms_group->mg_class);
2567 ASSERT(MUTEX_HELD(&msp->ms_lock));
2568 msp->ms_loading = B_FALSE;
2569 cv_broadcast(&msp->ms_load_cv);
2577 ASSERT(MUTEX_HELD(&msp->ms_lock));
2584 if (!msp->ms_loaded)
2587 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2588 msp->ms_loaded = B_FALSE;
2589 msp->ms_unload_time = gethrtime();
2591 msp->ms_activation_weight = 0;
2592 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2594 if (msp->ms_group != NULL) {
2595 metaslab_class_t *mc = msp->ms_group->mg_class;
2597 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2598 if (multilist_link_active(&msp->ms_class_txg_node))
2602 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2608 (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2609 (u_longlong_t)msp->ms_id,
2610 (u_longlong_t)msp->ms_weight,
2611 (u_longlong_t)msp->ms_selected_txg,
2612 (u_longlong_t)(msp->ms_unload_time -
2613 msp->ms_selected_time) / 1000 / 1000,
2614 (u_longlong_t)msp->ms_alloc_txg,
2615 (u_longlong_t)(msp->ms_unload_time -
2616 msp->ms_load_time) / 1000 / 1000,
2617 (u_longlong_t)msp->ms_max_size);
2624 * loaded ones have it calculated from their in-core range tree
2626 * available in-core, whether it is loaded or not.
2632 if (msp->ms_group != NULL)
2637 * We want to optimize the memory use of the per-metaslab range
2639 * units of sectors, zero-indexing from the start of the metaslab. If
2640 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2647 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
2649 *shift = vdev->vdev_ashift;
2650 *start = msp->ms_start;
2662 ASSERT(MUTEX_HELD(&msp->ms_lock));
2663 metaslab_class_t *mc = msp->ms_group->mg_class;
2665 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2666 if (multilist_link_active(&msp->ms_class_txg_node))
2668 msp->ms_selected_txg = txg;
2669 msp->ms_selected_time = gethrtime();
2680 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2681 ASSERT(vd->vdev_ms_count != 0);
2691 vdev_t *vd = mg->mg_vd;
2692 spa_t *spa = vd->vdev_spa;
2693 objset_t *mos = spa->spa_meta_objset;
2698 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2699 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2700 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2701 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2702 multilist_link_init(&ms->ms_class_txg_node);
2704 ms->ms_id = id;
2705 ms->ms_start = id << vd->vdev_ms_shift;
2706 ms->ms_size = 1ULL << vd->vdev_ms_shift;
2707 ms->ms_allocator = -1;
2708 ms->ms_new = B_TRUE;
2710 vdev_ops_t *ops = vd->vdev_ops;
2711 if (ops->vdev_op_metaslab_init != NULL)
2712 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
2717 * readonly pools there is no need to open the space map object.
2726 if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
2727 !spa->spa_read_spacemaps)) {
2728 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2729 ms->ms_size, vd->vdev_ashift);
2736 ASSERT(ms->ms_sm != NULL);
2737 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2744 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2746 ms->ms_allocating[t] = range_tree_create(NULL, type,
2749 ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
2750 ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
2752 ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
2755 ms->ms_checkpointing =
2757 ms->ms_unflushed_allocs =
2761 mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
2762 mrap->mra_floor_shift = metaslab_by_size_min_shift;
2763 ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
2766 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2782 metaslab_space_update(vd, mg->mg_class,
2799 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2802 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2808 mutex_enter(&spa->spa_flushed_ms_lock);
2809 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2810 mutex_exit(&spa->spa_flushed_ms_lock);
2820 return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2821 range_tree_numsegs(ms->ms_unflushed_frees)) *
2822 ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2828 metaslab_group_t *mg = msp->ms_group;
2829 vdev_t *vd = mg->mg_vd;
2830 spa_t *spa = vd->vdev_spa;
2836 mutex_enter(&msp->ms_lock);
2837 VERIFY(msp->ms_group == NULL);
2844 if (!msp->ms_new) {
2845 metaslab_space_update(vd, mg->mg_class,
2846 -metaslab_allocated_space(msp), 0, -msp->ms_size);
2849 space_map_close(msp->ms_sm);
2850 msp->ms_sm = NULL;
2854 range_tree_destroy(msp->ms_allocatable);
2855 range_tree_destroy(msp->ms_freeing);
2856 range_tree_destroy(msp->ms_freed);
2858 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2860 spa->spa_unflushed_stats.sus_memused -=
2862 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2863 range_tree_destroy(msp->ms_unflushed_allocs);
2864 range_tree_destroy(msp->ms_checkpointing);
2865 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2866 range_tree_destroy(msp->ms_unflushed_frees);
2869 range_tree_destroy(msp->ms_allocating[t]);
2872 range_tree_destroy(msp->ms_defer[t]);
2874 ASSERT0(msp->ms_deferspace);
2877 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2879 range_tree_vacate(msp->ms_trim, NULL, NULL);
2880 range_tree_destroy(msp->ms_trim);
2882 mutex_exit(&msp->ms_lock);
2883 cv_destroy(&msp->ms_load_cv);
2884 cv_destroy(&msp->ms_flush_cv);
2885 mutex_destroy(&msp->ms_lock);
2886 mutex_destroy(&msp->ms_sync_lock);
2887 ASSERT3U(msp->ms_allocator, ==, -1);
2895 * This table defines a segment-size-based fragmentation metric that will
2941 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2948 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2956 if (msp->ms_sm == NULL) {
2957 msp->ms_fragmentation = 0;
2965 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2967 vdev_t *vd = msp->ms_group->mg_vd;
2979 msp->ms_condense_wanted = B_TRUE;
2983 (u_longlong_t)msp->ms_id,
2984 (u_longlong_t)vd->vdev_id);
2986 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2992 uint8_t shift = msp->ms_sm->sm_shift;
2994 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2995 FRAGMENTATION_TABLE_SIZE - 1);
2997 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
3000 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
3011 msp->ms_fragmentation = fragmentation;
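
The accumulation elided between these lines is, in essence, a space-weighted average over the histogram buckets, with zfs_frag_table[] (the table introduced above) supplying the per-bucket fragmentation value; a sketch, with the exact bookkeeping treated as an assumption:

    uint64_t fragmentation = 0, total = 0;
    uint8_t shift = msp->ms_sm->sm_shift;

    for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
            int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
                FRAGMENTATION_TABLE_SIZE - 1);

            if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
                    continue;

            /* Space represented by this bucket, in bytes. */
            uint64_t space =
                msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
            total += space;
            fragmentation += space * zfs_frag_table[idx];
    }

    if (total > 0)
            fragmentation /= total;
    msp->ms_fragmentation = fragmentation;
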
3015 * Compute a weight -- a selection preference value -- for the given metaslab.
3022 metaslab_group_t *mg = msp->ms_group;
3023 vdev_t *vd = mg->mg_vd;
3026 ASSERT(MUTEX_HELD(&msp->ms_lock));
3031 space = msp->ms_size - metaslab_allocated_space(msp);
3034 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
3042 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
3065 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
3066 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
3076 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
3077 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
3078 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
3086 * Return the weight of the specified metaslab, according to the segment-based
3097 ASSERT(msp->ms_loaded);
3099 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
3100 i--) {
3101 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
3102 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3105 segments += msp->ms_allocatable->rt_histogram[i];
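
Once this walk from the largest bucket downward reaches a non-empty bucket, the weight records that bucket's index and segment count using the WEIGHT_SET_* macros that also appear in the on-disk variant below; a simplified sketch of the step the excerpt stops before (the index clamping against max_idx is omitted):

    if (segments != 0) {
            /* The highest bucket with free segments wins. */
            WEIGHT_SET_COUNT(weight, segments);
            WEIGHT_SET_INDEX(weight, i);
            WEIGHT_SET_ACTIVE(weight, 0);
            break;
    }
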
3128 * Calculate the weight based on the on-disk histogram. Should be applied
3129 * only to unloaded metaslabs (i.e. no incoming allocations) in order to
3130 * give results consistent with the on-disk state.
3135 space_map_t *sm = msp->ms_sm;
3136 ASSERT(!msp->ms_loaded);
3139 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3151 deferspace_histogram[i] += msp->ms_synchist[i];
3154 deferspace_histogram[i] += msp->ms_deferhist[t][i];
3159 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
3160 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
3163 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3166 WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
3175 * Compute a segment-based weight for the specified metaslab. The weight
3182 metaslab_group_t *mg = msp->ms_group;
3184 uint8_t shift = mg->mg_vd->vdev_ashift;
3186 ASSERT(MUTEX_HELD(&msp->ms_lock));
3192 int idx = highbit64(msp->ms_size) - 1;
3193 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3199 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
3207 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3212 if (metaslab_allocated_space(msp) == msp->ms_size)
3219 if (msp->ms_loaded) {
3230 if (msp->ms_activation_weight != 0 && weight != 0)
3231 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
3238 * can be satisfied by looking at the size of the maximum free segment
3240 * weight. For segment-based weighting we can determine the maximum
3241 * allocation based on the index encoded in its value. For space-based
3242 * weights we rely on the entire weight (excluding the weight-type bit).
3253 if (unlikely(msp->ms_new))
3263 if (msp->ms_loaded ||
3264 (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3265 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
3266 return (msp->ms_max_size >= asize);
3269 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3271 * The metaslab segment weight indicates segments in the
3277 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
3280 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
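
Putting the two branches together, the decision reduces to the following (a restatement of the excerpt, not new logic):

    boolean_t should_allocate;

    if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
            /* Segment-based: the encoded index bounds the largest free run. */
            should_allocate = (asize <=
                1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
    } else {
            /* Space-based: the weight minus the type bit is total free space. */
            should_allocate = (asize <=
                (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
    }
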
3289 vdev_t *vd = msp->ms_group->mg_vd;
3290 spa_t *spa = vd->vdev_spa;
3293 ASSERT(MUTEX_HELD(&msp->ms_lock));
3301 * unloaded, we check if there's a larger free segment in the
3303 * segment size. Coalescing of adjacent entries may reveal larger
3307 if (msp->ms_loaded) {
3308 msp->ms_max_size = metaslab_largest_allocatable(msp);
3310 msp->ms_max_size = MAX(msp->ms_max_size,
3315 * Segment-based weighting requires space map histogram support.
3319 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3331 ASSERT(MUTEX_HELD(&msp->ms_lock));
3334 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3335 metaslab_group_sort(msp->ms_group, msp,
3343 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
3344 ASSERT(MUTEX_HELD(&msp->ms_lock));
3351 ASSERT0(msp->ms_activation_weight);
3352 msp->ms_activation_weight = msp->ms_weight;
3353 metaslab_group_sort(mg, msp, msp->ms_weight |
3359 &mga->mga_primary : &mga->mga_secondary);
3361 mutex_enter(&mg->mg_lock);
3363 mutex_exit(&mg->mg_lock);
3368 ASSERT3S(msp->ms_allocator, ==, -1);
3369 msp->ms_allocator = allocator;
3370 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3372 ASSERT0(msp->ms_activation_weight);
3373 msp->ms_activation_weight = msp->ms_weight;
3375 msp->ms_weight | activation_weight);
3376 mutex_exit(&mg->mg_lock);
3384 ASSERT(MUTEX_HELD(&msp->ms_lock));
3397 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3398 ASSERT(msp->ms_loaded);
3404 metaslab_group_sort(msp->ms_group, msp, 0);
3423 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3424 if (msp->ms_allocator != allocator)
3427 if ((msp->ms_weight & activation_weight) == 0)
3431 msp->ms_primary);
3442 if (msp->ms_weight == 0) {
3443 ASSERT0(range_tree_space(msp->ms_allocatable));
3447 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3452 ASSERT(msp->ms_loaded);
3453 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3462 ASSERT(MUTEX_HELD(&msp->ms_lock));
3463 ASSERT(msp->ms_loaded);
3465 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3470 mutex_enter(&mg->mg_lock);
3471 ASSERT3P(msp->ms_group, ==, mg);
3472 ASSERT3S(0, <=, msp->ms_allocator);
3473 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3475 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
3476 if (msp->ms_primary) {
3477 ASSERT3P(mga->mga_primary, ==, msp);
3478 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3479 mga->mga_primary = NULL;
3481 ASSERT3P(mga->mga_secondary, ==, msp);
3482 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3483 mga->mga_secondary = NULL;
3485 msp->ms_allocator = -1;
3487 mutex_exit(&mg->mg_lock);
3500 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3502 range_tree_space(msp->ms_allocatable) == 0);
3505 ASSERT(msp->ms_activation_weight != 0);
3506 msp->ms_activation_weight = 0;
3507 metaslab_passivate_allocator(msp->ms_group, msp, weight);
3508 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3512 * Segment-based metaslabs are activated once and remain active until
3513 * we either fail an allocation attempt (similar to space-based metaslabs)
3525 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3527 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3532 * information that is accessible to us is the in-core range tree
3536 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3539 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3547 metaslab_class_t *mc = msp->ms_group->mg_class;
3548 spa_t *spa = mc->mc_spa;
3551 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3553 mutex_enter(&msp->ms_lock);
3556 mutex_exit(&msp->ms_lock);
3563 spa_t *spa = mg->mg_vd->vdev_spa;
3565 avl_tree_t *t = &mg->mg_metaslab_tree;
3571 mutex_enter(&mg->mg_lock);
3577 ASSERT3P(msp->ms_group, ==, mg);
3585 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3589 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
3590 msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
3593 mutex_exit(&mg->mg_lock);
3597 * Determine if the space map's on-disk footprint is past our tolerance for
3604 * 2. Condense if the on-disk space map representation is at least
3606 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3608 * 3. Do not condense if the on-disk size of the space map does not actually
3611 * Unfortunately, we cannot compute the on-disk size of the space map in this
3614 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3620 space_map_t *sm = msp->ms_sm;
3621 vdev_t *vd = msp->ms_group->mg_vd;
3622 uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
3624 ASSERT(MUTEX_HELD(&msp->ms_lock));
3625 ASSERT(msp->ms_loaded);
3627 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3633 if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3634 msp->ms_condense_wanted)
3637 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3640 msp->ms_allocatable, SM_NO_VDEVID);
3647 * Condense the on-disk space map representation to its minimized form.
3651 * the pool-wide log spacemaps; thus this is effectively a superset of
3658 space_map_t *sm = msp->ms_sm;
3660 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3662 ASSERT(MUTEX_HELD(&msp->ms_lock));
3663 ASSERT(msp->ms_loaded);
3664 ASSERT(msp->ms_sm != NULL);
3709 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3713 (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
3714 (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3715 spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
3716 (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
3717 msp->ms_condense_wanted ? "TRUE" : "FALSE");
3719 msp->ms_condense_wanted = B_FALSE;
3723 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
3729 range_tree_walk(msp->ms_defer[t],
3734 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3738 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3740 spa->spa_unflushed_stats.sus_memused -=
3742 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3743 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3753 msp->ms_condensing = B_TRUE;
3755 mutex_exit(&msp->ms_lock);
3756 uint64_t object = space_map_object(msp->ms_sm);
3765 if (space_map_object(msp->ms_sm) != object) {
3766 object = space_map_object(msp->ms_sm);
3767 dmu_write(spa->spa_meta_objset,
3768 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3769 msp->ms_id, sizeof (uint64_t), &object, tx);
3785 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
3787 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3794 mutex_enter(&msp->ms_lock);
3796 msp->ms_condensing = B_FALSE;
3803 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3805 ASSERT(msp->ms_sm != NULL);
3806 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3807 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3809 mutex_enter(&spa->spa_flushed_ms_lock);
3812 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3813 mutex_exit(&spa->spa_flushed_ms_lock);
3822 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3824 ASSERT(msp->ms_sm != NULL);
3826 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3827 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3828 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3830 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3835 mutex_enter(&spa->spa_flushed_ms_lock);
3836 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3839 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3840 mutex_exit(&spa->spa_flushed_ms_lock);
3857 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3858 * metadata and any pool-wide related log space map data (e.g. summary,
3864 metaslab_group_t *mg = msp->ms_group;
3865 spa_t *spa = mg->mg_vd->vdev_spa;
3867 ASSERT(MUTEX_HELD(&msp->ms_lock));
3876 msp->ms_synced_length = space_map_length(msp->ms_sm);
3880 * feature being active. In that case this is a no-op.
3892 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3894 ASSERT(MUTEX_HELD(&msp->ms_lock));
3898 ASSERT(msp->ms_sm != NULL);
3900 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3915 if (msp->ms_loading)
3932 if (msp->ms_loaded && metaslab_should_condense(msp)) {
3933 metaslab_group_t *mg = msp->ms_group;
3941 metaslab_class_histogram_verify(mg->mg_class);
3946 space_map_histogram_clear(msp->ms_sm);
3947 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3948 ASSERT(range_tree_is_empty(msp->ms_freed));
3950 space_map_histogram_add(msp->ms_sm,
3951 msp->ms_defer[t], tx);
3957 metaslab_class_histogram_verify(mg->mg_class);
3972 msp->ms_flushing = B_TRUE;
3973 uint64_t sm_len_before = space_map_length(msp->ms_sm);
3975 mutex_exit(&msp->ms_lock);
3976 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3978 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3980 mutex_enter(&msp->ms_lock);
3982 uint64_t sm_len_after = space_map_length(msp->ms_sm);
3988 (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3989 (u_longlong_t)msp->ms_id,
3990 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
3991 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
3992 (u_longlong_t)(sm_len_after - sm_len_before));
3995 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3997 spa->spa_unflushed_stats.sus_memused -=
3999 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
4000 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
4010 msp->ms_flushing = B_FALSE;
4011 cv_broadcast(&msp->ms_flush_cv);
4021 metaslab_group_t *mg = msp->ms_group;
4022 vdev_t *vd = mg->mg_vd;
4023 spa_t *spa = vd->vdev_spa;
4025 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
4028 ASSERT(!vd->vdev_ishole);
4031 * This metaslab has just been added so there's no work to do now.
4033 if (msp->ms_new) {
4035 ASSERT0(range_tree_space(msp->ms_freeing));
4036 ASSERT0(range_tree_space(msp->ms_freed));
4037 ASSERT0(range_tree_space(msp->ms_checkpointing));
4038 ASSERT0(range_tree_space(msp->ms_trim));
4043 * Normally, we don't want to process a metaslab if there are no
4054 range_tree_is_empty(msp->ms_freeing) &&
4055 range_tree_is_empty(msp->ms_checkpointing) &&
4056 !(msp->ms_loaded && msp->ms_condense_wanted &&
4065 * with metaslab_sync() is the metaslab's ms_allocatable. No
4083 if (msp->ms_sm == NULL) {
4090 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
4091 msp->ms_id, sizeof (uint64_t), &new_object, tx);
4093 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
4094 msp->ms_start, msp->ms_size, vd->vdev_ashift));
4095 ASSERT(msp->ms_sm != NULL);
4097 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
4098 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
4102 if (!range_tree_is_empty(msp->ms_checkpointing) &&
4103 vd->vdev_checkpoint_sm == NULL) {
4110 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
4111 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
4112 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
4119 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
4120 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4124 mutex_enter(&msp->ms_sync_lock);
4125 mutex_enter(&msp->ms_lock);
4133 metaslab_class_histogram_verify(mg->mg_class);
4136 if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
4143 * open-context (ZIL) for future TXGs do not block.
4145 mutex_exit(&msp->ms_lock);
4155 vd->vdev_id, tx);
4156 space_map_write(log_sm, msp->ms_freeing, SM_FREE,
4157 vd->vdev_id, tx);
4158 mutex_enter(&msp->ms_lock);
4160 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
4162 spa->spa_unflushed_stats.sus_memused -=
4165 msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
4166 range_tree_remove_xor_add(msp->ms_freeing,
4167 msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
4168 spa->spa_unflushed_stats.sus_memused +=
4173 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
4175 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
4177 mutex_enter(&msp->ms_lock);
4180 msp->ms_allocated_space += range_tree_space(alloctree);
4181 ASSERT3U(msp->ms_allocated_space, >=,
4182 range_tree_space(msp->ms_freeing));
4183 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
4185 if (!range_tree_is_empty(msp->ms_checkpointing)) {
4187 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
4195 mutex_exit(&msp->ms_lock);
4196 space_map_write(vd->vdev_checkpoint_sm,
4197 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
4198 mutex_enter(&msp->ms_lock);
4200 spa->spa_checkpoint_info.sci_dspace +=
4201 range_tree_space(msp->ms_checkpointing);
4202 vd->vdev_stat.vs_checkpoint_space +=
4203 range_tree_space(msp->ms_checkpointing);
4204 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
4205 -space_map_allocated(vd->vdev_checkpoint_sm));
4207 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
4210 if (msp->ms_loaded) {
4214 * to bring the space map's histogram up to date so we clear
4217 space_map_histogram_clear(msp->ms_sm);
4218 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
4223 * any deferred space. This allows the on-disk histogram
4227 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
4231 * added back into the in-core free tree yet. This will
4237 space_map_histogram_add(msp->ms_sm,
4238 msp->ms_defer[t], tx);
4244 * map histogram. We want to make sure that the on-disk histogram
4249 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
4254 metaslab_class_histogram_verify(mg->mg_class);
4267 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
4268 ASSERT0(msp->ms_allocated_this_txg);
4270 range_tree_vacate(msp->ms_freeing,
4271 range_tree_add, msp->ms_freed);
4273 msp->ms_allocated_this_txg += range_tree_space(alloctree);
4276 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4277 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
4279 ASSERT0(range_tree_space(msp->ms_freeing));
4280 ASSERT0(range_tree_space(msp->ms_checkpointing));
4282 mutex_exit(&msp->ms_lock);
4289 VERIFY0(dmu_read(mos, vd->vdev_ms_array,
4290 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
4291 VERIFY3U(object, ==, space_map_object(msp->ms_sm));
4293 mutex_exit(&msp->ms_sync_lock);
4300 if (!msp->ms_loaded || msp->ms_disabled != 0)
4305 msp->ms_allocating[(txg + t) & TXG_MASK]));
4307 if (msp->ms_allocator != -1)
4308 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4321 metaslab_group_t *mg = msp->ms_group;
4322 vdev_t *vd = mg->mg_vd;
4323 spa_t *spa = vd->vdev_spa;
4328 ASSERT(!vd->vdev_ishole);
4330 mutex_enter(&msp->ms_lock);
4332 if (msp->ms_new) {
4334 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
4336 /* there should be no allocations or frees at this point */
4337 VERIFY0(msp->ms_allocated_this_txg);
4338 VERIFY0(range_tree_space(msp->ms_freed));
4341 ASSERT0(range_tree_space(msp->ms_freeing));
4342 ASSERT0(range_tree_space(msp->ms_checkpointing));
4344 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
4346 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
4348 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
4349 vd->vdev_rz_expanding) {
4354 alloc_delta = msp->ms_allocated_this_txg -
4355 range_tree_space(msp->ms_freed);
4358 defer_delta = range_tree_space(msp->ms_freed) -
4361 defer_delta -= range_tree_space(*defer_tree);
4363 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
4372 * have a consistent view at the in-core side of the metaslab.
4380 * When auto-trimming is enabled, free ranges which are added to
4388 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
4390 range_tree_walk(msp->ms_freed, range_tree_add,
4391 msp->ms_trim);
4394 range_tree_vacate(msp->ms_trim, NULL, NULL);
4400 * the defer_tree -- this is safe to do because we've
4404 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
4406 range_tree_swap(&msp->ms_freed, defer_tree);
4408 range_tree_vacate(msp->ms_freed,
4409 msp->ms_loaded ? range_tree_add : NULL,
4410 msp->ms_allocatable);
4413 msp->ms_synced_length = space_map_length(msp->ms_sm);
4415 msp->ms_deferspace += defer_delta;
4416 ASSERT3S(msp->ms_deferspace, >=, 0);
4417 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
4418 if (msp->ms_deferspace != 0) {
4427 if (msp->ms_new) {
4428 msp->ms_new = B_FALSE;
4429 mutex_enter(&mg->mg_lock);
4430 mg->mg_ms_ready++;
4431 mutex_exit(&mg->mg_lock);
4435 * Re-sort metaslab within its group now that we've adjusted
4440 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4441 ASSERT0(range_tree_space(msp->ms_freeing));
4442 ASSERT0(range_tree_space(msp->ms_freed));
4443 ASSERT0(range_tree_space(msp->ms_checkpointing));
4444 msp->ms_allocating_total -= msp->ms_allocated_this_txg;
4445 msp->ms_allocated_this_txg = 0;
4446 mutex_exit(&msp->ms_lock);
4452 spa_t *spa = mg->mg_class->mc_spa;
4456 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4461 * is no longer active since we dirty metaslabs as we remove a
4465 if (mg->mg_activation_count > 0) {
4484 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4487 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4489 return (msp->ms_id != dva_ms_id);
4518 if (zal->zal_size == metaslab_trace_max_entries) {
4524 zal->zal_size--;
4525 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4526 list_remove(&zal->zal_list, mat_next);
4531 list_link_init(&mat->mat_list_node);
4532 mat->mat_mg = mg;
4533 mat->mat_msp = msp;
4534 mat->mat_size = psize;
4535 mat->mat_dva_id = dva_id;
4536 mat->mat_offset = offset;
4537 mat->mat_weight = 0;
4538 mat->mat_allocator = allocator;
4541 mat->mat_weight = msp->ms_weight;
4547 list_insert_tail(&zal->zal_list, mat);
4548 zal->zal_size++;
4550 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4556 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4558 zal->zal_size = 0;
4566 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4568 list_destroy(&zal->zal_list);
4569 zal->zal_size = 0;
4586 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4587 if (!mg->mg_class->mc_alloc_throttle_enabled)
4590 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4591 (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
4597 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4599 &mg->mg_class->mc_allocator[allocator];
4600 uint64_t max = mg->mg_max_alloc_queue_depth;
4601 uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
4603 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
4605 atomic_inc_64(&mca->mca_alloc_max_slots);
4608 cur = mga->mga_cur_max_alloc_queue_depth;
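
These lines are the body of a lock-free raise: the per-allocator queue-depth ceiling is bumped one step at a time with a compare-and-swap, retrying from a freshly re-read value whenever another thread wins the race; a sketch of the surrounding loop (the ceiling max and the enclosing shape are assumptions):

    while (cur < max) {
            if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
                cur, cur + 1) == cur) {
                    /* We won the race: grant one more slot to the class too. */
                    atomic_inc_64(&mca->mca_alloc_max_slots);
                    return;
            }
            /* Lost the race: reload and retry until we hit the ceiling. */
            cur = mga->mga_cur_max_alloc_queue_depth;
    }
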
4620 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4621 if (!mg->mg_class->mc_alloc_throttle_enabled)
4624 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4625 (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
4635 const dva_t *dva = bp->blk_dva;
4640 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4641 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4642 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
4651 range_tree_t *rt = msp->ms_allocatable;
4652 metaslab_class_t *mc = msp->ms_group->mg_class;
4654 ASSERT(MUTEX_HELD(&msp->ms_lock));
4655 VERIFY(!msp->ms_condensing);
4656 VERIFY0(msp->ms_disabled);
4657 VERIFY0(msp->ms_new);
4659 start = mc->mc_ops->msop_alloc(msp, size);
4660 if (start != -1ULL) {
4661 metaslab_group_t *mg = msp->ms_group;
4662 vdev_t *vd = mg->mg_vd;
4664 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4665 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4666 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4668 range_tree_clear(msp->ms_trim, start, size);
4670 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4671 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4673 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4674 msp->ms_allocating_total += size;
4677 msp->ms_alloc_txg = txg;
4685 msp->ms_max_size = metaslab_largest_allocatable(msp);
4695 * have selected, we may not try the newly-activated metaslab, and instead
4698 * except for the newly-activated metaslab which we fail to examine).
4707 avl_tree_t *t = &mg->mg_metaslab_tree;
4732 if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
4735 *was_active = msp->ms_allocator != -1;
4756 search->ms_weight = msp->ms_weight;
4757 search->ms_start = msp->ms_start + 1;
4758 search->ms_allocator = msp->ms_allocator;
4759 search->ms_primary = msp->ms_primary;
4767 ASSERT(MUTEX_HELD(&msp->ms_lock));
4772 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4775 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4776 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4777 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4778 VERIFY3S(msp->ms_allocator, !=, -1);
4779 VERIFY(msp->ms_primary);
4783 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4784 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4785 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4786 VERIFY3S(msp->ms_allocator, !=, -1);
4787 VERIFY(!msp->ms_primary);
4791 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4792 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4793 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4794 VERIFY3S(msp->ms_allocator, ==, -1);
4805 uint64_t offset = -1ULL;
4810 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4813 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4823 if (mg->mg_ms_ready < mg->mg_allocators * 3)
4825 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4827 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4830 search->ms_weight = UINT64_MAX;
4831 search->ms_start = 0;
4833 * At the end of the metaslab tree are the already-active metaslabs,
4839 search->ms_allocator = -1;
4840 search->ms_primary = B_TRUE;
4844 mutex_enter(&mg->mg_lock);
4847 mga->mga_primary != NULL) {
4848 msp = mga->mga_primary;
4856 ASSERT(msp->ms_primary);
4857 ASSERT3S(msp->ms_allocator, ==, allocator);
4858 ASSERT(msp->ms_loaded);
4861 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4863 mga->mga_secondary != NULL) {
4864 msp = mga->mga_secondary;
4870 ASSERT(!msp->ms_primary);
4871 ASSERT3S(msp->ms_allocator, ==, allocator);
4872 ASSERT(msp->ms_loaded);
4875 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4882 mutex_exit(&mg->mg_lock);
4885 return (-1ULL);
4887 mutex_enter(&msp->ms_lock);
4893 * tracepoints in non-gpl kernel modules.
4909 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4910 ASSERT3S(msp->ms_allocator, ==, -1);
4911 mutex_exit(&msp->ms_lock);
4921 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4922 (msp->ms_allocator != -1) &&
4923 (msp->ms_allocator != allocator || ((activation_weight ==
4924 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4925 ASSERT(msp->ms_loaded);
4926 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4927 msp->ms_allocator != -1);
4928 mutex_exit(&msp->ms_lock);
4939 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4941 ASSERT(msp->ms_loaded);
4942 ASSERT3S(msp->ms_allocator, ==, -1);
4943 metaslab_passivate(msp, msp->ms_weight &
4945 mutex_exit(&msp->ms_lock);
4974 mutex_exit(&msp->ms_lock);
4977 ASSERT(msp->ms_loaded);
5000 if (msp->ms_condensing) {
5004 metaslab_passivate(msp, msp->ms_weight &
5007 mutex_exit(&msp->ms_lock);
5009 } else if (msp->ms_disabled > 0) {
5013 metaslab_passivate(msp, msp->ms_weight &
5016 mutex_exit(&msp->ms_lock);
5023 if (offset != -1ULL) {
5030 ASSERT(msp->ms_loaded);
5034 * tracepoints in non-gpl kernel modules.
5047 * For space-based metaslabs, we use the maximum block size.
5055 * For segment-based metaslabs, determine the new weight
5057 * explicitly use the loaded segment weight (i.e. the range
5063 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
5085 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
5098 mutex_exit(&msp->ms_lock);
5100 mutex_exit(&msp->ms_lock);
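For the segment-based case referenced above, the recomputed weight has to stay comparable as a plain integer: a metaslab whose largest free segments fall in a higher power-of-two bucket must always outweigh one with more, but smaller, segments. A minimal sketch of such an encoding (the shift and field widths are illustrative, not the exact layout used in metaslab.c):

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_INDEX_SHIFT 54	/* assumed split: high bits = bucket index */

static uint64_t
segment_weight(unsigned highest_bucket, uint64_t nsegments)
{
	/* Bucket index dominates; segment count only breaks ties. */
	return (((uint64_t)highest_bucket << WEIGHT_INDEX_SHIFT) | nsegments);
}

int
main(void)
{
	uint64_t a = segment_weight(20, 1);	/* one ~1 MiB segment */
	uint64_t b = segment_weight(16, 5000);	/* many ~64 KiB segments */

	printf("%s\n", a > b ? "prefer a" : "prefer b"); /* prefer a */
	return (0);
}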
5115 mutex_enter(&mg->mg_lock);
5116 if (offset == -1ULL) {
5117 mg->mg_failed_allocations++;
5132 mg->mg_no_free_space = B_TRUE;
5135 mg->mg_allocations++;
5136 mutex_exit(&mg->mg_lock);
5148 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5158 * and a large number of split blocks coupled with ztest-induced
5172 * Note that there's no locking on mca_rotor or mca_aliquot because
5173 * nothing actually breaks if we miss a few updates -- we just won't
5183 * able to reason about. Otherwise, any two top-level vdev failures
5185 * only two adjacent top-level vdev failures will result in data loss.
5187 * If we are doing gang blocks (hintdva is non-NULL), try to keep
5196 * It's possible the vdev we're using as the hint no
5201 if (vd != NULL && vd->vdev_mg != NULL) {
5205 mg = mg->mg_next;
5207 mg = mca->mca_rotor;
5210 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
5211 mg = vd->vdev_mg->mg_next;
5213 ASSERT(mca->mca_rotor != NULL);
5214 mg = mca->mca_rotor;
5221 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
5222 mg = mca->mca_rotor;
5229 ASSERT(mg->mg_activation_count == 1);
5230 vd = mg->mg_vd;
5262 * Avoid writing single-copy data to an unhealthy,
5263 * non-redundant vdev, unless we've already tried all
5266 if (vd->vdev_state < VDEV_STATE_HEALTHY &&
5267 d == 0 && !try_hard && vd->vdev_children == 0) {
5273 ASSERT(mg->mg_class == mc);
5276 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5287 if (offset != -1ULL) {
5291 * over- or under-used relative to the pool,
5297 if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
5298 vdev_stat_t *vs = &vd->vdev_stat;
5299 int64_t vs_free = vs->vs_space - vs->vs_alloc;
5300 int64_t mc_free = mc->mc_space - mc->mc_alloc;
5308 * This basically introduces a zero-centered
5326 ratio = (vs_free * mc->mc_alloc_groups * 100) /
5328 mg->mg_bias = ((ratio - 100) *
5329 (int64_t)mg->mg_aliquot) / 100;
5331 mg->mg_bias = 0;
5335 atomic_add_64_nv(&mca->mca_aliquot, asize) >=
5336 mg->mg_aliquot + mg->mg_bias) {
5337 mca->mca_rotor = mg->mg_next;
5338 mca->mca_aliquot = 0;
5341 DVA_SET_VDEV(&dva[d], vd->vdev_id);
5350 mca->mca_rotor = mg->mg_next;
5351 mca->mca_aliquot = 0;
5352 } while ((mg = mg->mg_next) != rotor);
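The bias computed above is easiest to see with numbers: ratio expresses the vdev's free space as a percentage of an even share across all allocation groups, and the rotor aliquot is stretched or shrunk by that deviation. A standalone arithmetic sketch with made-up pool numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t mc_free = 400LL << 30;		/* 400 GiB free in the class */
	int64_t alloc_groups = 4;		/* four top-level vdevs */
	int64_t aliquot = 512 << 10;		/* 512 KiB rotor aliquot */
	/* One vdev with double its fair share free, one with half of it. */
	int64_t vs_free[2] = { 200LL << 30, 50LL << 30 };

	for (int i = 0; i < 2; i++) {
		int64_t ratio = (vs_free[i] * alloc_groups * 100) / mc_free;
		int64_t bias = ((ratio - 100) * aliquot) / 100;
		printf("ratio %lld%% -> bias %lld KiB\n",
		    (long long)ratio, (long long)(bias / 1024));
	}
	/*
	 * Prints "ratio 200% -> bias 512 KiB" (the emptier vdev keeps the
	 * rotor twice as long) and "ratio 50% -> bias -256 KiB" (the fuller
	 * vdev gives it up after half an aliquot).
	 */
	return (0);
}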
5359 psize <= 1 << spa->spa_min_ashift)) {
5376 spa_t *spa = vd->vdev_spa;
5380 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5382 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5384 VERIFY(!msp->ms_condensing);
5385 VERIFY3U(offset, >=, msp->ms_start);
5386 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5387 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5388 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5392 mutex_enter(&msp->ms_lock);
5393 if (range_tree_is_empty(msp->ms_freeing) &&
5394 range_tree_is_empty(msp->ms_checkpointing)) {
5400 range_tree_add(msp->ms_checkpointing, offset, asize);
5402 range_tree_add(msp->ms_freeing, offset, asize);
5404 mutex_exit(&msp->ms_lock);
5416 if (vd->vdev_ops->vdev_op_remap != NULL)
5426 spa_t *spa = vd->vdev_spa;
5433 if (spa->spa_vdev_removal != NULL &&
5434 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5443 } else if (vd->vdev_ops->vdev_op_remap != NULL) {
5445 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5465 blkptr_t *bp = rbca->rbca_bp;
5467 /* We cannot remap split blocks. */
5468 if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5472 if (rbca->rbca_cb != NULL) {
5478 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5480 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5481 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5484 rbca->rbca_remap_vd = vd;
5485 rbca->rbca_remap_offset = offset;
5498 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5499 DVA_GET_VDEV(&bp->blk_dva[0]));
5500 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5502 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5505 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5506 DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5549 * Embedded BPs have no DVA to remap.
5555 * Note: we only remap dva[0]. If we remapped other dvas, we
5556 * would no longer know what their phys birth txg is.
5558 dva_t *dva = &bp->blk_dva[0];
5564 if (vd->vdev_ops->vdev_op_remap == NULL)
5580 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5583 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5608 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5615 ASSERT(!vd->vdev_removing);
5617 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5618 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5623 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5625 mutex_enter(&msp->ms_lock);
5626 range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5628 msp->ms_allocating_total -= size;
5630 VERIFY(!msp->ms_condensing);
5631 VERIFY3U(offset, >=, msp->ms_start);
5632 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5633 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5634 msp->ms_size);
5635 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5636 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5637 range_tree_add(msp->ms_allocatable, offset, size);
5638 mutex_exit(&msp->ms_lock);
5673 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5674 uint64_t max = mca->mca_alloc_max_slots;
5676 ASSERT(mc->mc_alloc_throttle_enabled);
5678 zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
5683 * But even if we assume some other non-existing scenario, the
5690 zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
5691 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5701 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5703 ASSERT(mc->mc_alloc_throttle_enabled);
5704 zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
5712 spa_t *spa = vd->vdev_spa;
5715 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5718 ASSERT3P(vd->vdev_ms, !=, NULL);
5719 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5721 mutex_enter(&msp->ms_lock);
5723 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
5726 ASSERT(msp->ms_loaded);
5727 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5733 !range_tree_contains(msp->ms_allocatable, offset, size))
5737 mutex_exit(&msp->ms_lock);
5741 VERIFY(!msp->ms_condensing);
5742 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5743 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5744 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5745 msp->ms_size);
5746 range_tree_remove(msp->ms_allocatable, offset, size);
5747 range_tree_clear(msp->ms_trim, offset, size);
5750 metaslab_class_t *mc = msp->ms_group->mg_class;
5752 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
5753 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5754 msp->ms_selected_txg = txg;
5759 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5761 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5763 msp->ms_allocating_total += size;
5766 mutex_exit(&msp->ms_lock);
5783 if (mcca_arg->mcca_error == 0) {
5784 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5785 size, mcca_arg->mcca_txg);
5792 if (vd->vdev_ops->vdev_op_remap != NULL) {
5800 ASSERT(!spa_writeable(vd->vdev_spa));
5804 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5848 dva_t *dva = bp->blk_dva;
5849 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5857 if (mc->mc_allocator[allocator].mca_rotor == NULL) {
5858 /* no vdevs in this class */
5872 for (d--; d >= 0; d--) {
5903 const dva_t *dva = bp->blk_dva;
5925 if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
5926 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5929 * there is no way it was created in the current txg.
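The two-part test above can be read as a small predicate: the freed block must have been born no later than the checkpoint txg (so the checkpointed uberblock may still reference it), and the pool must already be syncing a txg past the checkpoint. A hedged, standalone restatement (a checkpoint txg of 0 is treated as "no checkpoint", as in the surrounding code, which this handles implicitly since real birth txgs are nonzero):

#include <stdbool.h>
#include <stdint.h>

static bool
free_goes_to_checkpoint(uint64_t birth_txg, uint64_t checkpoint_txg,
    uint64_t syncing_txg)
{
	/* Born after the checkpoint: the checkpoint never referenced it. */
	if (birth_txg > checkpoint_txg)
		return (false);
	/* Frees syncing at or before the checkpoint txg proceed normally. */
	return (syncing_txg > checkpoint_txg);
}

int
main(void)
{
	/* birth 90, checkpoint 100, currently syncing 105 -> checkpointed. */
	return (free_goes_to_checkpoint(90, 100, 105) ? 0 : 1);
}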
5953 const dva_t *dva = bp->blk_dva;
5989 if (vd->vdev_ops == &vdev_indirect_ops)
5999 spa_t *spa __maybe_unused = vd->vdev_spa;
6004 if (vd->vdev_ops->vdev_op_remap != NULL) {
6005 vd->vdev_ops->vdev_op_remap(vd, offset, size,
6011 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
6014 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6016 mutex_enter(&msp->ms_lock);
6017 if (msp->ms_loaded) {
6018 range_tree_verify_not_present(msp->ms_allocatable,
6029 * segment but then we free part of it within the same txg
6033 range_tree_verify_not_present(msp->ms_freeing, offset, size);
6034 range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
6035 range_tree_verify_not_present(msp->ms_freed, offset, size);
6037 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
6038 range_tree_verify_not_present(msp->ms_trim, offset, size);
6039 mutex_exit(&msp->ms_lock);
6050 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
6052 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
6053 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
6055 if (DVA_GET_GANG(&bp->blk_dva[i]))
6068 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6069 while (mg->mg_disabled_updating) {
6070 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6077 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6078 ASSERT(mg->mg_disabled_updating);
6080 while (mg->mg_ms_disabled >= max_disabled_ms) {
6081 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6083 mg->mg_ms_disabled++;
6084 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
6096 ASSERT(!MUTEX_HELD(&msp->ms_lock));
6097 metaslab_group_t *mg = msp->ms_group;
6099 mutex_enter(&mg->mg_ms_disabled_lock);
6107 * to wait till the metaslab's mg_disabled_updating flag is no
6111 mg->mg_disabled_updating = B_TRUE;
6112 if (msp->ms_disabled == 0) {
6115 mutex_enter(&msp->ms_lock);
6116 msp->ms_disabled++;
6117 mutex_exit(&msp->ms_lock);
6119 mg->mg_disabled_updating = B_FALSE;
6120 cv_broadcast(&mg->mg_ms_disabled_cv);
6121 mutex_exit(&mg->mg_ms_disabled_lock);
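The disable path above combines two waits under one lock: one until no other thread is updating the group's disabled state, and one until the count of disabled metaslabs drops below the limit. A standalone POSIX-threads sketch that collapses both into a single condition-variable wait (illustrative names and limit only):

#include <pthread.h>

#define MAX_DISABLED 3	/* stand-in for max_disabled_ms */

typedef struct disable_gate {
	pthread_mutex_t dg_lock;
	pthread_cond_t dg_cv;
	int dg_updating;
	int dg_disabled;
} disable_gate_t;

static void
disable_enter(disable_gate_t *dg)
{
	pthread_mutex_lock(&dg->dg_lock);
	while (dg->dg_updating || dg->dg_disabled >= MAX_DISABLED)
		pthread_cond_wait(&dg->dg_cv, &dg->dg_lock);
	dg->dg_disabled++;
	pthread_mutex_unlock(&dg->dg_lock);
}

static void
disable_exit(disable_gate_t *dg)
{
	pthread_mutex_lock(&dg->dg_lock);
	dg->dg_disabled--;
	/* Wake every waiter; some may find the gate still closed and re-sleep. */
	pthread_cond_broadcast(&dg->dg_cv);
	pthread_mutex_unlock(&dg->dg_lock);
}

int
main(void)
{
	disable_gate_t dg = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0, 0 };

	disable_enter(&dg);
	disable_exit(&dg);
	return (0);
}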
6127 metaslab_group_t *mg = msp->ms_group;
6128 spa_t *spa = mg->mg_vd->vdev_spa;
6138 mutex_enter(&mg->mg_ms_disabled_lock);
6139 mutex_enter(&msp->ms_lock);
6140 if (--msp->ms_disabled == 0) {
6141 mg->mg_ms_disabled--;
6142 cv_broadcast(&mg->mg_ms_disabled_cv);
6146 mutex_exit(&msp->ms_lock);
6147 mutex_exit(&mg->mg_ms_disabled_lock);
6153 ms->ms_unflushed_dirty = dirty;
6159 vdev_t *vd = ms->ms_group->mg_vd;
6160 spa_t *spa = vd->vdev_spa;
6169 uint64_t entry_offset = ms->ms_id * entry_size;
6172 int err = zap_lookup(mos, vd->vdev_top_zap,
6178 VERIFY0(zap_add(mos, vd->vdev_top_zap,
6192 ms->ms_unflushed_txg = txg;
6199 return (ms->ms_unflushed_dirty);
6205 return (ms->ms_unflushed_txg);
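The entry_offset arithmetic above implies a simple fixed-stride layout for the unflushed-txg records: one object per top-level vdev, with metaslab i's fixed-size record at byte offset i * sizeof (record), so a single record can be rewritten in place. A trivial sketch of that addressing (the record type is illustrative, not the on-disk structure):

#include <stdint.h>
#include <stdio.h>

typedef struct unflushed_rec {
	uint64_t ur_unflushed_txg;
} unflushed_rec_t;

int
main(void)
{
	for (uint64_t ms_id = 0; ms_id < 4; ms_id++)
		printf("metaslab %llu -> offset %llu\n",
		    (unsigned long long)ms_id,
		    (unsigned long long)(ms_id * sizeof (unflushed_rec_t)));
	return (0);
}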
6252 ZMOD_RW, "Enable segment-based metaslab selection");
6255 "Segment-based metaslab selection maximum buckets before switching");
6267 "When looking in size tree, use largest segment instead of exact fit");