xref: /freebsd-src/sys/contrib/openzfs/module/zfs/brt.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
12a58b312SMartin Matuska /*
22a58b312SMartin Matuska  * CDDL HEADER START
32a58b312SMartin Matuska  *
42a58b312SMartin Matuska  * The contents of this file are subject to the terms of the
52a58b312SMartin Matuska  * Common Development and Distribution License (the "License").
62a58b312SMartin Matuska  * You may not use this file except in compliance with the License.
72a58b312SMartin Matuska  *
82a58b312SMartin Matuska  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92a58b312SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
102a58b312SMartin Matuska  * See the License for the specific language governing permissions
112a58b312SMartin Matuska  * and limitations under the License.
122a58b312SMartin Matuska  *
132a58b312SMartin Matuska  * When distributing Covered Code, include this CDDL HEADER in each
142a58b312SMartin Matuska  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152a58b312SMartin Matuska  * If applicable, add the following below this CDDL HEADER, with the
162a58b312SMartin Matuska  * fields enclosed by brackets "[]" replaced with your own identifying
172a58b312SMartin Matuska  * information: Portions Copyright [yyyy] [name of copyright owner]
182a58b312SMartin Matuska  *
192a58b312SMartin Matuska  * CDDL HEADER END
202a58b312SMartin Matuska  */
212a58b312SMartin Matuska 
222a58b312SMartin Matuska /*
232a58b312SMartin Matuska  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
242a58b312SMartin Matuska  */
252a58b312SMartin Matuska 
262a58b312SMartin Matuska #include <sys/zfs_context.h>
272a58b312SMartin Matuska #include <sys/spa.h>
282a58b312SMartin Matuska #include <sys/spa_impl.h>
292a58b312SMartin Matuska #include <sys/zio.h>
302a58b312SMartin Matuska #include <sys/brt.h>
312276e539SMartin Matuska #include <sys/brt_impl.h>
322a58b312SMartin Matuska #include <sys/ddt.h>
332a58b312SMartin Matuska #include <sys/bitmap.h>
342a58b312SMartin Matuska #include <sys/zap.h>
352a58b312SMartin Matuska #include <sys/dmu_tx.h>
362a58b312SMartin Matuska #include <sys/arc.h>
372a58b312SMartin Matuska #include <sys/dsl_pool.h>
382a58b312SMartin Matuska #include <sys/dsl_scan.h>
392a58b312SMartin Matuska #include <sys/vdev_impl.h>
402a58b312SMartin Matuska #include <sys/kstat.h>
412a58b312SMartin Matuska #include <sys/wmsum.h>
422a58b312SMartin Matuska 
432a58b312SMartin Matuska /*
442a58b312SMartin Matuska  * Block Cloning design.
452a58b312SMartin Matuska  *
462a58b312SMartin Matuska  * Block Cloning allows to manually clone a file (or a subset of its blocks)
472a58b312SMartin Matuska  * into another (or the same) file by just creating additional references to
482a58b312SMartin Matuska  * the data blocks without copying the data itself. Those references are kept
492a58b312SMartin Matuska  * in the Block Reference Tables (BRTs).
502a58b312SMartin Matuska  *
512a58b312SMartin Matuska  * In many ways this is similar to the existing deduplication, but there are
522a58b312SMartin Matuska  * some important differences:
532a58b312SMartin Matuska  *
542a58b312SMartin Matuska  * - Deduplication is automatic and Block Cloning is not - one has to use a
552a58b312SMartin Matuska  *   dedicated system call(s) to clone the given file/blocks.
562a58b312SMartin Matuska  * - Deduplication keeps all data blocks in its table, even those referenced
572a58b312SMartin Matuska  *   just once. Block Cloning creates an entry in its tables only when there
582a58b312SMartin Matuska  *   are at least two references to the given data block. If the block was
592a58b312SMartin Matuska  *   never explicitly cloned or the second to last reference was dropped,
602a58b312SMartin Matuska  *   there will be neither space nor performance overhead.
612a58b312SMartin Matuska  * - Deduplication needs data to work - one needs to pass real data to the
622a58b312SMartin Matuska  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
632a58b312SMartin Matuska  *   data, just block pointers to the data, so it is extremely fast, as we pay
642a58b312SMartin Matuska  *   neither the cost of reading the data, nor the cost of writing the data -
652a58b312SMartin Matuska  *   we operate exclusively on metadata.
662a58b312SMartin Matuska  * - If the D (dedup) bit is not set in the block pointer, it means that
672a58b312SMartin Matuska  *   the block is not in the dedup table (DDT) and we won't consult the DDT
682a58b312SMartin Matuska  *   when we need to free the block. Block Cloning must be consulted on every
692a58b312SMartin Matuska  *   free, because we cannot modify the source BP (eg. by setting something
702a58b312SMartin Matuska  *   similar to the D bit), thus we have no hint if the block is in the
712a58b312SMartin Matuska  *   Block Reference Table (BRT), so we need to look into the BRT. There is
722a58b312SMartin Matuska  *   an optimization in place that allows us to eliminate the majority of BRT
732a58b312SMartin Matuska  *   lookups which is described below in the "Minimizing free penalty" section.
742a58b312SMartin Matuska  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
752a58b312SMartin Matuska  *   64bit offset and 64bit reference counter.
762a58b312SMartin Matuska  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
772a58b312SMartin Matuska  *   other on disk are most likely in totally different parts of the DDT.
782a58b312SMartin Matuska  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
792a58b312SMartin Matuska  *   from one file should have BRT entries close to each other.
802a58b312SMartin Matuska  * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately this is not currently (if at all) possible
 *   with Block Cloning, so a block referenced multiple times will be scrubbed
832a58b312SMartin Matuska  *   multiple times. The new, sorted scrub should be able to eliminate
842a58b312SMartin Matuska  *   duplicated reads given enough memory.
852a58b312SMartin Matuska  * - Deduplication requires cryptographically strong hash as a checksum or
862a58b312SMartin Matuska  *   additional data verification. Block Cloning works with any checksum
872a58b312SMartin Matuska  *   algorithm or even with checksumming disabled.
882a58b312SMartin Matuska  *
892a58b312SMartin Matuska  * As mentioned above, the BRT entries are much smaller than the DDT entries.
902a58b312SMartin Matuska  * To uniquely identify a block we just need its vdev id and offset. We also
912a58b312SMartin Matuska  * need to maintain a reference counter. The vdev id will often repeat, as there
922a58b312SMartin Matuska  * is a small number of top-level VDEVs and a large number of blocks stored in
932a58b312SMartin Matuska  * each VDEV. We take advantage of that to reduce the BRT entry size further by
942a58b312SMartin Matuska  * maintaining one BRT for each top-level VDEV, so we can then have only offset
952a58b312SMartin Matuska  * and counter as the BRT entry.
962a58b312SMartin Matuska  *
972a58b312SMartin Matuska  * Minimizing free penalty.
982a58b312SMartin Matuska  *
992a58b312SMartin Matuska  * Block Cloning allows creating additional references to any existing block.
1002a58b312SMartin Matuska  * When we free a block there is no hint in the block pointer whether the block
1012a58b312SMartin Matuska  * was cloned or not, so on each free we have to check if there is a
1022a58b312SMartin Matuska  * corresponding entry in the BRT or not. If there is, we need to decrease
1032a58b312SMartin Matuska  * the reference counter. Doing BRT lookup on every free can potentially be
1042a58b312SMartin Matuska  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
1052a58b312SMartin Matuska  * This is the main problem with deduplication, so we've learned our lesson and
1062a58b312SMartin Matuska  * try not to repeat the same mistake here. How do we do that? We divide each
1072a58b312SMartin Matuska  * top-level VDEV into 16MB regions. For each region we maintain a counter that
1082a58b312SMartin Matuska  * is a sum of all the BRT entries that have offsets within the region. This
1092a58b312SMartin Matuska  * creates the entries count array of 16bit numbers for each top-level VDEV.
1102a58b312SMartin Matuska  * The entries count array is always kept in memory and updated on disk in the
1112a58b312SMartin Matuska  * same transaction group as the BRT updates to keep everything in-sync. We can
1122a58b312SMartin Matuska  * keep the array in memory, because it is very small. With 16MB regions and
1132a58b312SMartin Matuska  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
1142a58b312SMartin Matuska  * the region size even further in the future). Now, when we want to free
1152a58b312SMartin Matuska  * a block, we first consult the array. If the counter for the whole region is
1162a58b312SMartin Matuska  * zero, there is no need to look for the BRT entry, as there isn't one for
1172a58b312SMartin Matuska  * sure. If the counter for the region is greater than zero, only then we will
1182a58b312SMartin Matuska  * do a BRT lookup and if an entry is found we will decrease the reference
1192a58b312SMartin Matuska  * counter in the BRT entry and in the entry counters array.
1202a58b312SMartin Matuska  *
1212a58b312SMartin Matuska  * The entry counters array is small, but can potentially be larger for very
1222a58b312SMartin Matuska  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
 * array on every change. We then divide the array into 32kB blocks and keep
1242a58b312SMartin Matuska  * a bitmap of dirty blocks within a transaction group. When we sync the
1252a58b312SMartin Matuska  * transaction group we can only update the parts of the entry counters array
1262a58b312SMartin Matuska  * that were modified. Note: Keeping track of the dirty parts of the entry
1272a58b312SMartin Matuska  * counters array is implemented, but updating only parts of the array on disk
1282a58b312SMartin Matuska  * is not yet implemented - for now we will update entire array if there was
1292a58b312SMartin Matuska  * any change.
1302a58b312SMartin Matuska  *
1312a58b312SMartin Matuska  * The implementation tries to be economic: if BRT is not used, or no longer
1322a58b312SMartin Matuska  * used, there will be no entries in the MOS and no additional memory used (eg.
1332a58b312SMartin Matuska  * the entry counters array is only allocated if needed).
1342a58b312SMartin Matuska  *
1352a58b312SMartin Matuska  * Interaction between Deduplication and Block Cloning.
1362a58b312SMartin Matuska  *
1372a58b312SMartin Matuska  * If both functionalities are in use, we could end up with a block that is
1382a58b312SMartin Matuska  * referenced multiple times in both DDT and BRT. When we free one of the
1392a58b312SMartin Matuska  * references we couldn't tell where it belongs, so we would have to decide
1402a58b312SMartin Matuska  * what table takes the precedence: do we first clear DDT references or BRT
1412a58b312SMartin Matuska  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
1422a58b312SMartin Matuska  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
1432a58b312SMartin Matuska  * lookup DDT entry instead and increase the counter there. No BRT entry
1442a58b312SMartin Matuska  * will be created for a block which has the D (dedup) bit set.
1452a58b312SMartin Matuska  * BRT may be more efficient for manual deduplication, but if the block is
1462a58b312SMartin Matuska  * already in the DDT, then creating additional BRT entry would be less
1472a58b312SMartin Matuska  * efficient. This clever idea was proposed by Allan Jude.
1482a58b312SMartin Matuska  *
1492a58b312SMartin Matuska  * Block Cloning across datasets.
1502a58b312SMartin Matuska  *
1512a58b312SMartin Matuska  * Block Cloning is not limited to cloning blocks within the same dataset.
1522a58b312SMartin Matuska  * It is possible (and very useful) to clone blocks between different datasets.
1532a58b312SMartin Matuska  * One use case is recovering files from snapshots. By cloning the files into
1542a58b312SMartin Matuska  * dataset we need no additional storage. Without Block Cloning we would need
1552a58b312SMartin Matuska  * additional space for those files.
1562a58b312SMartin Matuska  * Another interesting use case is moving the files between datasets
1572a58b312SMartin Matuska  * (copying the file content to the new dataset and removing the source file).
1582a58b312SMartin Matuska  * In that case Block Cloning will only be used briefly, because the BRT entries
1592a58b312SMartin Matuska  * will be removed when the source is removed.
1603494f7c0SMartin Matuska  * Block Cloning across encrypted datasets is supported as long as both
1613494f7c0SMartin Matuska  * datasets share the same master key (e.g. snapshots and clones)
1622a58b312SMartin Matuska  *
1632a58b312SMartin Matuska  * Block Cloning flow through ZFS layers.
1642a58b312SMartin Matuska  *
1652a58b312SMartin Matuska  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
1662a58b312SMartin Matuska  * blocks. As of this writing no interface is implemented that allows for block
1672a58b312SMartin Matuska  * cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
1702a58b312SMartin Matuska  *
1712a58b312SMartin Matuska  *	ssize_t
1722a58b312SMartin Matuska  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
1732a58b312SMartin Matuska  *	                size_t len, unsigned int flags);
1742a58b312SMartin Matuska  *
1752a58b312SMartin Matuska  * Even though offsets and length represent bytes, they have to be
176315ee00fSMartin Matuska  * block-aligned or we will return an error so the upper layer can
1772a58b312SMartin Matuska  * fallback to the generic mechanism that will just copy the data.
1782a58b312SMartin Matuska  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
1792a58b312SMartin Matuska  * This function was implemented based on zfs_write(), but instead of writing
1802a58b312SMartin Matuska  * the given data we first read block pointers using the new dmu_read_l0_bps()
1812a58b312SMartin Matuska  * function from the source file. Once we have BPs from the source file we call
1822a58b312SMartin Matuska  * the dmu_brt_clone() function on the destination file. This function
1832a58b312SMartin Matuska  * allocates BPs for us. We iterate over all source BPs. If the given BP is
1842a58b312SMartin Matuska  * a hole or an embedded block, we just copy BP as-is. If it points to a real
1852a58b312SMartin Matuska  * data we place this BP on a BRT pending list using the brt_pending_add()
1862a58b312SMartin Matuska  * function.
1872a58b312SMartin Matuska  *
1882a58b312SMartin Matuska  * We use this pending list to keep track of all BPs that got new references
1892a58b312SMartin Matuska  * within this transaction group.
1902a58b312SMartin Matuska  *
1912a58b312SMartin Matuska  * Some special cases to consider and how we address them:
1922a58b312SMartin Matuska  * - The block we want to clone may have been created within the same
1932a58b312SMartin Matuska  *   transaction group that we are trying to clone. Such block has no BP
194315ee00fSMartin Matuska  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
1952a58b312SMartin Matuska  * - The block we want to clone may have been modified within the same
196315ee00fSMartin Matuska  *   transaction group. We return EAGAIN.
1972a58b312SMartin Matuska  * - A block may be cloned multiple times during one transaction group (that's
1982a58b312SMartin Matuska  *   why pending list is actually a tree and not an append-only list - this
1992a58b312SMartin Matuska  *   way we can figure out faster if this block is cloned for the first time
2002a58b312SMartin Matuska  *   in this txg or consecutive time).
2012a58b312SMartin Matuska  * - A block may be cloned and freed within the same transaction group
2022a58b312SMartin Matuska  *   (see dbuf_undirty()).
2032a58b312SMartin Matuska  * - A block may be cloned and within the same transaction group the clone
2042a58b312SMartin Matuska  *   can be cloned again (see dmu_read_l0_bps()).
2052a58b312SMartin Matuska  * - A file might have been deleted, but the caller still has a file descriptor
2062a58b312SMartin Matuska  *   open to this file and clones it.
2072a58b312SMartin Matuska  *
2082a58b312SMartin Matuska  * When we free a block we have an additional step in the ZIO pipeline where we
2092a58b312SMartin Matuska  * call the zio_brt_free() function. We then call the brt_entry_decref()
2102a58b312SMartin Matuska  * that loads the corresponding BRT entry (if one exists) and decreases
2112a58b312SMartin Matuska  * reference counter. If this is not the last reference we will stop ZIO
2122a58b312SMartin Matuska  * pipeline here. If this is the last reference or the block is not in the
2132a58b312SMartin Matuska  * BRT, we continue the pipeline and free the block as usual.
2142a58b312SMartin Matuska  *
2152a58b312SMartin Matuska  * At the beginning of spa_sync() where there can be no more block cloning,
2162a58b312SMartin Matuska  * but before issuing frees we call brt_pending_apply(). This function applies
2172a58b312SMartin Matuska  * all the new clones to the BRT table - we load BRT entries and update
2182a58b312SMartin Matuska  * reference counters. To sync new BRT entries to disk, we use brt_sync()
2192a58b312SMartin Matuska  * function. This function will sync all dirty per-top-level-vdev BRTs,
2202a58b312SMartin Matuska  * the entry counters arrays, etc.
2212a58b312SMartin Matuska  *
2222a58b312SMartin Matuska  * Block Cloning and ZIL.
2232a58b312SMartin Matuska  *
2242a58b312SMartin Matuska  * Every clone operation is divided into chunks (similar to write) and each
2252a58b312SMartin Matuska  * chunk is cloned in a separate transaction. The chunk size is determined by
2262a58b312SMartin Matuska  * how many BPs we can fit into a single ZIL entry.
2272a58b312SMartin Matuska  * Replaying clone operation is different from the regular clone operation,
2282a58b312SMartin Matuska  * as when we log clone operations we cannot use the source object - it may
2292a58b312SMartin Matuska  * reside on a different dataset, so we log BPs we want to clone.
2302a58b312SMartin Matuska  * The ZIL is replayed when we mount the given dataset, not when the pool is
2312a58b312SMartin Matuska  * imported. Taking this into account it is possible that the pool is imported
2322a58b312SMartin Matuska  * without mounting datasets and the source dataset is destroyed before the
2332a58b312SMartin Matuska  * destination dataset is mounted and its ZIL replayed.
2342a58b312SMartin Matuska  * To address this situation we leverage zil_claim() mechanism where ZFS will
2352a58b312SMartin Matuska  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236525fe93dSMartin Matuska  * entries, we will bump reference counters for their BPs in the BRT.  Then
237525fe93dSMartin Matuska  * on mount and ZIL replay we bump the reference counters once more, while the
238525fe93dSMartin Matuska  * first references are dropped during ZIL destroy by zil_free_clone_range().
239525fe93dSMartin Matuska  * It is possible that after zil_claim() we never mount the destination, so
240525fe93dSMartin Matuska  * we never replay its ZIL and just destroy it.  In this case the only taken
241525fe93dSMartin Matuska  * references will be dropped by zil_free_clone_range(), since the cloning is
242525fe93dSMartin Matuska  * not going to ever take place.
2432a58b312SMartin Matuska  */
2442a58b312SMartin Matuska 
2452a58b312SMartin Matuska static kmem_cache_t *brt_entry_cache;
2462a58b312SMartin Matuska 
2472a58b312SMartin Matuska /*
2482a58b312SMartin Matuska  * Enable/disable prefetching of BRT entries that we are going to modify.
2492a58b312SMartin Matuska  */
250783d3ff6SMartin Matuska static int brt_zap_prefetch = 1;
2512a58b312SMartin Matuska 
/*
 * BRT_DEBUG(fmt, ...) - BRT-specific debug output.
 *
 * With ZFS_DEBUG defined, the message is handed to __dprintf() together
 * with the current file, function and line, but only when the
 * ZFS_DEBUG_BRT bit is set in zfs_flags.  Without ZFS_DEBUG the macro
 * expands to an empty statement and its arguments are not evaluated.
 */
#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)							\
	do {								\
		if (zfs_flags & ZFS_DEBUG_BRT) {			\
			__dprintf(B_TRUE, __FILE__, __func__, __LINE__,	\
			    __VA_ARGS__);				\
		}							\
	} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif
2612a58b312SMartin Matuska 
262783d3ff6SMartin Matuska static int brt_zap_default_bs = 12;
263783d3ff6SMartin Matuska static int brt_zap_default_ibs = 12;
2642a58b312SMartin Matuska 
2652a58b312SMartin Matuska static kstat_t	*brt_ksp;
2662a58b312SMartin Matuska 
2672a58b312SMartin Matuska typedef struct brt_stats {
2682a58b312SMartin Matuska 	kstat_named_t brt_addref_entry_not_on_disk;
2692a58b312SMartin Matuska 	kstat_named_t brt_addref_entry_on_disk;
2702a58b312SMartin Matuska 	kstat_named_t brt_decref_entry_in_memory;
2712a58b312SMartin Matuska 	kstat_named_t brt_decref_entry_loaded_from_disk;
2722a58b312SMartin Matuska 	kstat_named_t brt_decref_entry_not_in_memory;
2732a58b312SMartin Matuska 	kstat_named_t brt_decref_entry_read_lost_race;
2742a58b312SMartin Matuska 	kstat_named_t brt_decref_entry_still_referenced;
2752a58b312SMartin Matuska 	kstat_named_t brt_decref_free_data_later;
2762a58b312SMartin Matuska 	kstat_named_t brt_decref_free_data_now;
2772a58b312SMartin Matuska 	kstat_named_t brt_decref_no_entry;
2782a58b312SMartin Matuska } brt_stats_t;
2792a58b312SMartin Matuska 
2802a58b312SMartin Matuska static brt_stats_t brt_stats = {
2812a58b312SMartin Matuska 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
2822a58b312SMartin Matuska 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
2832a58b312SMartin Matuska 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
2842a58b312SMartin Matuska 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
2852a58b312SMartin Matuska 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
2862a58b312SMartin Matuska 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
2872a58b312SMartin Matuska 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
2882a58b312SMartin Matuska 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
2892a58b312SMartin Matuska 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
2902a58b312SMartin Matuska 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
2912a58b312SMartin Matuska };
2922a58b312SMartin Matuska 
2932a58b312SMartin Matuska struct {
2942a58b312SMartin Matuska 	wmsum_t brt_addref_entry_not_on_disk;
2952a58b312SMartin Matuska 	wmsum_t brt_addref_entry_on_disk;
2962a58b312SMartin Matuska 	wmsum_t brt_decref_entry_in_memory;
2972a58b312SMartin Matuska 	wmsum_t brt_decref_entry_loaded_from_disk;
2982a58b312SMartin Matuska 	wmsum_t brt_decref_entry_not_in_memory;
2992a58b312SMartin Matuska 	wmsum_t brt_decref_entry_read_lost_race;
3002a58b312SMartin Matuska 	wmsum_t brt_decref_entry_still_referenced;
3012a58b312SMartin Matuska 	wmsum_t brt_decref_free_data_later;
3022a58b312SMartin Matuska 	wmsum_t brt_decref_free_data_now;
3032a58b312SMartin Matuska 	wmsum_t brt_decref_no_entry;
3042a58b312SMartin Matuska } brt_sums;
3052a58b312SMartin Matuska 
/*
 * Add one to the brt_sums counter named by 'stat' (a member name of
 * brt_sums, e.g. brt_decref_no_entry).
 */
#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
3072a58b312SMartin Matuska 
3082a58b312SMartin Matuska static int brt_entry_compare(const void *x1, const void *x2);
309718519f4SMartin Matuska static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
3102a58b312SMartin Matuska 
3112a58b312SMartin Matuska static void
312718519f4SMartin Matuska brt_rlock(spa_t *spa)
3132a58b312SMartin Matuska {
314718519f4SMartin Matuska 	rw_enter(&spa->spa_brt_lock, RW_READER);
3152a58b312SMartin Matuska }
3162a58b312SMartin Matuska 
3172a58b312SMartin Matuska static void
318718519f4SMartin Matuska brt_wlock(spa_t *spa)
3192a58b312SMartin Matuska {
320718519f4SMartin Matuska 	rw_enter(&spa->spa_brt_lock, RW_WRITER);
3212a58b312SMartin Matuska }
3222a58b312SMartin Matuska 
3232a58b312SMartin Matuska static void
324718519f4SMartin Matuska brt_unlock(spa_t *spa)
3252a58b312SMartin Matuska {
326718519f4SMartin Matuska 	rw_exit(&spa->spa_brt_lock);
3272a58b312SMartin Matuska }
3282a58b312SMartin Matuska 
3292a58b312SMartin Matuska static uint16_t
3302a58b312SMartin Matuska brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
3312a58b312SMartin Matuska {
3322a58b312SMartin Matuska 
3332a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
3342a58b312SMartin Matuska 
3353494f7c0SMartin Matuska 	if (unlikely(brtvd->bv_need_byteswap)) {
3362a58b312SMartin Matuska 		return (BSWAP_16(brtvd->bv_entcount[idx]));
3372a58b312SMartin Matuska 	} else {
3382a58b312SMartin Matuska 		return (brtvd->bv_entcount[idx]);
3392a58b312SMartin Matuska 	}
3402a58b312SMartin Matuska }
3412a58b312SMartin Matuska 
3422a58b312SMartin Matuska static void
3432a58b312SMartin Matuska brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
3442a58b312SMartin Matuska {
3452a58b312SMartin Matuska 
3462a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
3472a58b312SMartin Matuska 
3483494f7c0SMartin Matuska 	if (unlikely(brtvd->bv_need_byteswap)) {
3492a58b312SMartin Matuska 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
3502a58b312SMartin Matuska 	} else {
3512a58b312SMartin Matuska 		brtvd->bv_entcount[idx] = entcnt;
3522a58b312SMartin Matuska 	}
3532a58b312SMartin Matuska }
3542a58b312SMartin Matuska 
3552a58b312SMartin Matuska static void
3562a58b312SMartin Matuska brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
3572a58b312SMartin Matuska {
3582a58b312SMartin Matuska 	uint16_t entcnt;
3592a58b312SMartin Matuska 
3602a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
3612a58b312SMartin Matuska 
3622a58b312SMartin Matuska 	entcnt = brt_vdev_entcount_get(brtvd, idx);
3632a58b312SMartin Matuska 	ASSERT(entcnt < UINT16_MAX);
3642a58b312SMartin Matuska 
3652a58b312SMartin Matuska 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
3662a58b312SMartin Matuska }
3672a58b312SMartin Matuska 
3682a58b312SMartin Matuska static void
3692a58b312SMartin Matuska brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
3702a58b312SMartin Matuska {
3712a58b312SMartin Matuska 	uint16_t entcnt;
3722a58b312SMartin Matuska 
3732a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
3742a58b312SMartin Matuska 
3752a58b312SMartin Matuska 	entcnt = brt_vdev_entcount_get(brtvd, idx);
3762a58b312SMartin Matuska 	ASSERT(entcnt > 0);
3772a58b312SMartin Matuska 
3782a58b312SMartin Matuska 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
3792a58b312SMartin Matuska }
3802a58b312SMartin Matuska 
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: write the state of one per-vdev BRT to the ZFS
 * debug log with zfs_dbgmsg().
 *
 * Output consists of:
 *  - one summary line (vdev id, dirty flags, size, total count, number
 *    of bitmap blocks and bitmap size);
 *  - every non-zero bv_entcount[] slot, when bv_totalcount is non-zero;
 *  - the bv_bitmap contents as a string of 'x' (bit set) and '.'
 *    (bit clear), when bv_entcount_dirty is set.
 */
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	const uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);

	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)nblocks,
	    (size_t)BT_SIZEOFMAP(nblocks));

	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg("    entcounts:");
		for (uint64_t slot = 0; slot < brtvd->bv_size; slot++) {
			uint16_t entcnt = brt_vdev_entcount_get(brtvd, slot);

			/* Only slots that hold something are reported. */
			if (entcnt == 0)
				continue;
			zfs_dbgmsg("      [%04llu] %hu",
			    (u_longlong_t)slot, entcnt);
		}
	}

	if (!brtvd->bv_entcount_dirty)
		return;

	/* One character per bitmap bit plus the terminating NUL. */
	char *dirty = kmem_alloc(nblocks + 1, KM_SLEEP);

	for (uint64_t bit = 0; bit < nblocks; bit++)
		dirty[bit] = BT_TEST(brtvd->bv_bitmap, bit) ? 'x' : '.';
	dirty[nblocks] = '\0';

	zfs_dbgmsg("    dirty: %s", dirty);
	kmem_free(dirty, nblocks + 1);
}
#endif
4202a58b312SMartin Matuska 
4212a58b312SMartin Matuska static brt_vdev_t *
422718519f4SMartin Matuska brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
4232a58b312SMartin Matuska {
424718519f4SMartin Matuska 	brt_vdev_t *brtvd = NULL;
4252a58b312SMartin Matuska 
426718519f4SMartin Matuska 	brt_rlock(spa);
427718519f4SMartin Matuska 	if (vdevid < spa->spa_brt_nvdevs) {
428718519f4SMartin Matuska 		brtvd = spa->spa_brt_vdevs[vdevid];
429718519f4SMartin Matuska 	} else if (alloc) {
430718519f4SMartin Matuska 		/* New VDEV was added. */
431718519f4SMartin Matuska 		brt_unlock(spa);
432718519f4SMartin Matuska 		brt_wlock(spa);
433718519f4SMartin Matuska 		if (vdevid >= spa->spa_brt_nvdevs)
434718519f4SMartin Matuska 			brt_vdevs_expand(spa, vdevid + 1);
435718519f4SMartin Matuska 		brtvd = spa->spa_brt_vdevs[vdevid];
4362a58b312SMartin Matuska 	}
437718519f4SMartin Matuska 	brt_unlock(spa);
4382a58b312SMartin Matuska 	return (brtvd);
4392a58b312SMartin Matuska }
4402a58b312SMartin Matuska 
4412a58b312SMartin Matuska static void
442718519f4SMartin Matuska brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
4432a58b312SMartin Matuska {
4442a58b312SMartin Matuska 	char name[64];
4452a58b312SMartin Matuska 
446718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
4472a58b312SMartin Matuska 	ASSERT0(brtvd->bv_mos_brtvdev);
4482a58b312SMartin Matuska 	ASSERT0(brtvd->bv_mos_entries);
4492a58b312SMartin Matuska 
450718519f4SMartin Matuska 	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
4512a58b312SMartin Matuska 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
452783d3ff6SMartin Matuska 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
453718519f4SMartin Matuska 	VERIFY(mos_entries != 0);
454718519f4SMartin Matuska 	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
455718519f4SMartin Matuska 	    &brtvd->bv_mos_entries_dnode));
456718519f4SMartin Matuska 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
457718519f4SMartin Matuska 	brtvd->bv_mos_entries = mos_entries;
458718519f4SMartin Matuska 	rw_exit(&brtvd->bv_mos_entries_lock);
4592a58b312SMartin Matuska 	BRT_DEBUG("MOS entries created, object=%llu",
4602a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_mos_entries);
4612a58b312SMartin Matuska 
4622a58b312SMartin Matuska 	/*
4632a58b312SMartin Matuska 	 * We allocate DMU buffer to store the bv_entcount[] array.
4642a58b312SMartin Matuska 	 * We will keep array size (bv_size) and cummulative count for all
4652a58b312SMartin Matuska 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
4662a58b312SMartin Matuska 	 */
467718519f4SMartin Matuska 	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
4682a58b312SMartin Matuska 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
4692a58b312SMartin Matuska 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
4702a58b312SMartin Matuska 	VERIFY(brtvd->bv_mos_brtvdev != 0);
4712a58b312SMartin Matuska 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
4722a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
4732a58b312SMartin Matuska 
4742a58b312SMartin Matuska 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
4752a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_vdevid);
476718519f4SMartin Matuska 	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
4772a58b312SMartin Matuska 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
4782a58b312SMartin Matuska 	BRT_DEBUG("Pool directory object created, object=%s", name);
4792a58b312SMartin Matuska 
480718519f4SMartin Matuska 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
4812a58b312SMartin Matuska }
4822a58b312SMartin Matuska 
4832a58b312SMartin Matuska static void
484718519f4SMartin Matuska brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
4852a58b312SMartin Matuska {
4862a58b312SMartin Matuska 	vdev_t *vd;
4872a58b312SMartin Matuska 	uint16_t *entcount;
4882a58b312SMartin Matuska 	ulong_t *bitmap;
489718519f4SMartin Matuska 	uint64_t nblocks, onblocks, size;
4902a58b312SMartin Matuska 
491718519f4SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
4922a58b312SMartin Matuska 
493718519f4SMartin Matuska 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
494718519f4SMartin Matuska 	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
495718519f4SMartin Matuska 	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
496718519f4SMartin Matuska 	spa_config_exit(spa, SCL_VDEV, FTAG);
4972a58b312SMartin Matuska 
498315ee00fSMartin Matuska 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
4992a58b312SMartin Matuska 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
5002a58b312SMartin Matuska 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
5012a58b312SMartin Matuska 
5022a58b312SMartin Matuska 	if (!brtvd->bv_initiated) {
5032a58b312SMartin Matuska 		ASSERT0(brtvd->bv_size);
504718519f4SMartin Matuska 		ASSERT0P(brtvd->bv_entcount);
505718519f4SMartin Matuska 		ASSERT0P(brtvd->bv_bitmap);
5062a58b312SMartin Matuska 	} else {
5072a58b312SMartin Matuska 		ASSERT(brtvd->bv_size > 0);
5082a58b312SMartin Matuska 		ASSERT(brtvd->bv_entcount != NULL);
5092a58b312SMartin Matuska 		ASSERT(brtvd->bv_bitmap != NULL);
5102a58b312SMartin Matuska 		/*
5112a58b312SMartin Matuska 		 * TODO: Allow vdev shrinking. We only need to implement
5122a58b312SMartin Matuska 		 * shrinking the on-disk BRT VDEV object.
513718519f4SMartin Matuska 		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
514718519f4SMartin Matuska 		 *     offset, size, tx);
5152a58b312SMartin Matuska 		 */
5162a58b312SMartin Matuska 		ASSERT3U(brtvd->bv_size, <=, size);
5172a58b312SMartin Matuska 
5182a58b312SMartin Matuska 		memcpy(entcount, brtvd->bv_entcount,
5192a58b312SMartin Matuska 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
520315ee00fSMartin Matuska 		vmem_free(brtvd->bv_entcount,
5212a58b312SMartin Matuska 		    sizeof (entcount[0]) * brtvd->bv_size);
522718519f4SMartin Matuska 		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
523718519f4SMartin Matuska 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
524718519f4SMartin Matuska 		    BT_SIZEOFMAP(onblocks)));
525718519f4SMartin Matuska 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
5262a58b312SMartin Matuska 	}
5272a58b312SMartin Matuska 
5282a58b312SMartin Matuska 	brtvd->bv_size = size;
5292a58b312SMartin Matuska 	brtvd->bv_entcount = entcount;
5302a58b312SMartin Matuska 	brtvd->bv_bitmap = bitmap;
5312a58b312SMartin Matuska 	if (!brtvd->bv_initiated) {
5322a58b312SMartin Matuska 		brtvd->bv_need_byteswap = FALSE;
5332a58b312SMartin Matuska 		brtvd->bv_initiated = TRUE;
5342a58b312SMartin Matuska 		BRT_DEBUG("BRT VDEV %llu initiated.",
5352a58b312SMartin Matuska 		    (u_longlong_t)brtvd->bv_vdevid);
5362a58b312SMartin Matuska 	}
5372a58b312SMartin Matuska }
5382a58b312SMartin Matuska 
539718519f4SMartin Matuska static int
540718519f4SMartin Matuska brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
5412a58b312SMartin Matuska {
5422a58b312SMartin Matuska 	dmu_buf_t *db;
5432a58b312SMartin Matuska 	brt_vdev_phys_t *bvphys;
5442a58b312SMartin Matuska 	int error;
5452a58b312SMartin Matuska 
546718519f4SMartin Matuska 	ASSERT(!brtvd->bv_initiated);
5472a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_brtvdev != 0);
5482a58b312SMartin Matuska 
549718519f4SMartin Matuska 	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
550718519f4SMartin Matuska 	    FTAG, &db);
5512a58b312SMartin Matuska 	if (error != 0)
552718519f4SMartin Matuska 		return (error);
5532a58b312SMartin Matuska 
5542a58b312SMartin Matuska 	bvphys = db->db_data;
555718519f4SMartin Matuska 	if (spa->spa_brt_rangesize == 0) {
556718519f4SMartin Matuska 		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
5572a58b312SMartin Matuska 	} else {
558718519f4SMartin Matuska 		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
5592a58b312SMartin Matuska 	}
5602a58b312SMartin Matuska 
561718519f4SMartin Matuska 	brt_vdev_realloc(spa, brtvd);
5622a58b312SMartin Matuska 
5632a58b312SMartin Matuska 	/* TODO: We don't support VDEV shrinking. */
5642a58b312SMartin Matuska 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
5652a58b312SMartin Matuska 
5662a58b312SMartin Matuska 	/*
5672a58b312SMartin Matuska 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
5682a58b312SMartin Matuska 	 */
569718519f4SMartin Matuska 	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
5702a58b312SMartin Matuska 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
5712a58b312SMartin Matuska 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
572718519f4SMartin Matuska 	if (error != 0)
573718519f4SMartin Matuska 		return (error);
5742a58b312SMartin Matuska 
575718519f4SMartin Matuska 	ASSERT(bvphys->bvp_mos_entries != 0);
576718519f4SMartin Matuska 	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
577718519f4SMartin Matuska 	    &brtvd->bv_mos_entries_dnode));
578718519f4SMartin Matuska 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
5792a58b312SMartin Matuska 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
580718519f4SMartin Matuska 	rw_exit(&brtvd->bv_mos_entries_lock);
5812a58b312SMartin Matuska 	brtvd->bv_need_byteswap =
5822a58b312SMartin Matuska 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
5832a58b312SMartin Matuska 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
5842a58b312SMartin Matuska 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
5852a58b312SMartin Matuska 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
5862a58b312SMartin Matuska 
5872a58b312SMartin Matuska 	dmu_buf_rele(db, FTAG);
5882a58b312SMartin Matuska 
589718519f4SMartin Matuska 	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590718519f4SMartin Matuska 	    (u_longlong_t)brtvd->bv_vdevid,
591718519f4SMartin Matuska 	    (u_longlong_t)brtvd->bv_mos_brtvdev,
5922a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_mos_entries);
593718519f4SMartin Matuska 	return (0);
5942a58b312SMartin Matuska }
5952a58b312SMartin Matuska 
5962a58b312SMartin Matuska static void
597718519f4SMartin Matuska brt_vdev_dealloc(brt_vdev_t *brtvd)
5982a58b312SMartin Matuska {
599718519f4SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
6002a58b312SMartin Matuska 	ASSERT(brtvd->bv_initiated);
601718519f4SMartin Matuska 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
6022a58b312SMartin Matuska 
603315ee00fSMartin Matuska 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
6042a58b312SMartin Matuska 	brtvd->bv_entcount = NULL;
605718519f4SMartin Matuska 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
606718519f4SMartin Matuska 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
6072a58b312SMartin Matuska 	brtvd->bv_bitmap = NULL;
6082a58b312SMartin Matuska 
6092a58b312SMartin Matuska 	brtvd->bv_size = 0;
6102a58b312SMartin Matuska 
6112a58b312SMartin Matuska 	brtvd->bv_initiated = FALSE;
6122a58b312SMartin Matuska 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
6132a58b312SMartin Matuska }
6142a58b312SMartin Matuska 
6152a58b312SMartin Matuska static void
616718519f4SMartin Matuska brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
6172a58b312SMartin Matuska {
6182a58b312SMartin Matuska 	char name[64];
6192a58b312SMartin Matuska 	uint64_t count;
6202a58b312SMartin Matuska 
621718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
6222a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_brtvdev != 0);
6232a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_entries != 0);
624718519f4SMartin Matuska 	ASSERT0(brtvd->bv_totalcount);
625718519f4SMartin Matuska 	ASSERT0(brtvd->bv_usedspace);
626718519f4SMartin Matuska 	ASSERT0(brtvd->bv_savedspace);
6272a58b312SMartin Matuska 
628718519f4SMartin Matuska 	uint64_t mos_entries = brtvd->bv_mos_entries;
629718519f4SMartin Matuska 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
6302a58b312SMartin Matuska 	brtvd->bv_mos_entries = 0;
631718519f4SMartin Matuska 	rw_exit(&brtvd->bv_mos_entries_lock);
632718519f4SMartin Matuska 	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
633718519f4SMartin Matuska 	brtvd->bv_mos_entries_dnode = NULL;
634718519f4SMartin Matuska 	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
635718519f4SMartin Matuska 	ASSERT0(count);
636718519f4SMartin Matuska 	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
637718519f4SMartin Matuska 	BRT_DEBUG("MOS entries destroyed, object=%llu",
638718519f4SMartin Matuska 	    (u_longlong_t)mos_entries);
6392a58b312SMartin Matuska 
640718519f4SMartin Matuska 	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
641718519f4SMartin Matuska 	    tx));
6422a58b312SMartin Matuska 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
6432a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
6442a58b312SMartin Matuska 	brtvd->bv_mos_brtvdev = 0;
645718519f4SMartin Matuska 	brtvd->bv_entcount_dirty = FALSE;
6462a58b312SMartin Matuska 
6472a58b312SMartin Matuska 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
6482a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_vdevid);
649718519f4SMartin Matuska 	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
650718519f4SMartin Matuska 	    name, tx));
6512a58b312SMartin Matuska 	BRT_DEBUG("Pool directory object removed, object=%s", name);
6522a58b312SMartin Matuska 
653718519f4SMartin Matuska 	brtvd->bv_meta_dirty = FALSE;
6542a58b312SMartin Matuska 
655718519f4SMartin Matuska 	rw_enter(&brtvd->bv_lock, RW_WRITER);
656718519f4SMartin Matuska 	brt_vdev_dealloc(brtvd);
657718519f4SMartin Matuska 	rw_exit(&brtvd->bv_lock);
658718519f4SMartin Matuska 
659718519f4SMartin Matuska 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
6602a58b312SMartin Matuska }
6612a58b312SMartin Matuska 
6622a58b312SMartin Matuska static void
663718519f4SMartin Matuska brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
6642a58b312SMartin Matuska {
665718519f4SMartin Matuska 	brt_vdev_t **vdevs;
6662a58b312SMartin Matuska 
667718519f4SMartin Matuska 	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
668718519f4SMartin Matuska 	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
6692a58b312SMartin Matuska 
670718519f4SMartin Matuska 	if (nvdevs == spa->spa_brt_nvdevs)
671718519f4SMartin Matuska 		return;
6722a58b312SMartin Matuska 
673718519f4SMartin Matuska 	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
674718519f4SMartin Matuska 	if (spa->spa_brt_nvdevs > 0) {
675718519f4SMartin Matuska 		ASSERT(spa->spa_brt_vdevs != NULL);
676718519f4SMartin Matuska 
677718519f4SMartin Matuska 		memcpy(vdevs, spa->spa_brt_vdevs,
678718519f4SMartin Matuska 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
679718519f4SMartin Matuska 		kmem_free(spa->spa_brt_vdevs,
680718519f4SMartin Matuska 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
6812a58b312SMartin Matuska 	}
682718519f4SMartin Matuska 	spa->spa_brt_vdevs = vdevs;
6832a58b312SMartin Matuska 
684718519f4SMartin Matuska 	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
685718519f4SMartin Matuska 		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
686718519f4SMartin Matuska 		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
6872a58b312SMartin Matuska 		brtvd->bv_vdevid = vdevid;
6882a58b312SMartin Matuska 		brtvd->bv_initiated = FALSE;
689718519f4SMartin Matuska 		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
690718519f4SMartin Matuska 		avl_create(&brtvd->bv_tree, brt_entry_compare,
691718519f4SMartin Matuska 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
692718519f4SMartin Matuska 		for (int i = 0; i < TXG_SIZE; i++) {
693718519f4SMartin Matuska 			avl_create(&brtvd->bv_pending_tree[i],
694718519f4SMartin Matuska 			    brt_entry_compare, sizeof (brt_entry_t),
695718519f4SMartin Matuska 			    offsetof(brt_entry_t, bre_node));
696718519f4SMartin Matuska 		}
697718519f4SMartin Matuska 		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
698718519f4SMartin Matuska 		spa->spa_brt_vdevs[vdevid] = brtvd;
6992a58b312SMartin Matuska 	}
7002a58b312SMartin Matuska 
7012a58b312SMartin Matuska 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
702718519f4SMartin Matuska 	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
703718519f4SMartin Matuska 	spa->spa_brt_nvdevs = nvdevs;
7042a58b312SMartin Matuska }
7052a58b312SMartin Matuska 
7062a58b312SMartin Matuska static boolean_t
707718519f4SMartin Matuska brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
7082a58b312SMartin Matuska {
709718519f4SMartin Matuska 	uint64_t idx = offset / spa->spa_brt_rangesize;
710718519f4SMartin Matuska 	if (idx < brtvd->bv_size) {
7112a58b312SMartin Matuska 		/* VDEV wasn't expanded. */
7122a58b312SMartin Matuska 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
7132a58b312SMartin Matuska 	}
7142a58b312SMartin Matuska 	return (FALSE);
7152a58b312SMartin Matuska }
7162a58b312SMartin Matuska 
7172a58b312SMartin Matuska static void
718718519f4SMartin Matuska brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
719718519f4SMartin Matuska     uint64_t dsize, uint64_t count)
7202a58b312SMartin Matuska {
7212a58b312SMartin Matuska 	uint64_t idx;
7222a58b312SMartin Matuska 
723718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
7242a58b312SMartin Matuska 
725718519f4SMartin Matuska 	brtvd->bv_savedspace += dsize * count;
7262a58b312SMartin Matuska 	brtvd->bv_meta_dirty = TRUE;
7272a58b312SMartin Matuska 
728718519f4SMartin Matuska 	if (bre->bre_count > 0)
7292a58b312SMartin Matuska 		return;
7302a58b312SMartin Matuska 
7312a58b312SMartin Matuska 	brtvd->bv_usedspace += dsize;
7322a58b312SMartin Matuska 
733718519f4SMartin Matuska 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
7342a58b312SMartin Matuska 	if (idx >= brtvd->bv_size) {
7352a58b312SMartin Matuska 		/* VDEV has been expanded. */
736718519f4SMartin Matuska 		rw_enter(&brtvd->bv_lock, RW_WRITER);
737718519f4SMartin Matuska 		brt_vdev_realloc(spa, brtvd);
738718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
7392a58b312SMartin Matuska 	}
7402a58b312SMartin Matuska 
7412a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
7422a58b312SMartin Matuska 
7432a58b312SMartin Matuska 	brtvd->bv_totalcount++;
7442a58b312SMartin Matuska 	brt_vdev_entcount_inc(brtvd, idx);
7452a58b312SMartin Matuska 	brtvd->bv_entcount_dirty = TRUE;
7462a58b312SMartin Matuska 	idx = idx / BRT_BLOCKSIZE / 8;
7472a58b312SMartin Matuska 	BT_SET(brtvd->bv_bitmap, idx);
7482a58b312SMartin Matuska }
7492a58b312SMartin Matuska 
7502a58b312SMartin Matuska static void
751718519f4SMartin Matuska brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
7522a58b312SMartin Matuska     uint64_t dsize)
7532a58b312SMartin Matuska {
7542a58b312SMartin Matuska 	uint64_t idx;
7552a58b312SMartin Matuska 
756718519f4SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
757718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
7582a58b312SMartin Matuska 
7592a58b312SMartin Matuska 	brtvd->bv_savedspace -= dsize;
7602a58b312SMartin Matuska 	brtvd->bv_meta_dirty = TRUE;
7612a58b312SMartin Matuska 
762718519f4SMartin Matuska 	if (bre->bre_count > 0)
7632a58b312SMartin Matuska 		return;
7642a58b312SMartin Matuska 
7652a58b312SMartin Matuska 	brtvd->bv_usedspace -= dsize;
7662a58b312SMartin Matuska 
767718519f4SMartin Matuska 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
7682a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
7692a58b312SMartin Matuska 
7702a58b312SMartin Matuska 	ASSERT(brtvd->bv_totalcount > 0);
7712a58b312SMartin Matuska 	brtvd->bv_totalcount--;
7722a58b312SMartin Matuska 	brt_vdev_entcount_dec(brtvd, idx);
7732a58b312SMartin Matuska 	brtvd->bv_entcount_dirty = TRUE;
7742a58b312SMartin Matuska 	idx = idx / BRT_BLOCKSIZE / 8;
7752a58b312SMartin Matuska 	BT_SET(brtvd->bv_bitmap, idx);
7762a58b312SMartin Matuska }
7772a58b312SMartin Matuska 
7782a58b312SMartin Matuska static void
779718519f4SMartin Matuska brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
7802a58b312SMartin Matuska {
7812a58b312SMartin Matuska 	dmu_buf_t *db;
7822a58b312SMartin Matuska 	brt_vdev_phys_t *bvphys;
7832a58b312SMartin Matuska 
7842a58b312SMartin Matuska 	ASSERT(brtvd->bv_meta_dirty);
7852a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_brtvdev != 0);
7862a58b312SMartin Matuska 	ASSERT(dmu_tx_is_syncing(tx));
7872a58b312SMartin Matuska 
788718519f4SMartin Matuska 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
789718519f4SMartin Matuska 	    FTAG, &db));
7902a58b312SMartin Matuska 
7912a58b312SMartin Matuska 	if (brtvd->bv_entcount_dirty) {
7922a58b312SMartin Matuska 		/*
7932a58b312SMartin Matuska 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
7942a58b312SMartin Matuska 		 */
795718519f4SMartin Matuska 		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
7962a58b312SMartin Matuska 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
7972a58b312SMartin Matuska 		    brtvd->bv_entcount, tx);
798718519f4SMartin Matuska 		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
799718519f4SMartin Matuska 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
8002a58b312SMartin Matuska 		brtvd->bv_entcount_dirty = FALSE;
8012a58b312SMartin Matuska 	}
8022a58b312SMartin Matuska 
8032a58b312SMartin Matuska 	dmu_buf_will_dirty(db, tx);
8042a58b312SMartin Matuska 	bvphys = db->db_data;
8052a58b312SMartin Matuska 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
8062a58b312SMartin Matuska 	bvphys->bvp_size = brtvd->bv_size;
8072a58b312SMartin Matuska 	if (brtvd->bv_need_byteswap) {
8082a58b312SMartin Matuska 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
8092a58b312SMartin Matuska 	} else {
8102a58b312SMartin Matuska 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
8112a58b312SMartin Matuska 	}
8122a58b312SMartin Matuska 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
813718519f4SMartin Matuska 	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
8142a58b312SMartin Matuska 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
8152a58b312SMartin Matuska 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
8162a58b312SMartin Matuska 	dmu_buf_rele(db, FTAG);
8172a58b312SMartin Matuska 
8182a58b312SMartin Matuska 	brtvd->bv_meta_dirty = FALSE;
8192a58b312SMartin Matuska }
8202a58b312SMartin Matuska 
8212a58b312SMartin Matuska static void
822718519f4SMartin Matuska brt_vdevs_free(spa_t *spa)
8232a58b312SMartin Matuska {
824718519f4SMartin Matuska 	if (spa->spa_brt_vdevs == 0)
825718519f4SMartin Matuska 		return;
826718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
827718519f4SMartin Matuska 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
828718519f4SMartin Matuska 		rw_enter(&brtvd->bv_lock, RW_WRITER);
8292a58b312SMartin Matuska 		if (brtvd->bv_initiated)
830718519f4SMartin Matuska 			brt_vdev_dealloc(brtvd);
831718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
832718519f4SMartin Matuska 		rw_destroy(&brtvd->bv_lock);
833718519f4SMartin Matuska 		if (brtvd->bv_mos_entries != 0)
834718519f4SMartin Matuska 			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
835718519f4SMartin Matuska 		rw_destroy(&brtvd->bv_mos_entries_lock);
836718519f4SMartin Matuska 		avl_destroy(&brtvd->bv_tree);
837718519f4SMartin Matuska 		for (int i = 0; i < TXG_SIZE; i++)
838718519f4SMartin Matuska 			avl_destroy(&brtvd->bv_pending_tree[i]);
839718519f4SMartin Matuska 		mutex_destroy(&brtvd->bv_pending_lock);
840718519f4SMartin Matuska 		kmem_free(brtvd, sizeof (*brtvd));
8412a58b312SMartin Matuska 	}
842718519f4SMartin Matuska 	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
843718519f4SMartin Matuska 	    spa->spa_brt_nvdevs);
8442a58b312SMartin Matuska }
8452a58b312SMartin Matuska 
8462a58b312SMartin Matuska static void
8472a58b312SMartin Matuska brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
8482a58b312SMartin Matuska {
8492a58b312SMartin Matuska 
850718519f4SMartin Matuska 	bre->bre_bp = *bp;
851718519f4SMartin Matuska 	bre->bre_count = 0;
852718519f4SMartin Matuska 	bre->bre_pcount = 0;
8532a58b312SMartin Matuska 
8542a58b312SMartin Matuska 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
8552a58b312SMartin Matuska }
8562a58b312SMartin Matuska 
8572a58b312SMartin Matuska static int
858718519f4SMartin Matuska brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
8592a58b312SMartin Matuska {
860718519f4SMartin Matuska 	uint64_t off = BRE_OFFSET(bre);
8612a58b312SMartin Matuska 
862*dd215568SMartin Matuska 	if (brtvd->bv_mos_entries == 0)
863*dd215568SMartin Matuska 		return (SET_ERROR(ENOENT));
864*dd215568SMartin Matuska 
865718519f4SMartin Matuska 	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
866718519f4SMartin Matuska 	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
8672a58b312SMartin Matuska }
8682a58b312SMartin Matuska 
8692a58b312SMartin Matuska /*
8702a58b312SMartin Matuska  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
8712a58b312SMartin Matuska  * positive, but gives us quick answer if we should look into BRT, which
8722a58b312SMartin Matuska  * may require reads and thus will be more expensive.
8732a58b312SMartin Matuska  */
8742a58b312SMartin Matuska boolean_t
8752a58b312SMartin Matuska brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
8762a58b312SMartin Matuska {
8772a58b312SMartin Matuska 
878718519f4SMartin Matuska 	if (spa->spa_brt_nvdevs == 0)
879718519f4SMartin Matuska 		return (B_FALSE);
8802a58b312SMartin Matuska 
881718519f4SMartin Matuska 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
882718519f4SMartin Matuska 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
883718519f4SMartin Matuska 	if (brtvd == NULL || !brtvd->bv_initiated)
884718519f4SMartin Matuska 		return (FALSE);
8852a58b312SMartin Matuska 
886718519f4SMartin Matuska 	/*
887718519f4SMartin Matuska 	 * We don't need locks here, since bv_entcount pointer must be
888718519f4SMartin Matuska 	 * stable at this point, and we don't care about false positive
889718519f4SMartin Matuska 	 * races here, while false negative should be impossible, since
890718519f4SMartin Matuska 	 * all brt_vdev_addref() have already completed by this point.
891718519f4SMartin Matuska 	 */
892718519f4SMartin Matuska 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
893718519f4SMartin Matuska 	return (brt_vdev_lookup(spa, brtvd, off));
8942a58b312SMartin Matuska }
8952a58b312SMartin Matuska 
8962a58b312SMartin Matuska uint64_t
8972a58b312SMartin Matuska brt_get_dspace(spa_t *spa)
8982a58b312SMartin Matuska {
899718519f4SMartin Matuska 	if (spa->spa_brt_nvdevs == 0)
9002a58b312SMartin Matuska 		return (0);
9012a58b312SMartin Matuska 
902718519f4SMartin Matuska 	brt_rlock(spa);
903718519f4SMartin Matuska 	uint64_t s = 0;
904718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
905718519f4SMartin Matuska 		s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
906718519f4SMartin Matuska 	brt_unlock(spa);
907718519f4SMartin Matuska 	return (s);
9082a58b312SMartin Matuska }
9092a58b312SMartin Matuska 
9102a58b312SMartin Matuska uint64_t
9112a58b312SMartin Matuska brt_get_used(spa_t *spa)
9122a58b312SMartin Matuska {
913718519f4SMartin Matuska 	if (spa->spa_brt_nvdevs == 0)
9142a58b312SMartin Matuska 		return (0);
9152a58b312SMartin Matuska 
916718519f4SMartin Matuska 	brt_rlock(spa);
917718519f4SMartin Matuska 	uint64_t s = 0;
918718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
919718519f4SMartin Matuska 		s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
920718519f4SMartin Matuska 	brt_unlock(spa);
921718519f4SMartin Matuska 	return (s);
9222a58b312SMartin Matuska }
9232a58b312SMartin Matuska 
9242a58b312SMartin Matuska uint64_t
9252a58b312SMartin Matuska brt_get_saved(spa_t *spa)
9262a58b312SMartin Matuska {
927718519f4SMartin Matuska 	return (brt_get_dspace(spa));
9282a58b312SMartin Matuska }
9292a58b312SMartin Matuska 
9302a58b312SMartin Matuska uint64_t
9312a58b312SMartin Matuska brt_get_ratio(spa_t *spa)
9322a58b312SMartin Matuska {
933718519f4SMartin Matuska 	uint64_t used = brt_get_used(spa);
934718519f4SMartin Matuska 	if (used == 0)
9352a58b312SMartin Matuska 		return (100);
936718519f4SMartin Matuska 	return ((used + brt_get_saved(spa)) * 100 / used);
9372a58b312SMartin Matuska }
9382a58b312SMartin Matuska 
9392a58b312SMartin Matuska static int
9402a58b312SMartin Matuska brt_kstats_update(kstat_t *ksp, int rw)
9412a58b312SMartin Matuska {
9422a58b312SMartin Matuska 	brt_stats_t *bs = ksp->ks_data;
9432a58b312SMartin Matuska 
9442a58b312SMartin Matuska 	if (rw == KSTAT_WRITE)
9452a58b312SMartin Matuska 		return (EACCES);
9462a58b312SMartin Matuska 
9472a58b312SMartin Matuska 	bs->brt_addref_entry_not_on_disk.value.ui64 =
9482a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
9492a58b312SMartin Matuska 	bs->brt_addref_entry_on_disk.value.ui64 =
9502a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
9512a58b312SMartin Matuska 	bs->brt_decref_entry_in_memory.value.ui64 =
9522a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
9532a58b312SMartin Matuska 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
9542a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
9552a58b312SMartin Matuska 	bs->brt_decref_entry_not_in_memory.value.ui64 =
9562a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
9572a58b312SMartin Matuska 	bs->brt_decref_entry_read_lost_race.value.ui64 =
9582a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
9592a58b312SMartin Matuska 	bs->brt_decref_entry_still_referenced.value.ui64 =
9602a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
9612a58b312SMartin Matuska 	bs->brt_decref_free_data_later.value.ui64 =
9622a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
9632a58b312SMartin Matuska 	bs->brt_decref_free_data_now.value.ui64 =
9642a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
9652a58b312SMartin Matuska 	bs->brt_decref_no_entry.value.ui64 =
9662a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_no_entry);
9672a58b312SMartin Matuska 
9682a58b312SMartin Matuska 	return (0);
9692a58b312SMartin Matuska }
9702a58b312SMartin Matuska 
9712a58b312SMartin Matuska static void
9722a58b312SMartin Matuska brt_stat_init(void)
9732a58b312SMartin Matuska {
9742a58b312SMartin Matuska 
9752a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
9762a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
9772a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
9782a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
9792a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
9802a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
9812a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
9822a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
9832a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
9842a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
9852a58b312SMartin Matuska 
9862a58b312SMartin Matuska 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
9872a58b312SMartin Matuska 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
9882a58b312SMartin Matuska 	if (brt_ksp != NULL) {
9892a58b312SMartin Matuska 		brt_ksp->ks_data = &brt_stats;
9902a58b312SMartin Matuska 		brt_ksp->ks_update = brt_kstats_update;
9912a58b312SMartin Matuska 		kstat_install(brt_ksp);
9922a58b312SMartin Matuska 	}
9932a58b312SMartin Matuska }
9942a58b312SMartin Matuska 
9952a58b312SMartin Matuska static void
9962a58b312SMartin Matuska brt_stat_fini(void)
9972a58b312SMartin Matuska {
9982a58b312SMartin Matuska 	if (brt_ksp != NULL) {
9992a58b312SMartin Matuska 		kstat_delete(brt_ksp);
10002a58b312SMartin Matuska 		brt_ksp = NULL;
10012a58b312SMartin Matuska 	}
10022a58b312SMartin Matuska 
10032a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
10042a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
10052a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
10062a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
10072a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
10082a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
10092a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
10102a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
10112a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
10122a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_no_entry);
10132a58b312SMartin Matuska }
10142a58b312SMartin Matuska 
10152a58b312SMartin Matuska void
10162a58b312SMartin Matuska brt_init(void)
10172a58b312SMartin Matuska {
10182a58b312SMartin Matuska 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
10192a58b312SMartin Matuska 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
10202a58b312SMartin Matuska 
10212a58b312SMartin Matuska 	brt_stat_init();
10222a58b312SMartin Matuska }
10232a58b312SMartin Matuska 
10242a58b312SMartin Matuska void
10252a58b312SMartin Matuska brt_fini(void)
10262a58b312SMartin Matuska {
10272a58b312SMartin Matuska 	brt_stat_fini();
10282a58b312SMartin Matuska 
10292a58b312SMartin Matuska 	kmem_cache_destroy(brt_entry_cache);
10302a58b312SMartin Matuska }
10312a58b312SMartin Matuska 
10322a58b312SMartin Matuska /* Return TRUE if block should be freed immediately. */
10332a58b312SMartin Matuska boolean_t
10342a58b312SMartin Matuska brt_entry_decref(spa_t *spa, const blkptr_t *bp)
10352a58b312SMartin Matuska {
10362a58b312SMartin Matuska 	brt_entry_t *bre, *racebre;
10372a58b312SMartin Matuska 	brt_entry_t bre_search;
10382a58b312SMartin Matuska 	avl_index_t where;
10392a58b312SMartin Matuska 	uint64_t vdevid;
10402a58b312SMartin Matuska 	int error;
10412a58b312SMartin Matuska 
10422a58b312SMartin Matuska 	brt_entry_fill(bp, &bre_search, &vdevid);
10432a58b312SMartin Matuska 
1044718519f4SMartin Matuska 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
10452a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
10462a58b312SMartin Matuska 
1047718519f4SMartin Matuska 	rw_enter(&brtvd->bv_lock, RW_WRITER);
1048718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
10492a58b312SMartin Matuska 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
10502a58b312SMartin Matuska 	if (bre != NULL) {
10512a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
10522a58b312SMartin Matuska 		goto out;
10532a58b312SMartin Matuska 	} else {
10542a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
10552a58b312SMartin Matuska 	}
1056718519f4SMartin Matuska 	rw_exit(&brtvd->bv_lock);
10572a58b312SMartin Matuska 
1058718519f4SMartin Matuska 	error = brt_entry_lookup(brtvd, &bre_search);
1059718519f4SMartin Matuska 	/* bre_search now contains correct bre_count */
10602a58b312SMartin Matuska 	if (error == ENOENT) {
1061718519f4SMartin Matuska 		BRTSTAT_BUMP(brt_decref_no_entry);
1062718519f4SMartin Matuska 		return (B_TRUE);
10632a58b312SMartin Matuska 	}
1064718519f4SMartin Matuska 	ASSERT0(error);
10652a58b312SMartin Matuska 
1066718519f4SMartin Matuska 	rw_enter(&brtvd->bv_lock, RW_WRITER);
10672a58b312SMartin Matuska 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
10682a58b312SMartin Matuska 	if (racebre != NULL) {
1069718519f4SMartin Matuska 		/* The entry was added when the lock was dropped. */
10702a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
10712a58b312SMartin Matuska 		bre = racebre;
10722a58b312SMartin Matuska 		goto out;
10732a58b312SMartin Matuska 	}
10742a58b312SMartin Matuska 
10752a58b312SMartin Matuska 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1076718519f4SMartin Matuska 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1077718519f4SMartin Matuska 	bre->bre_bp = bre_search.bre_bp;
1078718519f4SMartin Matuska 	bre->bre_count = bre_search.bre_count;
1079718519f4SMartin Matuska 	bre->bre_pcount = 0;
10802a58b312SMartin Matuska 	avl_insert(&brtvd->bv_tree, bre, where);
10812a58b312SMartin Matuska 
10822a58b312SMartin Matuska out:
1083718519f4SMartin Matuska 	if (bre->bre_count == 0) {
1084718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
10852a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_free_data_now);
10862a58b312SMartin Matuska 		return (B_TRUE);
10872a58b312SMartin Matuska 	}
10882a58b312SMartin Matuska 
1089718519f4SMartin Matuska 	bre->bre_pcount--;
1090718519f4SMartin Matuska 	ASSERT(bre->bre_count > 0);
1091718519f4SMartin Matuska 	bre->bre_count--;
1092718519f4SMartin Matuska 	if (bre->bre_count == 0)
10932a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_free_data_later);
10942a58b312SMartin Matuska 	else
10952a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1096718519f4SMartin Matuska 	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
10972a58b312SMartin Matuska 
1098718519f4SMartin Matuska 	rw_exit(&brtvd->bv_lock);
10992a58b312SMartin Matuska 
11002a58b312SMartin Matuska 	return (B_FALSE);
11012a58b312SMartin Matuska }
11022a58b312SMartin Matuska 
1103315ee00fSMartin Matuska uint64_t
1104315ee00fSMartin Matuska brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1105315ee00fSMartin Matuska {
1106315ee00fSMartin Matuska 	brt_entry_t bre_search, *bre;
1107315ee00fSMartin Matuska 	uint64_t vdevid, refcnt;
1108315ee00fSMartin Matuska 	int error;
1109315ee00fSMartin Matuska 
1110315ee00fSMartin Matuska 	brt_entry_fill(bp, &bre_search, &vdevid);
1111315ee00fSMartin Matuska 
1112718519f4SMartin Matuska 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1113315ee00fSMartin Matuska 	ASSERT(brtvd != NULL);
1114315ee00fSMartin Matuska 
1115718519f4SMartin Matuska 	rw_enter(&brtvd->bv_lock, RW_READER);
1116718519f4SMartin Matuska 	ASSERT(brtvd->bv_initiated);
1117315ee00fSMartin Matuska 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1118315ee00fSMartin Matuska 	if (bre == NULL) {
1119718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
1120718519f4SMartin Matuska 		error = brt_entry_lookup(brtvd, &bre_search);
1121718519f4SMartin Matuska 		if (error == ENOENT) {
1122315ee00fSMartin Matuska 			refcnt = 0;
1123718519f4SMartin Matuska 		} else {
1124718519f4SMartin Matuska 			ASSERT0(error);
1125718519f4SMartin Matuska 			refcnt = bre_search.bre_count;
1126718519f4SMartin Matuska 		}
1127718519f4SMartin Matuska 	} else {
1128718519f4SMartin Matuska 		refcnt = bre->bre_count;
1129718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
1130718519f4SMartin Matuska 	}
1131315ee00fSMartin Matuska 
1132315ee00fSMartin Matuska 	return (refcnt);
1133315ee00fSMartin Matuska }
1134315ee00fSMartin Matuska 
11352a58b312SMartin Matuska static void
1136718519f4SMartin Matuska brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
11372a58b312SMartin Matuska {
1138718519f4SMartin Matuska 	if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
11392a58b312SMartin Matuska 		return;
11402a58b312SMartin Matuska 
1141718519f4SMartin Matuska 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1142718519f4SMartin Matuska 	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1143718519f4SMartin Matuska 	if (brtvd->bv_mos_entries != 0) {
1144718519f4SMartin Matuska 		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1145718519f4SMartin Matuska 		    &off, BRT_KEY_WORDS);
1146718519f4SMartin Matuska 	}
1147718519f4SMartin Matuska 	rw_exit(&brtvd->bv_mos_entries_lock);
11482a58b312SMartin Matuska }
11492a58b312SMartin Matuska 
11502a58b312SMartin Matuska static int
1151718519f4SMartin Matuska brt_entry_compare(const void *x1, const void *x2)
11522a58b312SMartin Matuska {
1153718519f4SMartin Matuska 	const brt_entry_t *bre1 = x1, *bre2 = x2;
1154718519f4SMartin Matuska 	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
11552a58b312SMartin Matuska 
1156718519f4SMartin Matuska 	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1157718519f4SMartin Matuska 	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
11582a58b312SMartin Matuska }
11592a58b312SMartin Matuska 
11602a58b312SMartin Matuska void
11612a58b312SMartin Matuska brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
11622a58b312SMartin Matuska {
1163718519f4SMartin Matuska 	brt_entry_t *bre, *newbre;
11642a58b312SMartin Matuska 	avl_index_t where;
11652a58b312SMartin Matuska 	uint64_t txg;
11662a58b312SMartin Matuska 
11672a58b312SMartin Matuska 	txg = dmu_tx_get_txg(tx);
11682a58b312SMartin Matuska 	ASSERT3U(txg, !=, 0);
11692a58b312SMartin Matuska 
1170718519f4SMartin Matuska 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1171718519f4SMartin Matuska 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1172718519f4SMartin Matuska 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
11732a58b312SMartin Matuska 
1174718519f4SMartin Matuska 	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1175718519f4SMartin Matuska 	newbre->bre_bp = *bp;
1176718519f4SMartin Matuska 	newbre->bre_count = 0;
1177718519f4SMartin Matuska 	newbre->bre_pcount = 1;
11782a58b312SMartin Matuska 
1179718519f4SMartin Matuska 	mutex_enter(&brtvd->bv_pending_lock);
1180718519f4SMartin Matuska 	bre = avl_find(pending_tree, newbre, &where);
1181718519f4SMartin Matuska 	if (bre == NULL) {
1182718519f4SMartin Matuska 		avl_insert(pending_tree, newbre, where);
1183718519f4SMartin Matuska 		newbre = NULL;
11842a58b312SMartin Matuska 	} else {
1185718519f4SMartin Matuska 		bre->bre_pcount++;
11862a58b312SMartin Matuska 	}
1187718519f4SMartin Matuska 	mutex_exit(&brtvd->bv_pending_lock);
11882a58b312SMartin Matuska 
1189718519f4SMartin Matuska 	if (newbre != NULL) {
1190718519f4SMartin Matuska 		ASSERT(bre != NULL);
1191718519f4SMartin Matuska 		ASSERT(bre != newbre);
1192718519f4SMartin Matuska 		kmem_cache_free(brt_entry_cache, newbre);
11932a58b312SMartin Matuska 	} else {
1194718519f4SMartin Matuska 		ASSERT0P(bre);
11952a58b312SMartin Matuska 
1196783d3ff6SMartin Matuska 		/* Prefetch BRT entry for the syncing context. */
1197718519f4SMartin Matuska 		brt_prefetch(brtvd, bp);
11982a58b312SMartin Matuska 	}
1199783d3ff6SMartin Matuska }
12002a58b312SMartin Matuska 
12012a58b312SMartin Matuska void
12022a58b312SMartin Matuska brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
12032a58b312SMartin Matuska {
1204718519f4SMartin Matuska 	brt_entry_t *bre, bre_search;
12052a58b312SMartin Matuska 	uint64_t txg;
12062a58b312SMartin Matuska 
12072a58b312SMartin Matuska 	txg = dmu_tx_get_txg(tx);
12082a58b312SMartin Matuska 	ASSERT3U(txg, !=, 0);
12092a58b312SMartin Matuska 
1210718519f4SMartin Matuska 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1211718519f4SMartin Matuska 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1212718519f4SMartin Matuska 	ASSERT(brtvd != NULL);
1213718519f4SMartin Matuska 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
12142a58b312SMartin Matuska 
1215718519f4SMartin Matuska 	bre_search.bre_bp = *bp;
12162a58b312SMartin Matuska 
1217718519f4SMartin Matuska 	mutex_enter(&brtvd->bv_pending_lock);
1218718519f4SMartin Matuska 	bre = avl_find(pending_tree, &bre_search, NULL);
1219718519f4SMartin Matuska 	ASSERT(bre != NULL);
1220718519f4SMartin Matuska 	ASSERT(bre->bre_pcount > 0);
1221718519f4SMartin Matuska 	bre->bre_pcount--;
1222718519f4SMartin Matuska 	if (bre->bre_pcount == 0)
1223718519f4SMartin Matuska 		avl_remove(pending_tree, bre);
1224718519f4SMartin Matuska 	else
1225718519f4SMartin Matuska 		bre = NULL;
1226718519f4SMartin Matuska 	mutex_exit(&brtvd->bv_pending_lock);
12272a58b312SMartin Matuska 
1228718519f4SMartin Matuska 	if (bre)
1229718519f4SMartin Matuska 		kmem_cache_free(brt_entry_cache, bre);
1230718519f4SMartin Matuska }
1231718519f4SMartin Matuska 
1232718519f4SMartin Matuska static void
1233718519f4SMartin Matuska brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1234718519f4SMartin Matuska {
1235718519f4SMartin Matuska 	brt_entry_t *bre, *nbre;
1236718519f4SMartin Matuska 
1237718519f4SMartin Matuska 	/*
1238718519f4SMartin Matuska 	 * We are in syncing context, so no other bv_pending_tree accesses
1239718519f4SMartin Matuska 	 * are possible for the TXG.  So we don't need bv_pending_lock.
1240718519f4SMartin Matuska 	 */
1241718519f4SMartin Matuska 	ASSERT(avl_is_empty(&brtvd->bv_tree));
1242718519f4SMartin Matuska 	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1243718519f4SMartin Matuska 
1244718519f4SMartin Matuska 	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1245718519f4SMartin Matuska 		nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1246718519f4SMartin Matuska 
1247718519f4SMartin Matuska 		/*
1248718519f4SMartin Matuska 		 * If the block has DEDUP bit set, it means that it
1249718519f4SMartin Matuska 		 * already exists in the DEDUP table, so we can just
1250718519f4SMartin Matuska 		 * use that instead of creating new entry in the BRT.
1251718519f4SMartin Matuska 		 */
1252718519f4SMartin Matuska 		if (BP_GET_DEDUP(&bre->bre_bp)) {
1253718519f4SMartin Matuska 			while (bre->bre_pcount > 0) {
1254718519f4SMartin Matuska 				if (!ddt_addref(spa, &bre->bre_bp))
1255718519f4SMartin Matuska 					break;
1256718519f4SMartin Matuska 				bre->bre_pcount--;
1257718519f4SMartin Matuska 			}
1258718519f4SMartin Matuska 			if (bre->bre_pcount == 0) {
1259718519f4SMartin Matuska 				avl_remove(&brtvd->bv_tree, bre);
1260718519f4SMartin Matuska 				kmem_cache_free(brt_entry_cache, bre);
1261718519f4SMartin Matuska 				continue;
12622a58b312SMartin Matuska 			}
12632a58b312SMartin Matuska 		}
12642a58b312SMartin Matuska 
1265718519f4SMartin Matuska 		/*
1266718519f4SMartin Matuska 		 * Unless we know that the block is definitely not in ZAP,
1267718519f4SMartin Matuska 		 * try to get its reference count from there.
1268718519f4SMartin Matuska 		 */
1269718519f4SMartin Matuska 		uint64_t off = BRE_OFFSET(bre);
1270718519f4SMartin Matuska 		if (brtvd->bv_mos_entries != 0 &&
1271718519f4SMartin Matuska 		    brt_vdev_lookup(spa, brtvd, off)) {
1272718519f4SMartin Matuska 			int error = zap_lookup_uint64_by_dnode(
1273718519f4SMartin Matuska 			    brtvd->bv_mos_entries_dnode, &off,
1274718519f4SMartin Matuska 			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1275718519f4SMartin Matuska 			    &bre->bre_count);
1276718519f4SMartin Matuska 			if (error == 0) {
1277718519f4SMartin Matuska 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
1278718519f4SMartin Matuska 			} else {
1279718519f4SMartin Matuska 				ASSERT3U(error, ==, ENOENT);
1280718519f4SMartin Matuska 				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1281718519f4SMartin Matuska 			}
1282718519f4SMartin Matuska 		}
1283718519f4SMartin Matuska 	}
1284718519f4SMartin Matuska 
1285718519f4SMartin Matuska 	/*
1286718519f4SMartin Matuska 	 * If all the cloned blocks we had were handled by DDT, we don't need
1287718519f4SMartin Matuska 	 * to initiate the vdev.
1288718519f4SMartin Matuska 	 */
1289718519f4SMartin Matuska 	if (avl_is_empty(&brtvd->bv_tree))
1290718519f4SMartin Matuska 		return;
1291718519f4SMartin Matuska 
1292718519f4SMartin Matuska 	if (!brtvd->bv_initiated) {
1293718519f4SMartin Matuska 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1294718519f4SMartin Matuska 		brt_vdev_realloc(spa, brtvd);
1295718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
1296718519f4SMartin Matuska 	}
1297718519f4SMartin Matuska 
1298718519f4SMartin Matuska 	/*
1299718519f4SMartin Matuska 	 * Convert pending references into proper ones.  This has to be a
1300718519f4SMartin Matuska 	 * separate loop, since entcount modifications would cause false
1301718519f4SMartin Matuska 	 * positives for brt_vdev_lookup() on following iterations.
1302718519f4SMartin Matuska 	 */
1303718519f4SMartin Matuska 	for (bre = avl_first(&brtvd->bv_tree); bre;
1304718519f4SMartin Matuska 	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1305718519f4SMartin Matuska 		brt_vdev_addref(spa, brtvd, bre,
1306718519f4SMartin Matuska 		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1307718519f4SMartin Matuska 		bre->bre_count += bre->bre_pcount;
1308718519f4SMartin Matuska 	}
13092a58b312SMartin Matuska }
13102a58b312SMartin Matuska 
13112a58b312SMartin Matuska void
13122a58b312SMartin Matuska brt_pending_apply(spa_t *spa, uint64_t txg)
13132a58b312SMartin Matuska {
13142a58b312SMartin Matuska 
1315718519f4SMartin Matuska 	brt_rlock(spa);
1316718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1317718519f4SMartin Matuska 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1318718519f4SMartin Matuska 		brt_unlock(spa);
13192a58b312SMartin Matuska 
1320718519f4SMartin Matuska 		brt_pending_apply_vdev(spa, brtvd, txg);
13212a58b312SMartin Matuska 
1322718519f4SMartin Matuska 		brt_rlock(spa);
13232a58b312SMartin Matuska 	}
1324718519f4SMartin Matuska 	brt_unlock(spa);
13252a58b312SMartin Matuska }
13262a58b312SMartin Matuska 
13272a58b312SMartin Matuska static void
1328783d3ff6SMartin Matuska brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
13292a58b312SMartin Matuska {
1330718519f4SMartin Matuska 	uint64_t off = BRE_OFFSET(bre);
1331718519f4SMartin Matuska 
1332718519f4SMartin Matuska 	if (bre->bre_pcount == 0) {
1333718519f4SMartin Matuska 		/* The net change is zero, nothing to do in ZAP. */
1334718519f4SMartin Matuska 	} else if (bre->bre_count == 0) {
1335718519f4SMartin Matuska 		int error = zap_remove_uint64_by_dnode(dn, &off,
1336783d3ff6SMartin Matuska 		    BRT_KEY_WORDS, tx);
1337783d3ff6SMartin Matuska 		VERIFY(error == 0 || error == ENOENT);
13382a58b312SMartin Matuska 	} else {
1339718519f4SMartin Matuska 		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1340718519f4SMartin Matuska 		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1341718519f4SMartin Matuska 		    &bre->bre_count, tx));
13422a58b312SMartin Matuska 	}
13432a58b312SMartin Matuska }
13442a58b312SMartin Matuska 
13452a58b312SMartin Matuska static void
1346718519f4SMartin Matuska brt_sync_table(spa_t *spa, dmu_tx_t *tx)
13472a58b312SMartin Matuska {
13482a58b312SMartin Matuska 	brt_entry_t *bre;
13492a58b312SMartin Matuska 
1350718519f4SMartin Matuska 	brt_rlock(spa);
1351718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1352718519f4SMartin Matuska 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1353718519f4SMartin Matuska 		brt_unlock(spa);
13542a58b312SMartin Matuska 
13552a58b312SMartin Matuska 		if (!brtvd->bv_meta_dirty) {
13562a58b312SMartin Matuska 			ASSERT(!brtvd->bv_entcount_dirty);
13572a58b312SMartin Matuska 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1358718519f4SMartin Matuska 			brt_rlock(spa);
13592a58b312SMartin Matuska 			continue;
13602a58b312SMartin Matuska 		}
13612a58b312SMartin Matuska 
13622a58b312SMartin Matuska 		ASSERT(!brtvd->bv_entcount_dirty ||
13632a58b312SMartin Matuska 		    avl_numnodes(&brtvd->bv_tree) != 0);
13642a58b312SMartin Matuska 
13652a58b312SMartin Matuska 		if (brtvd->bv_mos_brtvdev == 0)
1366718519f4SMartin Matuska 			brt_vdev_create(spa, brtvd, tx);
13672a58b312SMartin Matuska 
1368718519f4SMartin Matuska 		void *c = NULL;
13692a58b312SMartin Matuska 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1370718519f4SMartin Matuska 			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1371718519f4SMartin Matuska 			kmem_cache_free(brt_entry_cache, bre);
13722a58b312SMartin Matuska 		}
13732a58b312SMartin Matuska 
1374718519f4SMartin Matuska #ifdef ZFS_DEBUG
1375718519f4SMartin Matuska 		if (zfs_flags & ZFS_DEBUG_BRT)
1376718519f4SMartin Matuska 			brt_vdev_dump(brtvd);
1377718519f4SMartin Matuska #endif
13782a58b312SMartin Matuska 		if (brtvd->bv_totalcount == 0)
1379718519f4SMartin Matuska 			brt_vdev_destroy(spa, brtvd, tx);
1380718519f4SMartin Matuska 		else
1381718519f4SMartin Matuska 			brt_vdev_sync(spa, brtvd, tx);
1382718519f4SMartin Matuska 		brt_rlock(spa);
13832a58b312SMartin Matuska 	}
1384718519f4SMartin Matuska 	brt_unlock(spa);
13852a58b312SMartin Matuska }
13862a58b312SMartin Matuska 
13872a58b312SMartin Matuska void
13882a58b312SMartin Matuska brt_sync(spa_t *spa, uint64_t txg)
13892a58b312SMartin Matuska {
13902a58b312SMartin Matuska 	dmu_tx_t *tx;
1391718519f4SMartin Matuska 	uint64_t vdevid;
13922a58b312SMartin Matuska 
1393718519f4SMartin Matuska 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
13942a58b312SMartin Matuska 
1395718519f4SMartin Matuska 	brt_rlock(spa);
1396718519f4SMartin Matuska 	for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1397718519f4SMartin Matuska 		if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1398718519f4SMartin Matuska 			break;
1399718519f4SMartin Matuska 	}
1400718519f4SMartin Matuska 	if (vdevid >= spa->spa_brt_nvdevs) {
1401718519f4SMartin Matuska 		brt_unlock(spa);
14022a58b312SMartin Matuska 		return;
14032a58b312SMartin Matuska 	}
1404718519f4SMartin Matuska 	brt_unlock(spa);
14052a58b312SMartin Matuska 
14062a58b312SMartin Matuska 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1407718519f4SMartin Matuska 	brt_sync_table(spa, tx);
14082a58b312SMartin Matuska 	dmu_tx_commit(tx);
14092a58b312SMartin Matuska }
14102a58b312SMartin Matuska 
14112a58b312SMartin Matuska static void
14122a58b312SMartin Matuska brt_alloc(spa_t *spa)
14132a58b312SMartin Matuska {
1414718519f4SMartin Matuska 	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1415718519f4SMartin Matuska 	spa->spa_brt_vdevs = NULL;
1416718519f4SMartin Matuska 	spa->spa_brt_nvdevs = 0;
1417718519f4SMartin Matuska 	spa->spa_brt_rangesize = 0;
14182a58b312SMartin Matuska }
14192a58b312SMartin Matuska 
14202a58b312SMartin Matuska void
14212a58b312SMartin Matuska brt_create(spa_t *spa)
14222a58b312SMartin Matuska {
14232a58b312SMartin Matuska 	brt_alloc(spa);
1424718519f4SMartin Matuska 	spa->spa_brt_rangesize = BRT_RANGESIZE;
14252a58b312SMartin Matuska }
14262a58b312SMartin Matuska 
14272a58b312SMartin Matuska int
14282a58b312SMartin Matuska brt_load(spa_t *spa)
14292a58b312SMartin Matuska {
1430718519f4SMartin Matuska 	int error = 0;
14312a58b312SMartin Matuska 
14322a58b312SMartin Matuska 	brt_alloc(spa);
1433718519f4SMartin Matuska 	brt_wlock(spa);
1434718519f4SMartin Matuska 	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1435718519f4SMartin Matuska 	    vdevid++) {
1436718519f4SMartin Matuska 		char name[64];
1437718519f4SMartin Matuska 		uint64_t mos_brtvdev;
14382a58b312SMartin Matuska 
1439718519f4SMartin Matuska 		/* Look if this vdev had active block cloning. */
1440718519f4SMartin Matuska 		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1441718519f4SMartin Matuska 		    (u_longlong_t)vdevid);
1442718519f4SMartin Matuska 		error = zap_lookup(spa->spa_meta_objset,
1443718519f4SMartin Matuska 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1444718519f4SMartin Matuska 		    &mos_brtvdev);
1445718519f4SMartin Matuska 		if (error == ENOENT) {
1446718519f4SMartin Matuska 			error = 0;
1447718519f4SMartin Matuska 			continue;
1448718519f4SMartin Matuska 		}
1449718519f4SMartin Matuska 		if (error != 0)
1450718519f4SMartin Matuska 			break;
1451718519f4SMartin Matuska 
1452718519f4SMartin Matuska 		/* If it did, then allocate them all and load this one. */
1453718519f4SMartin Matuska 		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1454718519f4SMartin Matuska 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1455718519f4SMartin Matuska 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1456718519f4SMartin Matuska 		brtvd->bv_mos_brtvdev = mos_brtvdev;
1457718519f4SMartin Matuska 		error = brt_vdev_load(spa, brtvd);
1458718519f4SMartin Matuska 		rw_exit(&brtvd->bv_lock);
1459718519f4SMartin Matuska 		if (error != 0)
1460718519f4SMartin Matuska 			break;
1461718519f4SMartin Matuska 	}
1462718519f4SMartin Matuska 
1463718519f4SMartin Matuska 	if (spa->spa_brt_rangesize == 0)
1464718519f4SMartin Matuska 		spa->spa_brt_rangesize = BRT_RANGESIZE;
1465718519f4SMartin Matuska 	brt_unlock(spa);
1466718519f4SMartin Matuska 	return (error);
14672a58b312SMartin Matuska }
14682a58b312SMartin Matuska 
14692a58b312SMartin Matuska void
14702a58b312SMartin Matuska brt_unload(spa_t *spa)
14712a58b312SMartin Matuska {
1472718519f4SMartin Matuska 	if (spa->spa_brt_rangesize == 0)
14732a58b312SMartin Matuska 		return;
1474718519f4SMartin Matuska 	brt_vdevs_free(spa);
1475718519f4SMartin Matuska 	rw_destroy(&spa->spa_brt_lock);
1476718519f4SMartin Matuska 	spa->spa_brt_rangesize = 0;
14772a58b312SMartin Matuska }
14782a58b312SMartin Matuska 
/*
 * BRT tunables registered as module parameters through ZFS_MODULE_PARAM;
 * the string in each declaration is the parameter's description.
 */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT ZAP entries");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
	"BRT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
	"BRT ZAP indirect blockshift");
1485