12a58b312SMartin Matuska /* 22a58b312SMartin Matuska * CDDL HEADER START 32a58b312SMartin Matuska * 42a58b312SMartin Matuska * The contents of this file are subject to the terms of the 52a58b312SMartin Matuska * Common Development and Distribution License (the "License"). 62a58b312SMartin Matuska * You may not use this file except in compliance with the License. 72a58b312SMartin Matuska * 82a58b312SMartin Matuska * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 92a58b312SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 102a58b312SMartin Matuska * See the License for the specific language governing permissions 112a58b312SMartin Matuska * and limitations under the License. 122a58b312SMartin Matuska * 132a58b312SMartin Matuska * When distributing Covered Code, include this CDDL HEADER in each 142a58b312SMartin Matuska * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 152a58b312SMartin Matuska * If applicable, add the following below this CDDL HEADER, with the 162a58b312SMartin Matuska * fields enclosed by brackets "[]" replaced with your own identifying 172a58b312SMartin Matuska * information: Portions Copyright [yyyy] [name of copyright owner] 182a58b312SMartin Matuska * 192a58b312SMartin Matuska * CDDL HEADER END 202a58b312SMartin Matuska */ 212a58b312SMartin Matuska 222a58b312SMartin Matuska /* 232a58b312SMartin Matuska * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek 242a58b312SMartin Matuska */ 252a58b312SMartin Matuska 262a58b312SMartin Matuska #include <sys/zfs_context.h> 272a58b312SMartin Matuska #include <sys/spa.h> 282a58b312SMartin Matuska #include <sys/spa_impl.h> 292a58b312SMartin Matuska #include <sys/zio.h> 302a58b312SMartin Matuska #include <sys/brt.h> 312276e539SMartin Matuska #include <sys/brt_impl.h> 322a58b312SMartin Matuska #include <sys/ddt.h> 332a58b312SMartin Matuska #include <sys/bitmap.h> 342a58b312SMartin Matuska #include <sys/zap.h> 352a58b312SMartin Matuska #include <sys/dmu_tx.h> 
362a58b312SMartin Matuska #include <sys/arc.h> 372a58b312SMartin Matuska #include <sys/dsl_pool.h> 382a58b312SMartin Matuska #include <sys/dsl_scan.h> 392a58b312SMartin Matuska #include <sys/vdev_impl.h> 402a58b312SMartin Matuska #include <sys/kstat.h> 412a58b312SMartin Matuska #include <sys/wmsum.h> 422a58b312SMartin Matuska 432a58b312SMartin Matuska /* 442a58b312SMartin Matuska * Block Cloning design. 452a58b312SMartin Matuska * 462a58b312SMartin Matuska * Block Cloning allows to manually clone a file (or a subset of its blocks) 472a58b312SMartin Matuska * into another (or the same) file by just creating additional references to 482a58b312SMartin Matuska * the data blocks without copying the data itself. Those references are kept 492a58b312SMartin Matuska * in the Block Reference Tables (BRTs). 502a58b312SMartin Matuska * 512a58b312SMartin Matuska * In many ways this is similar to the existing deduplication, but there are 522a58b312SMartin Matuska * some important differences: 532a58b312SMartin Matuska * 542a58b312SMartin Matuska * - Deduplication is automatic and Block Cloning is not - one has to use a 552a58b312SMartin Matuska * dedicated system call(s) to clone the given file/blocks. 562a58b312SMartin Matuska * - Deduplication keeps all data blocks in its table, even those referenced 572a58b312SMartin Matuska * just once. Block Cloning creates an entry in its tables only when there 582a58b312SMartin Matuska * are at least two references to the given data block. If the block was 592a58b312SMartin Matuska * never explicitly cloned or the second to last reference was dropped, 602a58b312SMartin Matuska * there will be neither space nor performance overhead. 612a58b312SMartin Matuska * - Deduplication needs data to work - one needs to pass real data to the 622a58b312SMartin Matuska * write(2) syscall, so hash can be calculated. 
Block Cloning doesn't require 632a58b312SMartin Matuska * data, just block pointers to the data, so it is extremely fast, as we pay 642a58b312SMartin Matuska * neither the cost of reading the data, nor the cost of writing the data - 652a58b312SMartin Matuska * we operate exclusively on metadata. 662a58b312SMartin Matuska * - If the D (dedup) bit is not set in the block pointer, it means that 672a58b312SMartin Matuska * the block is not in the dedup table (DDT) and we won't consult the DDT 682a58b312SMartin Matuska * when we need to free the block. Block Cloning must be consulted on every 692a58b312SMartin Matuska * free, because we cannot modify the source BP (eg. by setting something 702a58b312SMartin Matuska * similar to the D bit), thus we have no hint if the block is in the 712a58b312SMartin Matuska * Block Reference Table (BRT), so we need to look into the BRT. There is 722a58b312SMartin Matuska * an optimization in place that allows us to eliminate the majority of BRT 732a58b312SMartin Matuska * lookups which is described below in the "Minimizing free penalty" section. 742a58b312SMartin Matuska * - The BRT entry is much smaller than the DDT entry - for BRT we only store 752a58b312SMartin Matuska * 64bit offset and 64bit reference counter. 762a58b312SMartin Matuska * - Dedup keys are cryptographic hashes, so two blocks that are close to each 772a58b312SMartin Matuska * other on disk are most likely in totally different parts of the DDT. 782a58b312SMartin Matuska * The BRT entry keys are offsets into a single top-level VDEV, so data blocks 792a58b312SMartin Matuska * from one file should have BRT entries close to each other. 802a58b312SMartin Matuska * - Scrub will only do a single pass over a block that is referenced multiple 812a58b312SMartin Matuska * times in the DDT. 
Unfortunately it is not currently (if at all) possible 822a58b312SMartin Matuska * with Block Cloning and block referenced multiple times will be scrubbed 832a58b312SMartin Matuska * multiple times. The new, sorted scrub should be able to eliminate 842a58b312SMartin Matuska * duplicated reads given enough memory. 852a58b312SMartin Matuska * - Deduplication requires cryptographically strong hash as a checksum or 862a58b312SMartin Matuska * additional data verification. Block Cloning works with any checksum 872a58b312SMartin Matuska * algorithm or even with checksumming disabled. 882a58b312SMartin Matuska * 892a58b312SMartin Matuska * As mentioned above, the BRT entries are much smaller than the DDT entries. 902a58b312SMartin Matuska * To uniquely identify a block we just need its vdev id and offset. We also 912a58b312SMartin Matuska * need to maintain a reference counter. The vdev id will often repeat, as there 922a58b312SMartin Matuska * is a small number of top-level VDEVs and a large number of blocks stored in 932a58b312SMartin Matuska * each VDEV. We take advantage of that to reduce the BRT entry size further by 942a58b312SMartin Matuska * maintaining one BRT for each top-level VDEV, so we can then have only offset 952a58b312SMartin Matuska * and counter as the BRT entry. 962a58b312SMartin Matuska * 972a58b312SMartin Matuska * Minimizing free penalty. 982a58b312SMartin Matuska * 992a58b312SMartin Matuska * Block Cloning allows creating additional references to any existing block. 1002a58b312SMartin Matuska * When we free a block there is no hint in the block pointer whether the block 1012a58b312SMartin Matuska * was cloned or not, so on each free we have to check if there is a 1022a58b312SMartin Matuska * corresponding entry in the BRT or not. If there is, we need to decrease 1032a58b312SMartin Matuska * the reference counter. 
Doing BRT lookup on every free can potentially be 1042a58b312SMartin Matuska * expensive by requiring additional I/Os if the BRT doesn't fit into memory. 1052a58b312SMartin Matuska * This is the main problem with deduplication, so we've learned our lesson and 1062a58b312SMartin Matuska * try not to repeat the same mistake here. How do we do that? We divide each 1072a58b312SMartin Matuska * top-level VDEV into 16MB regions. For each region we maintain a counter that 1082a58b312SMartin Matuska * is a sum of all the BRT entries that have offsets within the region. This 1092a58b312SMartin Matuska * creates the entries count array of 16bit numbers for each top-level VDEV. 1102a58b312SMartin Matuska * The entries count array is always kept in memory and updated on disk in the 1112a58b312SMartin Matuska * same transaction group as the BRT updates to keep everything in-sync. We can 1122a58b312SMartin Matuska * keep the array in memory, because it is very small. With 16MB regions and 1132a58b312SMartin Matuska * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease 1142a58b312SMartin Matuska * the region size even further in the future). Now, when we want to free 1152a58b312SMartin Matuska * a block, we first consult the array. If the counter for the whole region is 1162a58b312SMartin Matuska * zero, there is no need to look for the BRT entry, as there isn't one for 1172a58b312SMartin Matuska * sure. If the counter for the region is greater than zero, only then we will 1182a58b312SMartin Matuska * do a BRT lookup and if an entry is found we will decrease the reference 1192a58b312SMartin Matuska * counter in the BRT entry and in the entry counters array. 1202a58b312SMartin Matuska * 1212a58b312SMartin Matuska * The entry counters array is small, but can potentially be larger for very 1222a58b312SMartin Matuska * large VDEVs or smaller regions. In this case we don't want to rewrite entire 1232a58b312SMartin Matuska * array on every change. 
 * We then divide the array into 32kB blocks and keep
 * a bitmap of dirty blocks within a transaction group. When we sync the
 * transaction group we can update only the parts of the entry counters array
 * that were modified. Note: Keeping track of the dirty parts of the entry
 * counters array is implemented, but updating only parts of the array on disk
 * is not yet implemented - for now we will update the entire array if there
 * was any change.
 *
 * The implementation tries to be economic: if BRT is not used, or no longer
 * used, there will be no entries in the MOS and no additional memory used
 * (e.g. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * what table takes the precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
 * look up the DDT entry instead and increase the counter there. No BRT entry
 * will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different datasets.
 * One use case is recovering files from snapshots. By cloning the files into
 * the dataset we need no additional storage. Without Block Cloning we would
 * need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source file).
 * In that case Block Cloning will only be used briefly, because the BRT entries
 * will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
 * blocks. As of this writing no interface is implemented that allows for block
 * cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
1702a58b312SMartin Matuska * 1712a58b312SMartin Matuska * ssize_t 1722a58b312SMartin Matuska * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, 1732a58b312SMartin Matuska * size_t len, unsigned int flags); 1742a58b312SMartin Matuska * 1752a58b312SMartin Matuska * Even though offsets and length represent bytes, they have to be 176315ee00fSMartin Matuska * block-aligned or we will return an error so the upper layer can 1772a58b312SMartin Matuska * fallback to the generic mechanism that will just copy the data. 1782a58b312SMartin Matuska * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. 1792a58b312SMartin Matuska * This function was implemented based on zfs_write(), but instead of writing 1802a58b312SMartin Matuska * the given data we first read block pointers using the new dmu_read_l0_bps() 1812a58b312SMartin Matuska * function from the source file. Once we have BPs from the source file we call 1822a58b312SMartin Matuska * the dmu_brt_clone() function on the destination file. This function 1832a58b312SMartin Matuska * allocates BPs for us. We iterate over all source BPs. If the given BP is 1842a58b312SMartin Matuska * a hole or an embedded block, we just copy BP as-is. If it points to a real 1852a58b312SMartin Matuska * data we place this BP on a BRT pending list using the brt_pending_add() 1862a58b312SMartin Matuska * function. 1872a58b312SMartin Matuska * 1882a58b312SMartin Matuska * We use this pending list to keep track of all BPs that got new references 1892a58b312SMartin Matuska * within this transaction group. 1902a58b312SMartin Matuska * 1912a58b312SMartin Matuska * Some special cases to consider and how we address them: 1922a58b312SMartin Matuska * - The block we want to clone may have been created within the same 1932a58b312SMartin Matuska * transaction group that we are trying to clone. Such block has no BP 194315ee00fSMartin Matuska * allocated yet, so cannot be immediately cloned. We return EAGAIN. 
1952a58b312SMartin Matuska * - The block we want to clone may have been modified within the same 196315ee00fSMartin Matuska * transaction group. We return EAGAIN. 1972a58b312SMartin Matuska * - A block may be cloned multiple times during one transaction group (that's 1982a58b312SMartin Matuska * why pending list is actually a tree and not an append-only list - this 1992a58b312SMartin Matuska * way we can figure out faster if this block is cloned for the first time 2002a58b312SMartin Matuska * in this txg or consecutive time). 2012a58b312SMartin Matuska * - A block may be cloned and freed within the same transaction group 2022a58b312SMartin Matuska * (see dbuf_undirty()). 2032a58b312SMartin Matuska * - A block may be cloned and within the same transaction group the clone 2042a58b312SMartin Matuska * can be cloned again (see dmu_read_l0_bps()). 2052a58b312SMartin Matuska * - A file might have been deleted, but the caller still has a file descriptor 2062a58b312SMartin Matuska * open to this file and clones it. 2072a58b312SMartin Matuska * 2082a58b312SMartin Matuska * When we free a block we have an additional step in the ZIO pipeline where we 2092a58b312SMartin Matuska * call the zio_brt_free() function. We then call the brt_entry_decref() 2102a58b312SMartin Matuska * that loads the corresponding BRT entry (if one exists) and decreases 2112a58b312SMartin Matuska * reference counter. If this is not the last reference we will stop ZIO 2122a58b312SMartin Matuska * pipeline here. If this is the last reference or the block is not in the 2132a58b312SMartin Matuska * BRT, we continue the pipeline and free the block as usual. 2142a58b312SMartin Matuska * 2152a58b312SMartin Matuska * At the beginning of spa_sync() where there can be no more block cloning, 2162a58b312SMartin Matuska * but before issuing frees we call brt_pending_apply(). 
This function applies 2172a58b312SMartin Matuska * all the new clones to the BRT table - we load BRT entries and update 2182a58b312SMartin Matuska * reference counters. To sync new BRT entries to disk, we use brt_sync() 2192a58b312SMartin Matuska * function. This function will sync all dirty per-top-level-vdev BRTs, 2202a58b312SMartin Matuska * the entry counters arrays, etc. 2212a58b312SMartin Matuska * 2222a58b312SMartin Matuska * Block Cloning and ZIL. 2232a58b312SMartin Matuska * 2242a58b312SMartin Matuska * Every clone operation is divided into chunks (similar to write) and each 2252a58b312SMartin Matuska * chunk is cloned in a separate transaction. The chunk size is determined by 2262a58b312SMartin Matuska * how many BPs we can fit into a single ZIL entry. 2272a58b312SMartin Matuska * Replaying clone operation is different from the regular clone operation, 2282a58b312SMartin Matuska * as when we log clone operations we cannot use the source object - it may 2292a58b312SMartin Matuska * reside on a different dataset, so we log BPs we want to clone. 2302a58b312SMartin Matuska * The ZIL is replayed when we mount the given dataset, not when the pool is 2312a58b312SMartin Matuska * imported. Taking this into account it is possible that the pool is imported 2322a58b312SMartin Matuska * without mounting datasets and the source dataset is destroyed before the 2332a58b312SMartin Matuska * destination dataset is mounted and its ZIL replayed. 2342a58b312SMartin Matuska * To address this situation we leverage zil_claim() mechanism where ZFS will 2352a58b312SMartin Matuska * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE 236525fe93dSMartin Matuska * entries, we will bump reference counters for their BPs in the BRT. Then 237525fe93dSMartin Matuska * on mount and ZIL replay we bump the reference counters once more, while the 238525fe93dSMartin Matuska * first references are dropped during ZIL destroy by zil_free_clone_range(). 
239525fe93dSMartin Matuska * It is possible that after zil_claim() we never mount the destination, so 240525fe93dSMartin Matuska * we never replay its ZIL and just destroy it. In this case the only taken 241525fe93dSMartin Matuska * references will be dropped by zil_free_clone_range(), since the cloning is 242525fe93dSMartin Matuska * not going to ever take place. 2432a58b312SMartin Matuska */ 2442a58b312SMartin Matuska 2452a58b312SMartin Matuska static kmem_cache_t *brt_entry_cache; 2462a58b312SMartin Matuska 2472a58b312SMartin Matuska /* 2482a58b312SMartin Matuska * Enable/disable prefetching of BRT entries that we are going to modify. 2492a58b312SMartin Matuska */ 250783d3ff6SMartin Matuska static int brt_zap_prefetch = 1; 2512a58b312SMartin Matuska 2522a58b312SMartin Matuska #ifdef ZFS_DEBUG 2532a58b312SMartin Matuska #define BRT_DEBUG(...) do { \ 2542a58b312SMartin Matuska if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ 2552a58b312SMartin Matuska __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ 2562a58b312SMartin Matuska } \ 2572a58b312SMartin Matuska } while (0) 2582a58b312SMartin Matuska #else 2592a58b312SMartin Matuska #define BRT_DEBUG(...) 
do { } while (0) 2602a58b312SMartin Matuska #endif 2612a58b312SMartin Matuska 262783d3ff6SMartin Matuska static int brt_zap_default_bs = 12; 263783d3ff6SMartin Matuska static int brt_zap_default_ibs = 12; 2642a58b312SMartin Matuska 2652a58b312SMartin Matuska static kstat_t *brt_ksp; 2662a58b312SMartin Matuska 2672a58b312SMartin Matuska typedef struct brt_stats { 2682a58b312SMartin Matuska kstat_named_t brt_addref_entry_not_on_disk; 2692a58b312SMartin Matuska kstat_named_t brt_addref_entry_on_disk; 2702a58b312SMartin Matuska kstat_named_t brt_decref_entry_in_memory; 2712a58b312SMartin Matuska kstat_named_t brt_decref_entry_loaded_from_disk; 2722a58b312SMartin Matuska kstat_named_t brt_decref_entry_not_in_memory; 2732a58b312SMartin Matuska kstat_named_t brt_decref_entry_read_lost_race; 2742a58b312SMartin Matuska kstat_named_t brt_decref_entry_still_referenced; 2752a58b312SMartin Matuska kstat_named_t brt_decref_free_data_later; 2762a58b312SMartin Matuska kstat_named_t brt_decref_free_data_now; 2772a58b312SMartin Matuska kstat_named_t brt_decref_no_entry; 2782a58b312SMartin Matuska } brt_stats_t; 2792a58b312SMartin Matuska 2802a58b312SMartin Matuska static brt_stats_t brt_stats = { 2812a58b312SMartin Matuska { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 2822a58b312SMartin Matuska { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, 2832a58b312SMartin Matuska { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, 2842a58b312SMartin Matuska { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, 2852a58b312SMartin Matuska { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, 2862a58b312SMartin Matuska { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 2872a58b312SMartin Matuska { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, 2882a58b312SMartin Matuska { "decref_free_data_later", KSTAT_DATA_UINT64 }, 2892a58b312SMartin Matuska { "decref_free_data_now", KSTAT_DATA_UINT64 }, 2902a58b312SMartin Matuska { "decref_no_entry", KSTAT_DATA_UINT64 } 2912a58b312SMartin Matuska 
}; 2922a58b312SMartin Matuska 2932a58b312SMartin Matuska struct { 2942a58b312SMartin Matuska wmsum_t brt_addref_entry_not_on_disk; 2952a58b312SMartin Matuska wmsum_t brt_addref_entry_on_disk; 2962a58b312SMartin Matuska wmsum_t brt_decref_entry_in_memory; 2972a58b312SMartin Matuska wmsum_t brt_decref_entry_loaded_from_disk; 2982a58b312SMartin Matuska wmsum_t brt_decref_entry_not_in_memory; 2992a58b312SMartin Matuska wmsum_t brt_decref_entry_read_lost_race; 3002a58b312SMartin Matuska wmsum_t brt_decref_entry_still_referenced; 3012a58b312SMartin Matuska wmsum_t brt_decref_free_data_later; 3022a58b312SMartin Matuska wmsum_t brt_decref_free_data_now; 3032a58b312SMartin Matuska wmsum_t brt_decref_no_entry; 3042a58b312SMartin Matuska } brt_sums; 3052a58b312SMartin Matuska 3062a58b312SMartin Matuska #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) 3072a58b312SMartin Matuska 3082a58b312SMartin Matuska static int brt_entry_compare(const void *x1, const void *x2); 309718519f4SMartin Matuska static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); 3102a58b312SMartin Matuska 3112a58b312SMartin Matuska static void 312718519f4SMartin Matuska brt_rlock(spa_t *spa) 3132a58b312SMartin Matuska { 314718519f4SMartin Matuska rw_enter(&spa->spa_brt_lock, RW_READER); 3152a58b312SMartin Matuska } 3162a58b312SMartin Matuska 3172a58b312SMartin Matuska static void 318718519f4SMartin Matuska brt_wlock(spa_t *spa) 3192a58b312SMartin Matuska { 320718519f4SMartin Matuska rw_enter(&spa->spa_brt_lock, RW_WRITER); 3212a58b312SMartin Matuska } 3222a58b312SMartin Matuska 3232a58b312SMartin Matuska static void 324718519f4SMartin Matuska brt_unlock(spa_t *spa) 3252a58b312SMartin Matuska { 326718519f4SMartin Matuska rw_exit(&spa->spa_brt_lock); 3272a58b312SMartin Matuska } 3282a58b312SMartin Matuska 3292a58b312SMartin Matuska static uint16_t 3302a58b312SMartin Matuska brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) 3312a58b312SMartin Matuska { 3322a58b312SMartin Matuska 
3332a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 3342a58b312SMartin Matuska 3353494f7c0SMartin Matuska if (unlikely(brtvd->bv_need_byteswap)) { 3362a58b312SMartin Matuska return (BSWAP_16(brtvd->bv_entcount[idx])); 3372a58b312SMartin Matuska } else { 3382a58b312SMartin Matuska return (brtvd->bv_entcount[idx]); 3392a58b312SMartin Matuska } 3402a58b312SMartin Matuska } 3412a58b312SMartin Matuska 3422a58b312SMartin Matuska static void 3432a58b312SMartin Matuska brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) 3442a58b312SMartin Matuska { 3452a58b312SMartin Matuska 3462a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 3472a58b312SMartin Matuska 3483494f7c0SMartin Matuska if (unlikely(brtvd->bv_need_byteswap)) { 3492a58b312SMartin Matuska brtvd->bv_entcount[idx] = BSWAP_16(entcnt); 3502a58b312SMartin Matuska } else { 3512a58b312SMartin Matuska brtvd->bv_entcount[idx] = entcnt; 3522a58b312SMartin Matuska } 3532a58b312SMartin Matuska } 3542a58b312SMartin Matuska 3552a58b312SMartin Matuska static void 3562a58b312SMartin Matuska brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) 3572a58b312SMartin Matuska { 3582a58b312SMartin Matuska uint16_t entcnt; 3592a58b312SMartin Matuska 3602a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 3612a58b312SMartin Matuska 3622a58b312SMartin Matuska entcnt = brt_vdev_entcount_get(brtvd, idx); 3632a58b312SMartin Matuska ASSERT(entcnt < UINT16_MAX); 3642a58b312SMartin Matuska 3652a58b312SMartin Matuska brt_vdev_entcount_set(brtvd, idx, entcnt + 1); 3662a58b312SMartin Matuska } 3672a58b312SMartin Matuska 3682a58b312SMartin Matuska static void 3692a58b312SMartin Matuska brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) 3702a58b312SMartin Matuska { 3712a58b312SMartin Matuska uint16_t entcnt; 3722a58b312SMartin Matuska 3732a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 3742a58b312SMartin Matuska 3752a58b312SMartin Matuska entcnt = brt_vdev_entcount_get(brtvd, idx); 
3762a58b312SMartin Matuska ASSERT(entcnt > 0); 3772a58b312SMartin Matuska 3782a58b312SMartin Matuska brt_vdev_entcount_set(brtvd, idx, entcnt - 1); 3792a58b312SMartin Matuska } 3802a58b312SMartin Matuska 3812a58b312SMartin Matuska #ifdef ZFS_DEBUG 3822a58b312SMartin Matuska static void 3833494f7c0SMartin Matuska brt_vdev_dump(brt_vdev_t *brtvd) 3842a58b312SMartin Matuska { 3852a58b312SMartin Matuska uint64_t idx; 3862a58b312SMartin Matuska 387718519f4SMartin Matuska uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 3883494f7c0SMartin Matuska zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " 389718519f4SMartin Matuska "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", 3903494f7c0SMartin Matuska (u_longlong_t)brtvd->bv_vdevid, 3912a58b312SMartin Matuska brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, 3922a58b312SMartin Matuska (u_longlong_t)brtvd->bv_size, 3932a58b312SMartin Matuska (u_longlong_t)brtvd->bv_totalcount, 394718519f4SMartin Matuska (u_longlong_t)nblocks, 395718519f4SMartin Matuska (size_t)BT_SIZEOFMAP(nblocks)); 3962a58b312SMartin Matuska if (brtvd->bv_totalcount > 0) { 3972a58b312SMartin Matuska zfs_dbgmsg(" entcounts:"); 3982a58b312SMartin Matuska for (idx = 0; idx < brtvd->bv_size; idx++) { 3993494f7c0SMartin Matuska uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); 4003494f7c0SMartin Matuska if (entcnt > 0) { 4012a58b312SMartin Matuska zfs_dbgmsg(" [%04llu] %hu", 4023494f7c0SMartin Matuska (u_longlong_t)idx, entcnt); 4032a58b312SMartin Matuska } 4042a58b312SMartin Matuska } 4052a58b312SMartin Matuska } 4062a58b312SMartin Matuska if (brtvd->bv_entcount_dirty) { 4072a58b312SMartin Matuska char *bitmap; 4082a58b312SMartin Matuska 409718519f4SMartin Matuska bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); 410718519f4SMartin Matuska for (idx = 0; idx < nblocks; idx++) { 4112a58b312SMartin Matuska bitmap[idx] = 4122a58b312SMartin Matuska BT_TEST(brtvd->bv_bitmap, idx) ? 
'x' : '.'; 4132a58b312SMartin Matuska } 4142a58b312SMartin Matuska bitmap[idx] = '\0'; 4153494f7c0SMartin Matuska zfs_dbgmsg(" dirty: %s", bitmap); 416718519f4SMartin Matuska kmem_free(bitmap, nblocks + 1); 4172a58b312SMartin Matuska } 4182a58b312SMartin Matuska } 4192a58b312SMartin Matuska #endif 4202a58b312SMartin Matuska 4212a58b312SMartin Matuska static brt_vdev_t * 422718519f4SMartin Matuska brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) 4232a58b312SMartin Matuska { 424718519f4SMartin Matuska brt_vdev_t *brtvd = NULL; 4252a58b312SMartin Matuska 426718519f4SMartin Matuska brt_rlock(spa); 427718519f4SMartin Matuska if (vdevid < spa->spa_brt_nvdevs) { 428718519f4SMartin Matuska brtvd = spa->spa_brt_vdevs[vdevid]; 429718519f4SMartin Matuska } else if (alloc) { 430718519f4SMartin Matuska /* New VDEV was added. */ 431718519f4SMartin Matuska brt_unlock(spa); 432718519f4SMartin Matuska brt_wlock(spa); 433718519f4SMartin Matuska if (vdevid >= spa->spa_brt_nvdevs) 434718519f4SMartin Matuska brt_vdevs_expand(spa, vdevid + 1); 435718519f4SMartin Matuska brtvd = spa->spa_brt_vdevs[vdevid]; 4362a58b312SMartin Matuska } 437718519f4SMartin Matuska brt_unlock(spa); 4382a58b312SMartin Matuska return (brtvd); 4392a58b312SMartin Matuska } 4402a58b312SMartin Matuska 4412a58b312SMartin Matuska static void 442718519f4SMartin Matuska brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 4432a58b312SMartin Matuska { 4442a58b312SMartin Matuska char name[64]; 4452a58b312SMartin Matuska 446718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 4472a58b312SMartin Matuska ASSERT0(brtvd->bv_mos_brtvdev); 4482a58b312SMartin Matuska ASSERT0(brtvd->bv_mos_entries); 4492a58b312SMartin Matuska 450718519f4SMartin Matuska uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, 4512a58b312SMartin Matuska ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, 452783d3ff6SMartin Matuska brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); 453718519f4SMartin 
Matuska VERIFY(mos_entries != 0); 454718519f4SMartin Matuska VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, 455718519f4SMartin Matuska &brtvd->bv_mos_entries_dnode)); 456718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 457718519f4SMartin Matuska brtvd->bv_mos_entries = mos_entries; 458718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock); 4592a58b312SMartin Matuska BRT_DEBUG("MOS entries created, object=%llu", 4602a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_entries); 4612a58b312SMartin Matuska 4622a58b312SMartin Matuska /* 4632a58b312SMartin Matuska * We allocate DMU buffer to store the bv_entcount[] array. 4642a58b312SMartin Matuska * We will keep array size (bv_size) and cummulative count for all 4652a58b312SMartin Matuska * bv_entcount[]s (bv_totalcount) in the bonus buffer. 4662a58b312SMartin Matuska */ 467718519f4SMartin Matuska brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, 4682a58b312SMartin Matuska DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, 4692a58b312SMartin Matuska DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); 4702a58b312SMartin Matuska VERIFY(brtvd->bv_mos_brtvdev != 0); 4712a58b312SMartin Matuska BRT_DEBUG("MOS BRT VDEV created, object=%llu", 4722a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev); 4732a58b312SMartin Matuska 4742a58b312SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 4752a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid); 476718519f4SMartin Matuska VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, 4772a58b312SMartin Matuska sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); 4782a58b312SMartin Matuska BRT_DEBUG("Pool directory object created, object=%s", name); 4792a58b312SMartin Matuska 480718519f4SMartin Matuska spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); 4812a58b312SMartin Matuska } 4822a58b312SMartin Matuska 4832a58b312SMartin Matuska static void 484718519f4SMartin Matuska 
brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) 4852a58b312SMartin Matuska { 4862a58b312SMartin Matuska vdev_t *vd; 4872a58b312SMartin Matuska uint16_t *entcount; 4882a58b312SMartin Matuska ulong_t *bitmap; 489718519f4SMartin Matuska uint64_t nblocks, onblocks, size; 4902a58b312SMartin Matuska 491718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 4922a58b312SMartin Matuska 493718519f4SMartin Matuska spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 494718519f4SMartin Matuska vd = vdev_lookup_top(spa, brtvd->bv_vdevid); 495718519f4SMartin Matuska size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; 496718519f4SMartin Matuska spa_config_exit(spa, SCL_VDEV, FTAG); 4972a58b312SMartin Matuska 498315ee00fSMartin Matuska entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); 4992a58b312SMartin Matuska nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); 5002a58b312SMartin Matuska bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); 5012a58b312SMartin Matuska 5022a58b312SMartin Matuska if (!brtvd->bv_initiated) { 5032a58b312SMartin Matuska ASSERT0(brtvd->bv_size); 504718519f4SMartin Matuska ASSERT0P(brtvd->bv_entcount); 505718519f4SMartin Matuska ASSERT0P(brtvd->bv_bitmap); 5062a58b312SMartin Matuska } else { 5072a58b312SMartin Matuska ASSERT(brtvd->bv_size > 0); 5082a58b312SMartin Matuska ASSERT(brtvd->bv_entcount != NULL); 5092a58b312SMartin Matuska ASSERT(brtvd->bv_bitmap != NULL); 5102a58b312SMartin Matuska /* 5112a58b312SMartin Matuska * TODO: Allow vdev shrinking. We only need to implement 5122a58b312SMartin Matuska * shrinking the on-disk BRT VDEV object. 
513718519f4SMartin Matuska * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 514718519f4SMartin Matuska * offset, size, tx); 5152a58b312SMartin Matuska */ 5162a58b312SMartin Matuska ASSERT3U(brtvd->bv_size, <=, size); 5172a58b312SMartin Matuska 5182a58b312SMartin Matuska memcpy(entcount, brtvd->bv_entcount, 5192a58b312SMartin Matuska sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); 520315ee00fSMartin Matuska vmem_free(brtvd->bv_entcount, 5212a58b312SMartin Matuska sizeof (entcount[0]) * brtvd->bv_size); 522718519f4SMartin Matuska onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 523718519f4SMartin Matuska memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), 524718519f4SMartin Matuska BT_SIZEOFMAP(onblocks))); 525718519f4SMartin Matuska kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); 5262a58b312SMartin Matuska } 5272a58b312SMartin Matuska 5282a58b312SMartin Matuska brtvd->bv_size = size; 5292a58b312SMartin Matuska brtvd->bv_entcount = entcount; 5302a58b312SMartin Matuska brtvd->bv_bitmap = bitmap; 5312a58b312SMartin Matuska if (!brtvd->bv_initiated) { 5322a58b312SMartin Matuska brtvd->bv_need_byteswap = FALSE; 5332a58b312SMartin Matuska brtvd->bv_initiated = TRUE; 5342a58b312SMartin Matuska BRT_DEBUG("BRT VDEV %llu initiated.", 5352a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid); 5362a58b312SMartin Matuska } 5372a58b312SMartin Matuska } 5382a58b312SMartin Matuska 539718519f4SMartin Matuska static int 540718519f4SMartin Matuska brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) 5412a58b312SMartin Matuska { 5422a58b312SMartin Matuska dmu_buf_t *db; 5432a58b312SMartin Matuska brt_vdev_phys_t *bvphys; 5442a58b312SMartin Matuska int error; 5452a58b312SMartin Matuska 546718519f4SMartin Matuska ASSERT(!brtvd->bv_initiated); 5472a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0); 5482a58b312SMartin Matuska 549718519f4SMartin Matuska error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 550718519f4SMartin Matuska FTAG, &db); 
5512a58b312SMartin Matuska if (error != 0) 552718519f4SMartin Matuska return (error); 5532a58b312SMartin Matuska 5542a58b312SMartin Matuska bvphys = db->db_data; 555718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0) { 556718519f4SMartin Matuska spa->spa_brt_rangesize = bvphys->bvp_rangesize; 5572a58b312SMartin Matuska } else { 558718519f4SMartin Matuska ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); 5592a58b312SMartin Matuska } 5602a58b312SMartin Matuska 561718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd); 5622a58b312SMartin Matuska 5632a58b312SMartin Matuska /* TODO: We don't support VDEV shrinking. */ 5642a58b312SMartin Matuska ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); 5652a58b312SMartin Matuska 5662a58b312SMartin Matuska /* 5672a58b312SMartin Matuska * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 5682a58b312SMartin Matuska */ 569718519f4SMartin Matuska error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, 5702a58b312SMartin Matuska MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), 5712a58b312SMartin Matuska brtvd->bv_entcount, DMU_READ_NO_PREFETCH); 572718519f4SMartin Matuska if (error != 0) 573718519f4SMartin Matuska return (error); 5742a58b312SMartin Matuska 575718519f4SMartin Matuska ASSERT(bvphys->bvp_mos_entries != 0); 576718519f4SMartin Matuska VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, 577718519f4SMartin Matuska &brtvd->bv_mos_entries_dnode)); 578718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 5792a58b312SMartin Matuska brtvd->bv_mos_entries = bvphys->bvp_mos_entries; 580718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock); 5812a58b312SMartin Matuska brtvd->bv_need_byteswap = 5822a58b312SMartin Matuska (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); 5832a58b312SMartin Matuska brtvd->bv_totalcount = bvphys->bvp_totalcount; 5842a58b312SMartin Matuska brtvd->bv_usedspace = bvphys->bvp_usedspace; 5852a58b312SMartin Matuska 
brtvd->bv_savedspace = bvphys->bvp_savedspace; 5862a58b312SMartin Matuska 5872a58b312SMartin Matuska dmu_buf_rele(db, FTAG); 5882a58b312SMartin Matuska 589718519f4SMartin Matuska BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", 590718519f4SMartin Matuska (u_longlong_t)brtvd->bv_vdevid, 591718519f4SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev, 5922a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_entries); 593718519f4SMartin Matuska return (0); 5942a58b312SMartin Matuska } 5952a58b312SMartin Matuska 5962a58b312SMartin Matuska static void 597718519f4SMartin Matuska brt_vdev_dealloc(brt_vdev_t *brtvd) 5982a58b312SMartin Matuska { 599718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 6002a58b312SMartin Matuska ASSERT(brtvd->bv_initiated); 601718519f4SMartin Matuska ASSERT0(avl_numnodes(&brtvd->bv_tree)); 6022a58b312SMartin Matuska 603315ee00fSMartin Matuska vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); 6042a58b312SMartin Matuska brtvd->bv_entcount = NULL; 605718519f4SMartin Matuska uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 606718519f4SMartin Matuska kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); 6072a58b312SMartin Matuska brtvd->bv_bitmap = NULL; 6082a58b312SMartin Matuska 6092a58b312SMartin Matuska brtvd->bv_size = 0; 6102a58b312SMartin Matuska 6112a58b312SMartin Matuska brtvd->bv_initiated = FALSE; 6122a58b312SMartin Matuska BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); 6132a58b312SMartin Matuska } 6142a58b312SMartin Matuska 6152a58b312SMartin Matuska static void 616718519f4SMartin Matuska brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 6172a58b312SMartin Matuska { 6182a58b312SMartin Matuska char name[64]; 6192a58b312SMartin Matuska uint64_t count; 6202a58b312SMartin Matuska 621718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 6222a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0); 6232a58b312SMartin Matuska 
ASSERT(brtvd->bv_mos_entries != 0); 624718519f4SMartin Matuska ASSERT0(brtvd->bv_totalcount); 625718519f4SMartin Matuska ASSERT0(brtvd->bv_usedspace); 626718519f4SMartin Matuska ASSERT0(brtvd->bv_savedspace); 6272a58b312SMartin Matuska 628718519f4SMartin Matuska uint64_t mos_entries = brtvd->bv_mos_entries; 629718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 6302a58b312SMartin Matuska brtvd->bv_mos_entries = 0; 631718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock); 632718519f4SMartin Matuska dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 633718519f4SMartin Matuska brtvd->bv_mos_entries_dnode = NULL; 634718519f4SMartin Matuska ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); 635718519f4SMartin Matuska ASSERT0(count); 636718519f4SMartin Matuska VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); 637718519f4SMartin Matuska BRT_DEBUG("MOS entries destroyed, object=%llu", 638718519f4SMartin Matuska (u_longlong_t)mos_entries); 6392a58b312SMartin Matuska 640718519f4SMartin Matuska VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 641718519f4SMartin Matuska tx)); 6422a58b312SMartin Matuska BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", 6432a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev); 6442a58b312SMartin Matuska brtvd->bv_mos_brtvdev = 0; 645718519f4SMartin Matuska brtvd->bv_entcount_dirty = FALSE; 6462a58b312SMartin Matuska 6472a58b312SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 6482a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid); 649718519f4SMartin Matuska VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 650718519f4SMartin Matuska name, tx)); 6512a58b312SMartin Matuska BRT_DEBUG("Pool directory object removed, object=%s", name); 6522a58b312SMartin Matuska 653718519f4SMartin Matuska brtvd->bv_meta_dirty = FALSE; 6542a58b312SMartin Matuska 655718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 
656718519f4SMartin Matuska brt_vdev_dealloc(brtvd); 657718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 658718519f4SMartin Matuska 659718519f4SMartin Matuska spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); 6602a58b312SMartin Matuska } 6612a58b312SMartin Matuska 6622a58b312SMartin Matuska static void 663718519f4SMartin Matuska brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) 6642a58b312SMartin Matuska { 665718519f4SMartin Matuska brt_vdev_t **vdevs; 6662a58b312SMartin Matuska 667718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); 668718519f4SMartin Matuska ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); 6692a58b312SMartin Matuska 670718519f4SMartin Matuska if (nvdevs == spa->spa_brt_nvdevs) 671718519f4SMartin Matuska return; 6722a58b312SMartin Matuska 673718519f4SMartin Matuska vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); 674718519f4SMartin Matuska if (spa->spa_brt_nvdevs > 0) { 675718519f4SMartin Matuska ASSERT(spa->spa_brt_vdevs != NULL); 676718519f4SMartin Matuska 677718519f4SMartin Matuska memcpy(vdevs, spa->spa_brt_vdevs, 678718519f4SMartin Matuska sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); 679718519f4SMartin Matuska kmem_free(spa->spa_brt_vdevs, 680718519f4SMartin Matuska sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); 6812a58b312SMartin Matuska } 682718519f4SMartin Matuska spa->spa_brt_vdevs = vdevs; 6832a58b312SMartin Matuska 684718519f4SMartin Matuska for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { 685718519f4SMartin Matuska brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); 686718519f4SMartin Matuska rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); 6872a58b312SMartin Matuska brtvd->bv_vdevid = vdevid; 6882a58b312SMartin Matuska brtvd->bv_initiated = FALSE; 689718519f4SMartin Matuska rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); 690718519f4SMartin Matuska avl_create(&brtvd->bv_tree, brt_entry_compare, 691718519f4SMartin Matuska sizeof (brt_entry_t), 
offsetof(brt_entry_t, bre_node)); 692718519f4SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) { 693718519f4SMartin Matuska avl_create(&brtvd->bv_pending_tree[i], 694718519f4SMartin Matuska brt_entry_compare, sizeof (brt_entry_t), 695718519f4SMartin Matuska offsetof(brt_entry_t, bre_node)); 696718519f4SMartin Matuska } 697718519f4SMartin Matuska mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); 698718519f4SMartin Matuska spa->spa_brt_vdevs[vdevid] = brtvd; 6992a58b312SMartin Matuska } 7002a58b312SMartin Matuska 7012a58b312SMartin Matuska BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", 702718519f4SMartin Matuska (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); 703718519f4SMartin Matuska spa->spa_brt_nvdevs = nvdevs; 7042a58b312SMartin Matuska } 7052a58b312SMartin Matuska 7062a58b312SMartin Matuska static boolean_t 707718519f4SMartin Matuska brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) 7082a58b312SMartin Matuska { 709718519f4SMartin Matuska uint64_t idx = offset / spa->spa_brt_rangesize; 710718519f4SMartin Matuska if (idx < brtvd->bv_size) { 7112a58b312SMartin Matuska /* VDEV wasn't expanded. 
*/ 7122a58b312SMartin Matuska return (brt_vdev_entcount_get(brtvd, idx) > 0); 7132a58b312SMartin Matuska } 7142a58b312SMartin Matuska return (FALSE); 7152a58b312SMartin Matuska } 7162a58b312SMartin Matuska 7172a58b312SMartin Matuska static void 718718519f4SMartin Matuska brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, 719718519f4SMartin Matuska uint64_t dsize, uint64_t count) 7202a58b312SMartin Matuska { 7212a58b312SMartin Matuska uint64_t idx; 7222a58b312SMartin Matuska 723718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 7242a58b312SMartin Matuska 725718519f4SMartin Matuska brtvd->bv_savedspace += dsize * count; 7262a58b312SMartin Matuska brtvd->bv_meta_dirty = TRUE; 7272a58b312SMartin Matuska 728718519f4SMartin Matuska if (bre->bre_count > 0) 7292a58b312SMartin Matuska return; 7302a58b312SMartin Matuska 7312a58b312SMartin Matuska brtvd->bv_usedspace += dsize; 7322a58b312SMartin Matuska 733718519f4SMartin Matuska idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; 7342a58b312SMartin Matuska if (idx >= brtvd->bv_size) { 7352a58b312SMartin Matuska /* VDEV has been expanded. 
*/ 736718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 737718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd); 738718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 7392a58b312SMartin Matuska } 7402a58b312SMartin Matuska 7412a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 7422a58b312SMartin Matuska 7432a58b312SMartin Matuska brtvd->bv_totalcount++; 7442a58b312SMartin Matuska brt_vdev_entcount_inc(brtvd, idx); 7452a58b312SMartin Matuska brtvd->bv_entcount_dirty = TRUE; 7462a58b312SMartin Matuska idx = idx / BRT_BLOCKSIZE / 8; 7472a58b312SMartin Matuska BT_SET(brtvd->bv_bitmap, idx); 7482a58b312SMartin Matuska } 7492a58b312SMartin Matuska 7502a58b312SMartin Matuska static void 751718519f4SMartin Matuska brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, 7522a58b312SMartin Matuska uint64_t dsize) 7532a58b312SMartin Matuska { 7542a58b312SMartin Matuska uint64_t idx; 7552a58b312SMartin Matuska 756718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 757718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 7582a58b312SMartin Matuska 7592a58b312SMartin Matuska brtvd->bv_savedspace -= dsize; 7602a58b312SMartin Matuska brtvd->bv_meta_dirty = TRUE; 7612a58b312SMartin Matuska 762718519f4SMartin Matuska if (bre->bre_count > 0) 7632a58b312SMartin Matuska return; 7642a58b312SMartin Matuska 7652a58b312SMartin Matuska brtvd->bv_usedspace -= dsize; 7662a58b312SMartin Matuska 767718519f4SMartin Matuska idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; 7682a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size); 7692a58b312SMartin Matuska 7702a58b312SMartin Matuska ASSERT(brtvd->bv_totalcount > 0); 7712a58b312SMartin Matuska brtvd->bv_totalcount--; 7722a58b312SMartin Matuska brt_vdev_entcount_dec(brtvd, idx); 7732a58b312SMartin Matuska brtvd->bv_entcount_dirty = TRUE; 7742a58b312SMartin Matuska idx = idx / BRT_BLOCKSIZE / 8; 7752a58b312SMartin Matuska BT_SET(brtvd->bv_bitmap, idx); 7762a58b312SMartin Matuska } 7772a58b312SMartin 
Matuska 7782a58b312SMartin Matuska static void 779718519f4SMartin Matuska brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 7802a58b312SMartin Matuska { 7812a58b312SMartin Matuska dmu_buf_t *db; 7822a58b312SMartin Matuska brt_vdev_phys_t *bvphys; 7832a58b312SMartin Matuska 7842a58b312SMartin Matuska ASSERT(brtvd->bv_meta_dirty); 7852a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0); 7862a58b312SMartin Matuska ASSERT(dmu_tx_is_syncing(tx)); 7872a58b312SMartin Matuska 788718519f4SMartin Matuska VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 789718519f4SMartin Matuska FTAG, &db)); 7902a58b312SMartin Matuska 7912a58b312SMartin Matuska if (brtvd->bv_entcount_dirty) { 7922a58b312SMartin Matuska /* 7932a58b312SMartin Matuska * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 7942a58b312SMartin Matuska */ 795718519f4SMartin Matuska dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, 7962a58b312SMartin Matuska brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), 7972a58b312SMartin Matuska brtvd->bv_entcount, tx); 798718519f4SMartin Matuska uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 799718519f4SMartin Matuska memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); 8002a58b312SMartin Matuska brtvd->bv_entcount_dirty = FALSE; 8012a58b312SMartin Matuska } 8022a58b312SMartin Matuska 8032a58b312SMartin Matuska dmu_buf_will_dirty(db, tx); 8042a58b312SMartin Matuska bvphys = db->db_data; 8052a58b312SMartin Matuska bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 8062a58b312SMartin Matuska bvphys->bvp_size = brtvd->bv_size; 8072a58b312SMartin Matuska if (brtvd->bv_need_byteswap) { 8082a58b312SMartin Matuska bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 8092a58b312SMartin Matuska } else { 8102a58b312SMartin Matuska bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 8112a58b312SMartin Matuska } 8122a58b312SMartin Matuska bvphys->bvp_totalcount = brtvd->bv_totalcount; 813718519f4SMartin Matuska bvphys->bvp_rangesize = 
spa->spa_brt_rangesize; 8142a58b312SMartin Matuska bvphys->bvp_usedspace = brtvd->bv_usedspace; 8152a58b312SMartin Matuska bvphys->bvp_savedspace = brtvd->bv_savedspace; 8162a58b312SMartin Matuska dmu_buf_rele(db, FTAG); 8172a58b312SMartin Matuska 8182a58b312SMartin Matuska brtvd->bv_meta_dirty = FALSE; 8192a58b312SMartin Matuska } 8202a58b312SMartin Matuska 8212a58b312SMartin Matuska static void 822718519f4SMartin Matuska brt_vdevs_free(spa_t *spa) 8232a58b312SMartin Matuska { 824718519f4SMartin Matuska if (spa->spa_brt_vdevs == 0) 825718519f4SMartin Matuska return; 826718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 827718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 828718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 8292a58b312SMartin Matuska if (brtvd->bv_initiated) 830718519f4SMartin Matuska brt_vdev_dealloc(brtvd); 831718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 832718519f4SMartin Matuska rw_destroy(&brtvd->bv_lock); 833718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0) 834718519f4SMartin Matuska dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 835718519f4SMartin Matuska rw_destroy(&brtvd->bv_mos_entries_lock); 836718519f4SMartin Matuska avl_destroy(&brtvd->bv_tree); 837718519f4SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 838718519f4SMartin Matuska avl_destroy(&brtvd->bv_pending_tree[i]); 839718519f4SMartin Matuska mutex_destroy(&brtvd->bv_pending_lock); 840718519f4SMartin Matuska kmem_free(brtvd, sizeof (*brtvd)); 8412a58b312SMartin Matuska } 842718519f4SMartin Matuska kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * 843718519f4SMartin Matuska spa->spa_brt_nvdevs); 8442a58b312SMartin Matuska } 8452a58b312SMartin Matuska 8462a58b312SMartin Matuska static void 8472a58b312SMartin Matuska brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 8482a58b312SMartin Matuska { 8492a58b312SMartin Matuska 850718519f4SMartin Matuska bre->bre_bp = 
*bp; 851718519f4SMartin Matuska bre->bre_count = 0; 852718519f4SMartin Matuska bre->bre_pcount = 0; 8532a58b312SMartin Matuska 8542a58b312SMartin Matuska *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 8552a58b312SMartin Matuska } 8562a58b312SMartin Matuska 8572a58b312SMartin Matuska static int 858718519f4SMartin Matuska brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) 8592a58b312SMartin Matuska { 860718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre); 8612a58b312SMartin Matuska 862*dd215568SMartin Matuska if (brtvd->bv_mos_entries == 0) 863*dd215568SMartin Matuska return (SET_ERROR(ENOENT)); 864*dd215568SMartin Matuska 865718519f4SMartin Matuska return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 866718519f4SMartin Matuska &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); 8672a58b312SMartin Matuska } 8682a58b312SMartin Matuska 8692a58b312SMartin Matuska /* 8702a58b312SMartin Matuska * Return TRUE if we _can_ have BRT entry for this bp. It might be false 8712a58b312SMartin Matuska * positive, but gives us quick answer if we should look into BRT, which 8722a58b312SMartin Matuska * may require reads and thus will be more expensive. 
8732a58b312SMartin Matuska */ 8742a58b312SMartin Matuska boolean_t 8752a58b312SMartin Matuska brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 8762a58b312SMartin Matuska { 8772a58b312SMartin Matuska 878718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0) 879718519f4SMartin Matuska return (B_FALSE); 8802a58b312SMartin Matuska 881718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 882718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 883718519f4SMartin Matuska if (brtvd == NULL || !brtvd->bv_initiated) 884718519f4SMartin Matuska return (FALSE); 8852a58b312SMartin Matuska 886718519f4SMartin Matuska /* 887718519f4SMartin Matuska * We don't need locks here, since bv_entcount pointer must be 888718519f4SMartin Matuska * stable at this point, and we don't care about false positive 889718519f4SMartin Matuska * races here, while false negative should be impossible, since 890718519f4SMartin Matuska * all brt_vdev_addref() have already completed by this point. 
891718519f4SMartin Matuska */ 892718519f4SMartin Matuska uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 893718519f4SMartin Matuska return (brt_vdev_lookup(spa, brtvd, off)); 8942a58b312SMartin Matuska } 8952a58b312SMartin Matuska 8962a58b312SMartin Matuska uint64_t 8972a58b312SMartin Matuska brt_get_dspace(spa_t *spa) 8982a58b312SMartin Matuska { 899718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0) 9002a58b312SMartin Matuska return (0); 9012a58b312SMartin Matuska 902718519f4SMartin Matuska brt_rlock(spa); 903718519f4SMartin Matuska uint64_t s = 0; 904718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 905718519f4SMartin Matuska s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; 906718519f4SMartin Matuska brt_unlock(spa); 907718519f4SMartin Matuska return (s); 9082a58b312SMartin Matuska } 9092a58b312SMartin Matuska 9102a58b312SMartin Matuska uint64_t 9112a58b312SMartin Matuska brt_get_used(spa_t *spa) 9122a58b312SMartin Matuska { 913718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0) 9142a58b312SMartin Matuska return (0); 9152a58b312SMartin Matuska 916718519f4SMartin Matuska brt_rlock(spa); 917718519f4SMartin Matuska uint64_t s = 0; 918718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 919718519f4SMartin Matuska s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; 920718519f4SMartin Matuska brt_unlock(spa); 921718519f4SMartin Matuska return (s); 9222a58b312SMartin Matuska } 9232a58b312SMartin Matuska 9242a58b312SMartin Matuska uint64_t 9252a58b312SMartin Matuska brt_get_saved(spa_t *spa) 9262a58b312SMartin Matuska { 927718519f4SMartin Matuska return (brt_get_dspace(spa)); 9282a58b312SMartin Matuska } 9292a58b312SMartin Matuska 9302a58b312SMartin Matuska uint64_t 9312a58b312SMartin Matuska brt_get_ratio(spa_t *spa) 9322a58b312SMartin Matuska { 933718519f4SMartin Matuska uint64_t used = brt_get_used(spa); 934718519f4SMartin Matuska if (used == 0) 9352a58b312SMartin Matuska return (100); 
936718519f4SMartin Matuska return ((used + brt_get_saved(spa)) * 100 / used); 9372a58b312SMartin Matuska } 9382a58b312SMartin Matuska 9392a58b312SMartin Matuska static int 9402a58b312SMartin Matuska brt_kstats_update(kstat_t *ksp, int rw) 9412a58b312SMartin Matuska { 9422a58b312SMartin Matuska brt_stats_t *bs = ksp->ks_data; 9432a58b312SMartin Matuska 9442a58b312SMartin Matuska if (rw == KSTAT_WRITE) 9452a58b312SMartin Matuska return (EACCES); 9462a58b312SMartin Matuska 9472a58b312SMartin Matuska bs->brt_addref_entry_not_on_disk.value.ui64 = 9482a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 9492a58b312SMartin Matuska bs->brt_addref_entry_on_disk.value.ui64 = 9502a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_on_disk); 9512a58b312SMartin Matuska bs->brt_decref_entry_in_memory.value.ui64 = 9522a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_in_memory); 9532a58b312SMartin Matuska bs->brt_decref_entry_loaded_from_disk.value.ui64 = 9542a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 9552a58b312SMartin Matuska bs->brt_decref_entry_not_in_memory.value.ui64 = 9562a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); 9572a58b312SMartin Matuska bs->brt_decref_entry_read_lost_race.value.ui64 = 9582a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 9592a58b312SMartin Matuska bs->brt_decref_entry_still_referenced.value.ui64 = 9602a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 9612a58b312SMartin Matuska bs->brt_decref_free_data_later.value.ui64 = 9622a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_later); 9632a58b312SMartin Matuska bs->brt_decref_free_data_now.value.ui64 = 9642a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_now); 9652a58b312SMartin Matuska bs->brt_decref_no_entry.value.ui64 = 9662a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_no_entry); 
9672a58b312SMartin Matuska 9682a58b312SMartin Matuska return (0); 9692a58b312SMartin Matuska } 9702a58b312SMartin Matuska 9712a58b312SMartin Matuska static void 9722a58b312SMartin Matuska brt_stat_init(void) 9732a58b312SMartin Matuska { 9742a58b312SMartin Matuska 9752a58b312SMartin Matuska wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 9762a58b312SMartin Matuska wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); 9772a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 9782a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 9792a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); 9802a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); 9812a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); 9822a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_free_data_later, 0); 9832a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_free_data_now, 0); 9842a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_no_entry, 0); 9852a58b312SMartin Matuska 9862a58b312SMartin Matuska brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, 9872a58b312SMartin Matuska sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 9882a58b312SMartin Matuska if (brt_ksp != NULL) { 9892a58b312SMartin Matuska brt_ksp->ks_data = &brt_stats; 9902a58b312SMartin Matuska brt_ksp->ks_update = brt_kstats_update; 9912a58b312SMartin Matuska kstat_install(brt_ksp); 9922a58b312SMartin Matuska } 9932a58b312SMartin Matuska } 9942a58b312SMartin Matuska 9952a58b312SMartin Matuska static void 9962a58b312SMartin Matuska brt_stat_fini(void) 9972a58b312SMartin Matuska { 9982a58b312SMartin Matuska if (brt_ksp != NULL) { 9992a58b312SMartin Matuska kstat_delete(brt_ksp); 10002a58b312SMartin Matuska brt_ksp = NULL; 10012a58b312SMartin Matuska } 10022a58b312SMartin Matuska 10032a58b312SMartin Matuska 
wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); 10042a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_addref_entry_on_disk); 10052a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_in_memory); 10062a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); 10072a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); 10082a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); 10092a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); 10102a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_free_data_later); 10112a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_free_data_now); 10122a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_no_entry); 10132a58b312SMartin Matuska } 10142a58b312SMartin Matuska 10152a58b312SMartin Matuska void 10162a58b312SMartin Matuska brt_init(void) 10172a58b312SMartin Matuska { 10182a58b312SMartin Matuska brt_entry_cache = kmem_cache_create("brt_entry_cache", 10192a58b312SMartin Matuska sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 10202a58b312SMartin Matuska 10212a58b312SMartin Matuska brt_stat_init(); 10222a58b312SMartin Matuska } 10232a58b312SMartin Matuska 10242a58b312SMartin Matuska void 10252a58b312SMartin Matuska brt_fini(void) 10262a58b312SMartin Matuska { 10272a58b312SMartin Matuska brt_stat_fini(); 10282a58b312SMartin Matuska 10292a58b312SMartin Matuska kmem_cache_destroy(brt_entry_cache); 10302a58b312SMartin Matuska } 10312a58b312SMartin Matuska 10322a58b312SMartin Matuska /* Return TRUE if block should be freed immediately. 
*/ 10332a58b312SMartin Matuska boolean_t 10342a58b312SMartin Matuska brt_entry_decref(spa_t *spa, const blkptr_t *bp) 10352a58b312SMartin Matuska { 10362a58b312SMartin Matuska brt_entry_t *bre, *racebre; 10372a58b312SMartin Matuska brt_entry_t bre_search; 10382a58b312SMartin Matuska avl_index_t where; 10392a58b312SMartin Matuska uint64_t vdevid; 10402a58b312SMartin Matuska int error; 10412a58b312SMartin Matuska 10422a58b312SMartin Matuska brt_entry_fill(bp, &bre_search, &vdevid); 10432a58b312SMartin Matuska 1044718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 10452a58b312SMartin Matuska ASSERT(brtvd != NULL); 10462a58b312SMartin Matuska 1047718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 1048718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 10492a58b312SMartin Matuska bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 10502a58b312SMartin Matuska if (bre != NULL) { 10512a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_in_memory); 10522a58b312SMartin Matuska goto out; 10532a58b312SMartin Matuska } else { 10542a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 10552a58b312SMartin Matuska } 1056718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 10572a58b312SMartin Matuska 1058718519f4SMartin Matuska error = brt_entry_lookup(brtvd, &bre_search); 1059718519f4SMartin Matuska /* bre_search now contains correct bre_count */ 10602a58b312SMartin Matuska if (error == ENOENT) { 1061718519f4SMartin Matuska BRTSTAT_BUMP(brt_decref_no_entry); 1062718519f4SMartin Matuska return (B_TRUE); 10632a58b312SMartin Matuska } 1064718519f4SMartin Matuska ASSERT0(error); 10652a58b312SMartin Matuska 1066718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 10672a58b312SMartin Matuska racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 10682a58b312SMartin Matuska if (racebre != NULL) { 1069718519f4SMartin Matuska /* The entry was added when the lock was dropped. 
*/ 10702a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 10712a58b312SMartin Matuska bre = racebre; 10722a58b312SMartin Matuska goto out; 10732a58b312SMartin Matuska } 10742a58b312SMartin Matuska 10752a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); 1076718519f4SMartin Matuska bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1077718519f4SMartin Matuska bre->bre_bp = bre_search.bre_bp; 1078718519f4SMartin Matuska bre->bre_count = bre_search.bre_count; 1079718519f4SMartin Matuska bre->bre_pcount = 0; 10802a58b312SMartin Matuska avl_insert(&brtvd->bv_tree, bre, where); 10812a58b312SMartin Matuska 10822a58b312SMartin Matuska out: 1083718519f4SMartin Matuska if (bre->bre_count == 0) { 1084718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 10852a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_free_data_now); 10862a58b312SMartin Matuska return (B_TRUE); 10872a58b312SMartin Matuska } 10882a58b312SMartin Matuska 1089718519f4SMartin Matuska bre->bre_pcount--; 1090718519f4SMartin Matuska ASSERT(bre->bre_count > 0); 1091718519f4SMartin Matuska bre->bre_count--; 1092718519f4SMartin Matuska if (bre->bre_count == 0) 10932a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_free_data_later); 10942a58b312SMartin Matuska else 10952a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_still_referenced); 1096718519f4SMartin Matuska brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); 10972a58b312SMartin Matuska 1098718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 10992a58b312SMartin Matuska 11002a58b312SMartin Matuska return (B_FALSE); 11012a58b312SMartin Matuska } 11022a58b312SMartin Matuska 1103315ee00fSMartin Matuska uint64_t 1104315ee00fSMartin Matuska brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) 1105315ee00fSMartin Matuska { 1106315ee00fSMartin Matuska brt_entry_t bre_search, *bre; 1107315ee00fSMartin Matuska uint64_t vdevid, refcnt; 1108315ee00fSMartin Matuska int error; 1109315ee00fSMartin Matuska 1110315ee00fSMartin 
Matuska brt_entry_fill(bp, &bre_search, &vdevid); 1111315ee00fSMartin Matuska 1112718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1113315ee00fSMartin Matuska ASSERT(brtvd != NULL); 1114315ee00fSMartin Matuska 1115718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_READER); 1116718519f4SMartin Matuska ASSERT(brtvd->bv_initiated); 1117315ee00fSMartin Matuska bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1118315ee00fSMartin Matuska if (bre == NULL) { 1119718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 1120718519f4SMartin Matuska error = brt_entry_lookup(brtvd, &bre_search); 1121718519f4SMartin Matuska if (error == ENOENT) { 1122315ee00fSMartin Matuska refcnt = 0; 1123718519f4SMartin Matuska } else { 1124718519f4SMartin Matuska ASSERT0(error); 1125718519f4SMartin Matuska refcnt = bre_search.bre_count; 1126718519f4SMartin Matuska } 1127718519f4SMartin Matuska } else { 1128718519f4SMartin Matuska refcnt = bre->bre_count; 1129718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 1130718519f4SMartin Matuska } 1131315ee00fSMartin Matuska 1132315ee00fSMartin Matuska return (refcnt); 1133315ee00fSMartin Matuska } 1134315ee00fSMartin Matuska 11352a58b312SMartin Matuska static void 1136718519f4SMartin Matuska brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) 11372a58b312SMartin Matuska { 1138718519f4SMartin Matuska if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) 11392a58b312SMartin Matuska return; 11402a58b312SMartin Matuska 1141718519f4SMartin Matuska uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 1142718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); 1143718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0) { 1144718519f4SMartin Matuska (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 1145718519f4SMartin Matuska &off, BRT_KEY_WORDS); 1146718519f4SMartin Matuska } 1147718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock); 11482a58b312SMartin Matuska } 11492a58b312SMartin Matuska 
11502a58b312SMartin Matuska static int 1151718519f4SMartin Matuska brt_entry_compare(const void *x1, const void *x2) 11522a58b312SMartin Matuska { 1153718519f4SMartin Matuska const brt_entry_t *bre1 = x1, *bre2 = x2; 1154718519f4SMartin Matuska const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; 11552a58b312SMartin Matuska 1156718519f4SMartin Matuska return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1157718519f4SMartin Matuska DVA_GET_OFFSET(&bp2->blk_dva[0]))); 11582a58b312SMartin Matuska } 11592a58b312SMartin Matuska 11602a58b312SMartin Matuska void 11612a58b312SMartin Matuska brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 11622a58b312SMartin Matuska { 1163718519f4SMartin Matuska brt_entry_t *bre, *newbre; 11642a58b312SMartin Matuska avl_index_t where; 11652a58b312SMartin Matuska uint64_t txg; 11662a58b312SMartin Matuska 11672a58b312SMartin Matuska txg = dmu_tx_get_txg(tx); 11682a58b312SMartin Matuska ASSERT3U(txg, !=, 0); 11692a58b312SMartin Matuska 1170718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1171718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); 1172718519f4SMartin Matuska avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; 11732a58b312SMartin Matuska 1174718519f4SMartin Matuska newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1175718519f4SMartin Matuska newbre->bre_bp = *bp; 1176718519f4SMartin Matuska newbre->bre_count = 0; 1177718519f4SMartin Matuska newbre->bre_pcount = 1; 11782a58b312SMartin Matuska 1179718519f4SMartin Matuska mutex_enter(&brtvd->bv_pending_lock); 1180718519f4SMartin Matuska bre = avl_find(pending_tree, newbre, &where); 1181718519f4SMartin Matuska if (bre == NULL) { 1182718519f4SMartin Matuska avl_insert(pending_tree, newbre, where); 1183718519f4SMartin Matuska newbre = NULL; 11842a58b312SMartin Matuska } else { 1185718519f4SMartin Matuska bre->bre_pcount++; 11862a58b312SMartin Matuska } 1187718519f4SMartin Matuska 
mutex_exit(&brtvd->bv_pending_lock); 11882a58b312SMartin Matuska 1189718519f4SMartin Matuska if (newbre != NULL) { 1190718519f4SMartin Matuska ASSERT(bre != NULL); 1191718519f4SMartin Matuska ASSERT(bre != newbre); 1192718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, newbre); 11932a58b312SMartin Matuska } else { 1194718519f4SMartin Matuska ASSERT0P(bre); 11952a58b312SMartin Matuska 1196783d3ff6SMartin Matuska /* Prefetch BRT entry for the syncing context. */ 1197718519f4SMartin Matuska brt_prefetch(brtvd, bp); 11982a58b312SMartin Matuska } 1199783d3ff6SMartin Matuska } 12002a58b312SMartin Matuska 12012a58b312SMartin Matuska void 12022a58b312SMartin Matuska brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 12032a58b312SMartin Matuska { 1204718519f4SMartin Matuska brt_entry_t *bre, bre_search; 12052a58b312SMartin Matuska uint64_t txg; 12062a58b312SMartin Matuska 12072a58b312SMartin Matuska txg = dmu_tx_get_txg(tx); 12082a58b312SMartin Matuska ASSERT3U(txg, !=, 0); 12092a58b312SMartin Matuska 1210718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1211718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1212718519f4SMartin Matuska ASSERT(brtvd != NULL); 1213718519f4SMartin Matuska avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; 12142a58b312SMartin Matuska 1215718519f4SMartin Matuska bre_search.bre_bp = *bp; 12162a58b312SMartin Matuska 1217718519f4SMartin Matuska mutex_enter(&brtvd->bv_pending_lock); 1218718519f4SMartin Matuska bre = avl_find(pending_tree, &bre_search, NULL); 1219718519f4SMartin Matuska ASSERT(bre != NULL); 1220718519f4SMartin Matuska ASSERT(bre->bre_pcount > 0); 1221718519f4SMartin Matuska bre->bre_pcount--; 1222718519f4SMartin Matuska if (bre->bre_pcount == 0) 1223718519f4SMartin Matuska avl_remove(pending_tree, bre); 1224718519f4SMartin Matuska else 1225718519f4SMartin Matuska bre = NULL; 1226718519f4SMartin Matuska mutex_exit(&brtvd->bv_pending_lock); 
12272a58b312SMartin Matuska 1228718519f4SMartin Matuska if (bre) 1229718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre); 1230718519f4SMartin Matuska } 1231718519f4SMartin Matuska 1232718519f4SMartin Matuska static void 1233718519f4SMartin Matuska brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) 1234718519f4SMartin Matuska { 1235718519f4SMartin Matuska brt_entry_t *bre, *nbre; 1236718519f4SMartin Matuska 1237718519f4SMartin Matuska /* 1238718519f4SMartin Matuska * We are in syncing context, so no other bv_pending_tree accesses 1239718519f4SMartin Matuska * are possible for the TXG. So we don't need bv_pending_lock. 1240718519f4SMartin Matuska */ 1241718519f4SMartin Matuska ASSERT(avl_is_empty(&brtvd->bv_tree)); 1242718519f4SMartin Matuska avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); 1243718519f4SMartin Matuska 1244718519f4SMartin Matuska for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { 1245718519f4SMartin Matuska nbre = AVL_NEXT(&brtvd->bv_tree, bre); 1246718519f4SMartin Matuska 1247718519f4SMartin Matuska /* 1248718519f4SMartin Matuska * If the block has DEDUP bit set, it means that it 1249718519f4SMartin Matuska * already exists in the DEDUP table, so we can just 1250718519f4SMartin Matuska * use that instead of creating new entry in the BRT. 
1251718519f4SMartin Matuska */ 1252718519f4SMartin Matuska if (BP_GET_DEDUP(&bre->bre_bp)) { 1253718519f4SMartin Matuska while (bre->bre_pcount > 0) { 1254718519f4SMartin Matuska if (!ddt_addref(spa, &bre->bre_bp)) 1255718519f4SMartin Matuska break; 1256718519f4SMartin Matuska bre->bre_pcount--; 1257718519f4SMartin Matuska } 1258718519f4SMartin Matuska if (bre->bre_pcount == 0) { 1259718519f4SMartin Matuska avl_remove(&brtvd->bv_tree, bre); 1260718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre); 1261718519f4SMartin Matuska continue; 12622a58b312SMartin Matuska } 12632a58b312SMartin Matuska } 12642a58b312SMartin Matuska 1265718519f4SMartin Matuska /* 1266718519f4SMartin Matuska * Unless we know that the block is definitely not in ZAP, 1267718519f4SMartin Matuska * try to get its reference count from there. 1268718519f4SMartin Matuska */ 1269718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre); 1270718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0 && 1271718519f4SMartin Matuska brt_vdev_lookup(spa, brtvd, off)) { 1272718519f4SMartin Matuska int error = zap_lookup_uint64_by_dnode( 1273718519f4SMartin Matuska brtvd->bv_mos_entries_dnode, &off, 1274718519f4SMartin Matuska BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1275718519f4SMartin Matuska &bre->bre_count); 1276718519f4SMartin Matuska if (error == 0) { 1277718519f4SMartin Matuska BRTSTAT_BUMP(brt_addref_entry_on_disk); 1278718519f4SMartin Matuska } else { 1279718519f4SMartin Matuska ASSERT3U(error, ==, ENOENT); 1280718519f4SMartin Matuska BRTSTAT_BUMP(brt_addref_entry_not_on_disk); 1281718519f4SMartin Matuska } 1282718519f4SMartin Matuska } 1283718519f4SMartin Matuska } 1284718519f4SMartin Matuska 1285718519f4SMartin Matuska /* 1286718519f4SMartin Matuska * If all the cloned blocks we had were handled by DDT, we don't need 1287718519f4SMartin Matuska * to initiate the vdev. 
1288718519f4SMartin Matuska */ 1289718519f4SMartin Matuska if (avl_is_empty(&brtvd->bv_tree)) 1290718519f4SMartin Matuska return; 1291718519f4SMartin Matuska 1292718519f4SMartin Matuska if (!brtvd->bv_initiated) { 1293718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 1294718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd); 1295718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 1296718519f4SMartin Matuska } 1297718519f4SMartin Matuska 1298718519f4SMartin Matuska /* 1299718519f4SMartin Matuska * Convert pending references into proper ones. This has to be a 1300718519f4SMartin Matuska * separate loop, since entcount modifications would cause false 1301718519f4SMartin Matuska * positives for brt_vdev_lookup() on following iterations. 1302718519f4SMartin Matuska */ 1303718519f4SMartin Matuska for (bre = avl_first(&brtvd->bv_tree); bre; 1304718519f4SMartin Matuska bre = AVL_NEXT(&brtvd->bv_tree, bre)) { 1305718519f4SMartin Matuska brt_vdev_addref(spa, brtvd, bre, 1306718519f4SMartin Matuska bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); 1307718519f4SMartin Matuska bre->bre_count += bre->bre_pcount; 1308718519f4SMartin Matuska } 13092a58b312SMartin Matuska } 13102a58b312SMartin Matuska 13112a58b312SMartin Matuska void 13122a58b312SMartin Matuska brt_pending_apply(spa_t *spa, uint64_t txg) 13132a58b312SMartin Matuska { 13142a58b312SMartin Matuska 1315718519f4SMartin Matuska brt_rlock(spa); 1316718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1317718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1318718519f4SMartin Matuska brt_unlock(spa); 13192a58b312SMartin Matuska 1320718519f4SMartin Matuska brt_pending_apply_vdev(spa, brtvd, txg); 13212a58b312SMartin Matuska 1322718519f4SMartin Matuska brt_rlock(spa); 13232a58b312SMartin Matuska } 1324718519f4SMartin Matuska brt_unlock(spa); 13252a58b312SMartin Matuska } 13262a58b312SMartin Matuska 13272a58b312SMartin Matuska static void 1328783d3ff6SMartin 
Matuska brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) 13292a58b312SMartin Matuska { 1330718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre); 1331718519f4SMartin Matuska 1332718519f4SMartin Matuska if (bre->bre_pcount == 0) { 1333718519f4SMartin Matuska /* The net change is zero, nothing to do in ZAP. */ 1334718519f4SMartin Matuska } else if (bre->bre_count == 0) { 1335718519f4SMartin Matuska int error = zap_remove_uint64_by_dnode(dn, &off, 1336783d3ff6SMartin Matuska BRT_KEY_WORDS, tx); 1337783d3ff6SMartin Matuska VERIFY(error == 0 || error == ENOENT); 13382a58b312SMartin Matuska } else { 1339718519f4SMartin Matuska VERIFY0(zap_update_uint64_by_dnode(dn, &off, 1340718519f4SMartin Matuska BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1341718519f4SMartin Matuska &bre->bre_count, tx)); 13422a58b312SMartin Matuska } 13432a58b312SMartin Matuska } 13442a58b312SMartin Matuska 13452a58b312SMartin Matuska static void 1346718519f4SMartin Matuska brt_sync_table(spa_t *spa, dmu_tx_t *tx) 13472a58b312SMartin Matuska { 13482a58b312SMartin Matuska brt_entry_t *bre; 13492a58b312SMartin Matuska 1350718519f4SMartin Matuska brt_rlock(spa); 1351718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1352718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1353718519f4SMartin Matuska brt_unlock(spa); 13542a58b312SMartin Matuska 13552a58b312SMartin Matuska if (!brtvd->bv_meta_dirty) { 13562a58b312SMartin Matuska ASSERT(!brtvd->bv_entcount_dirty); 13572a58b312SMartin Matuska ASSERT0(avl_numnodes(&brtvd->bv_tree)); 1358718519f4SMartin Matuska brt_rlock(spa); 13592a58b312SMartin Matuska continue; 13602a58b312SMartin Matuska } 13612a58b312SMartin Matuska 13622a58b312SMartin Matuska ASSERT(!brtvd->bv_entcount_dirty || 13632a58b312SMartin Matuska avl_numnodes(&brtvd->bv_tree) != 0); 13642a58b312SMartin Matuska 13652a58b312SMartin Matuska if (brtvd->bv_mos_brtvdev == 0) 1366718519f4SMartin Matuska brt_vdev_create(spa, brtvd, 
tx); 13672a58b312SMartin Matuska 1368718519f4SMartin Matuska void *c = NULL; 13692a58b312SMartin Matuska while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { 1370718519f4SMartin Matuska brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); 1371718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre); 13722a58b312SMartin Matuska } 13732a58b312SMartin Matuska 1374718519f4SMartin Matuska #ifdef ZFS_DEBUG 1375718519f4SMartin Matuska if (zfs_flags & ZFS_DEBUG_BRT) 1376718519f4SMartin Matuska brt_vdev_dump(brtvd); 1377718519f4SMartin Matuska #endif 13782a58b312SMartin Matuska if (brtvd->bv_totalcount == 0) 1379718519f4SMartin Matuska brt_vdev_destroy(spa, brtvd, tx); 1380718519f4SMartin Matuska else 1381718519f4SMartin Matuska brt_vdev_sync(spa, brtvd, tx); 1382718519f4SMartin Matuska brt_rlock(spa); 13832a58b312SMartin Matuska } 1384718519f4SMartin Matuska brt_unlock(spa); 13852a58b312SMartin Matuska } 13862a58b312SMartin Matuska 13872a58b312SMartin Matuska void 13882a58b312SMartin Matuska brt_sync(spa_t *spa, uint64_t txg) 13892a58b312SMartin Matuska { 13902a58b312SMartin Matuska dmu_tx_t *tx; 1391718519f4SMartin Matuska uint64_t vdevid; 13922a58b312SMartin Matuska 1393718519f4SMartin Matuska ASSERT3U(spa_syncing_txg(spa), ==, txg); 13942a58b312SMartin Matuska 1395718519f4SMartin Matuska brt_rlock(spa); 1396718519f4SMartin Matuska for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1397718519f4SMartin Matuska if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) 1398718519f4SMartin Matuska break; 1399718519f4SMartin Matuska } 1400718519f4SMartin Matuska if (vdevid >= spa->spa_brt_nvdevs) { 1401718519f4SMartin Matuska brt_unlock(spa); 14022a58b312SMartin Matuska return; 14032a58b312SMartin Matuska } 1404718519f4SMartin Matuska brt_unlock(spa); 14052a58b312SMartin Matuska 14062a58b312SMartin Matuska tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1407718519f4SMartin Matuska brt_sync_table(spa, tx); 14082a58b312SMartin Matuska dmu_tx_commit(tx); 
14092a58b312SMartin Matuska } 14102a58b312SMartin Matuska 14112a58b312SMartin Matuska static void 14122a58b312SMartin Matuska brt_alloc(spa_t *spa) 14132a58b312SMartin Matuska { 1414718519f4SMartin Matuska rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); 1415718519f4SMartin Matuska spa->spa_brt_vdevs = NULL; 1416718519f4SMartin Matuska spa->spa_brt_nvdevs = 0; 1417718519f4SMartin Matuska spa->spa_brt_rangesize = 0; 14182a58b312SMartin Matuska } 14192a58b312SMartin Matuska 14202a58b312SMartin Matuska void 14212a58b312SMartin Matuska brt_create(spa_t *spa) 14222a58b312SMartin Matuska { 14232a58b312SMartin Matuska brt_alloc(spa); 1424718519f4SMartin Matuska spa->spa_brt_rangesize = BRT_RANGESIZE; 14252a58b312SMartin Matuska } 14262a58b312SMartin Matuska 14272a58b312SMartin Matuska int 14282a58b312SMartin Matuska brt_load(spa_t *spa) 14292a58b312SMartin Matuska { 1430718519f4SMartin Matuska int error = 0; 14312a58b312SMartin Matuska 14322a58b312SMartin Matuska brt_alloc(spa); 1433718519f4SMartin Matuska brt_wlock(spa); 1434718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; 1435718519f4SMartin Matuska vdevid++) { 1436718519f4SMartin Matuska char name[64]; 1437718519f4SMartin Matuska uint64_t mos_brtvdev; 14382a58b312SMartin Matuska 1439718519f4SMartin Matuska /* Look if this vdev had active block cloning. 
*/ 1440718519f4SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 1441718519f4SMartin Matuska (u_longlong_t)vdevid); 1442718519f4SMartin Matuska error = zap_lookup(spa->spa_meta_objset, 1443718519f4SMartin Matuska DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, 1444718519f4SMartin Matuska &mos_brtvdev); 1445718519f4SMartin Matuska if (error == ENOENT) { 1446718519f4SMartin Matuska error = 0; 1447718519f4SMartin Matuska continue; 1448718519f4SMartin Matuska } 1449718519f4SMartin Matuska if (error != 0) 1450718519f4SMartin Matuska break; 1451718519f4SMartin Matuska 1452718519f4SMartin Matuska /* If it did, then allocate them all and load this one. */ 1453718519f4SMartin Matuska brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); 1454718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1455718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER); 1456718519f4SMartin Matuska brtvd->bv_mos_brtvdev = mos_brtvdev; 1457718519f4SMartin Matuska error = brt_vdev_load(spa, brtvd); 1458718519f4SMartin Matuska rw_exit(&brtvd->bv_lock); 1459718519f4SMartin Matuska if (error != 0) 1460718519f4SMartin Matuska break; 1461718519f4SMartin Matuska } 1462718519f4SMartin Matuska 1463718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0) 1464718519f4SMartin Matuska spa->spa_brt_rangesize = BRT_RANGESIZE; 1465718519f4SMartin Matuska brt_unlock(spa); 1466718519f4SMartin Matuska return (error); 14672a58b312SMartin Matuska } 14682a58b312SMartin Matuska 14692a58b312SMartin Matuska void 14702a58b312SMartin Matuska brt_unload(spa_t *spa) 14712a58b312SMartin Matuska { 1472718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0) 14732a58b312SMartin Matuska return; 1474718519f4SMartin Matuska brt_vdevs_free(spa); 1475718519f4SMartin Matuska rw_destroy(&spa->spa_brt_lock); 1476718519f4SMartin Matuska spa->spa_brt_rangesize = 0; 14772a58b312SMartin Matuska } 14782a58b312SMartin Matuska 1479783d3ff6SMartin Matuska 
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, 1480783d3ff6SMartin Matuska "Enable prefetching of BRT ZAP entries"); 1481783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, 1482783d3ff6SMartin Matuska "BRT ZAP leaf blockshift"); 1483783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, 1484783d3ff6SMartin Matuska "BRT ZAP indirect blockshift"); 1485