1*2a58b312SMartin Matuska /* 2*2a58b312SMartin Matuska * CDDL HEADER START 3*2a58b312SMartin Matuska * 4*2a58b312SMartin Matuska * The contents of this file are subject to the terms of the 5*2a58b312SMartin Matuska * Common Development and Distribution License (the "License"). 6*2a58b312SMartin Matuska * You may not use this file except in compliance with the License. 7*2a58b312SMartin Matuska * 8*2a58b312SMartin Matuska * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*2a58b312SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10*2a58b312SMartin Matuska * See the License for the specific language governing permissions 11*2a58b312SMartin Matuska * and limitations under the License. 12*2a58b312SMartin Matuska * 13*2a58b312SMartin Matuska * When distributing Covered Code, include this CDDL HEADER in each 14*2a58b312SMartin Matuska * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*2a58b312SMartin Matuska * If applicable, add the following below this CDDL HEADER, with the 16*2a58b312SMartin Matuska * fields enclosed by brackets "[]" replaced with your own identifying 17*2a58b312SMartin Matuska * information: Portions Copyright [yyyy] [name of copyright owner] 18*2a58b312SMartin Matuska * 19*2a58b312SMartin Matuska * CDDL HEADER END 20*2a58b312SMartin Matuska */ 21*2a58b312SMartin Matuska 22*2a58b312SMartin Matuska /* 23*2a58b312SMartin Matuska * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek 24*2a58b312SMartin Matuska */ 25*2a58b312SMartin Matuska 26*2a58b312SMartin Matuska #include <sys/zfs_context.h> 27*2a58b312SMartin Matuska #include <sys/spa.h> 28*2a58b312SMartin Matuska #include <sys/spa_impl.h> 29*2a58b312SMartin Matuska #include <sys/zio.h> 30*2a58b312SMartin Matuska #include <sys/brt.h> 31*2a58b312SMartin Matuska #include <sys/ddt.h> 32*2a58b312SMartin Matuska #include <sys/bitmap.h> 33*2a58b312SMartin Matuska #include <sys/zap.h> 34*2a58b312SMartin Matuska #include <sys/dmu_tx.h> 35*2a58b312SMartin 
Matuska #include <sys/arc.h> 36*2a58b312SMartin Matuska #include <sys/dsl_pool.h> 37*2a58b312SMartin Matuska #include <sys/dsl_scan.h> 38*2a58b312SMartin Matuska #include <sys/vdev_impl.h> 39*2a58b312SMartin Matuska #include <sys/kstat.h> 40*2a58b312SMartin Matuska #include <sys/wmsum.h> 41*2a58b312SMartin Matuska 42*2a58b312SMartin Matuska /* 43*2a58b312SMartin Matuska * Block Cloning design. 44*2a58b312SMartin Matuska * 45*2a58b312SMartin Matuska * Block Cloning allows to manually clone a file (or a subset of its blocks) 46*2a58b312SMartin Matuska * into another (or the same) file by just creating additional references to 47*2a58b312SMartin Matuska * the data blocks without copying the data itself. Those references are kept 48*2a58b312SMartin Matuska * in the Block Reference Tables (BRTs). 49*2a58b312SMartin Matuska * 50*2a58b312SMartin Matuska * In many ways this is similar to the existing deduplication, but there are 51*2a58b312SMartin Matuska * some important differences: 52*2a58b312SMartin Matuska * 53*2a58b312SMartin Matuska * - Deduplication is automatic and Block Cloning is not - one has to use a 54*2a58b312SMartin Matuska * dedicated system call(s) to clone the given file/blocks. 55*2a58b312SMartin Matuska * - Deduplication keeps all data blocks in its table, even those referenced 56*2a58b312SMartin Matuska * just once. Block Cloning creates an entry in its tables only when there 57*2a58b312SMartin Matuska * are at least two references to the given data block. If the block was 58*2a58b312SMartin Matuska * never explicitly cloned or the second to last reference was dropped, 59*2a58b312SMartin Matuska * there will be neither space nor performance overhead. 60*2a58b312SMartin Matuska * - Deduplication needs data to work - one needs to pass real data to the 61*2a58b312SMartin Matuska * write(2) syscall, so hash can be calculated. 
Block Cloning doesn't require 62*2a58b312SMartin Matuska * data, just block pointers to the data, so it is extremely fast, as we pay 63*2a58b312SMartin Matuska * neither the cost of reading the data, nor the cost of writing the data - 64*2a58b312SMartin Matuska * we operate exclusively on metadata. 65*2a58b312SMartin Matuska * - If the D (dedup) bit is not set in the block pointer, it means that 66*2a58b312SMartin Matuska * the block is not in the dedup table (DDT) and we won't consult the DDT 67*2a58b312SMartin Matuska * when we need to free the block. Block Cloning must be consulted on every 68*2a58b312SMartin Matuska * free, because we cannot modify the source BP (eg. by setting something 69*2a58b312SMartin Matuska * similar to the D bit), thus we have no hint if the block is in the 70*2a58b312SMartin Matuska * Block Reference Table (BRT), so we need to look into the BRT. There is 71*2a58b312SMartin Matuska * an optimization in place that allows us to eliminate the majority of BRT 72*2a58b312SMartin Matuska * lookups which is described below in the "Minimizing free penalty" section. 73*2a58b312SMartin Matuska * - The BRT entry is much smaller than the DDT entry - for BRT we only store 74*2a58b312SMartin Matuska * 64bit offset and 64bit reference counter. 75*2a58b312SMartin Matuska * - Dedup keys are cryptographic hashes, so two blocks that are close to each 76*2a58b312SMartin Matuska * other on disk are most likely in totally different parts of the DDT. 77*2a58b312SMartin Matuska * The BRT entry keys are offsets into a single top-level VDEV, so data blocks 78*2a58b312SMartin Matuska * from one file should have BRT entries close to each other. 79*2a58b312SMartin Matuska * - Scrub will only do a single pass over a block that is referenced multiple 80*2a58b312SMartin Matuska * times in the DDT. 
Unfortunately it is not currently (if at all) possible 81*2a58b312SMartin Matuska * with Block Cloning and block referenced multiple times will be scrubbed 82*2a58b312SMartin Matuska * multiple times. The new, sorted scrub should be able to eliminate 83*2a58b312SMartin Matuska * duplicated reads given enough memory. 84*2a58b312SMartin Matuska * - Deduplication requires cryptographically strong hash as a checksum or 85*2a58b312SMartin Matuska * additional data verification. Block Cloning works with any checksum 86*2a58b312SMartin Matuska * algorithm or even with checksumming disabled. 87*2a58b312SMartin Matuska * 88*2a58b312SMartin Matuska * As mentioned above, the BRT entries are much smaller than the DDT entries. 89*2a58b312SMartin Matuska * To uniquely identify a block we just need its vdev id and offset. We also 90*2a58b312SMartin Matuska * need to maintain a reference counter. The vdev id will often repeat, as there 91*2a58b312SMartin Matuska * is a small number of top-level VDEVs and a large number of blocks stored in 92*2a58b312SMartin Matuska * each VDEV. We take advantage of that to reduce the BRT entry size further by 93*2a58b312SMartin Matuska * maintaining one BRT for each top-level VDEV, so we can then have only offset 94*2a58b312SMartin Matuska * and counter as the BRT entry. 95*2a58b312SMartin Matuska * 96*2a58b312SMartin Matuska * Minimizing free penalty. 97*2a58b312SMartin Matuska * 98*2a58b312SMartin Matuska * Block Cloning allows creating additional references to any existing block. 99*2a58b312SMartin Matuska * When we free a block there is no hint in the block pointer whether the block 100*2a58b312SMartin Matuska * was cloned or not, so on each free we have to check if there is a 101*2a58b312SMartin Matuska * corresponding entry in the BRT or not. If there is, we need to decrease 102*2a58b312SMartin Matuska * the reference counter. 
Doing BRT lookup on every free can potentially be 103*2a58b312SMartin Matuska * expensive by requiring additional I/Os if the BRT doesn't fit into memory. 104*2a58b312SMartin Matuska * This is the main problem with deduplication, so we've learned our lesson and 105*2a58b312SMartin Matuska * try not to repeat the same mistake here. How do we do that? We divide each 106*2a58b312SMartin Matuska * top-level VDEV into 16MB regions. For each region we maintain a counter that 107*2a58b312SMartin Matuska * is a sum of all the BRT entries that have offsets within the region. This 108*2a58b312SMartin Matuska * creates the entries count array of 16bit numbers for each top-level VDEV. 109*2a58b312SMartin Matuska * The entries count array is always kept in memory and updated on disk in the 110*2a58b312SMartin Matuska * same transaction group as the BRT updates to keep everything in-sync. We can 111*2a58b312SMartin Matuska * keep the array in memory, because it is very small. With 16MB regions and 112*2a58b312SMartin Matuska * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease 113*2a58b312SMartin Matuska * the region size even further in the future). Now, when we want to free 114*2a58b312SMartin Matuska * a block, we first consult the array. If the counter for the whole region is 115*2a58b312SMartin Matuska * zero, there is no need to look for the BRT entry, as there isn't one for 116*2a58b312SMartin Matuska * sure. If the counter for the region is greater than zero, only then we will 117*2a58b312SMartin Matuska * do a BRT lookup and if an entry is found we will decrease the reference 118*2a58b312SMartin Matuska * counter in the BRT entry and in the entry counters array. 119*2a58b312SMartin Matuska * 120*2a58b312SMartin Matuska * The entry counters array is small, but can potentially be larger for very 121*2a58b312SMartin Matuska * large VDEVs or smaller regions. 
In this case we don't want to rewrite entire 122*2a58b312SMartin Matuska * array on every change. We then divide the array into 32kB block and keep 123*2a58b312SMartin Matuska * a bitmap of dirty blocks within a transaction group. When we sync the 124*2a58b312SMartin Matuska * transaction group we can only update the parts of the entry counters array 125*2a58b312SMartin Matuska * that were modified. Note: Keeping track of the dirty parts of the entry 126*2a58b312SMartin Matuska * counters array is implemented, but updating only parts of the array on disk 127*2a58b312SMartin Matuska * is not yet implemented - for now we will update entire array if there was 128*2a58b312SMartin Matuska * any change. 129*2a58b312SMartin Matuska * 130*2a58b312SMartin Matuska * The implementation tries to be economic: if BRT is not used, or no longer 131*2a58b312SMartin Matuska * used, there will be no entries in the MOS and no additional memory used (eg. 132*2a58b312SMartin Matuska * the entry counters array is only allocated if needed). 133*2a58b312SMartin Matuska * 134*2a58b312SMartin Matuska * Interaction between Deduplication and Block Cloning. 135*2a58b312SMartin Matuska * 136*2a58b312SMartin Matuska * If both functionalities are in use, we could end up with a block that is 137*2a58b312SMartin Matuska * referenced multiple times in both DDT and BRT. When we free one of the 138*2a58b312SMartin Matuska * references we couldn't tell where it belongs, so we would have to decide 139*2a58b312SMartin Matuska * what table takes the precedence: do we first clear DDT references or BRT 140*2a58b312SMartin Matuska * references? To avoid this dilemma BRT cooperates with DDT - if a given block 141*2a58b312SMartin Matuska * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will 142*2a58b312SMartin Matuska * lookup DDT entry instead and increase the counter there. No BRT entry 143*2a58b312SMartin Matuska * will be created for a block which has the D (dedup) bit set. 
144*2a58b312SMartin Matuska * BRT may be more efficient for manual deduplication, but if the block is 145*2a58b312SMartin Matuska * already in the DDT, then creating additional BRT entry would be less 146*2a58b312SMartin Matuska * efficient. This clever idea was proposed by Allan Jude. 147*2a58b312SMartin Matuska * 148*2a58b312SMartin Matuska * Block Cloning across datasets. 149*2a58b312SMartin Matuska * 150*2a58b312SMartin Matuska * Block Cloning is not limited to cloning blocks within the same dataset. 151*2a58b312SMartin Matuska * It is possible (and very useful) to clone blocks between different datasets. 152*2a58b312SMartin Matuska * One use case is recovering files from snapshots. By cloning the files into 153*2a58b312SMartin Matuska * dataset we need no additional storage. Without Block Cloning we would need 154*2a58b312SMartin Matuska * additional space for those files. 155*2a58b312SMartin Matuska * Another interesting use case is moving the files between datasets 156*2a58b312SMartin Matuska * (copying the file content to the new dataset and removing the source file). 157*2a58b312SMartin Matuska * In that case Block Cloning will only be used briefly, because the BRT entries 158*2a58b312SMartin Matuska * will be removed when the source is removed. 159*2a58b312SMartin Matuska * Note: currently it is not possible to clone blocks between encrypted 160*2a58b312SMartin Matuska * datasets, even if those datasets use the same encryption key (this includes 161*2a58b312SMartin Matuska * snapshots of encrypted datasets). Cloning blocks between datasets that use 162*2a58b312SMartin Matuska * the same keys should be possible and should be implemented in the future. 163*2a58b312SMartin Matuska * 164*2a58b312SMartin Matuska * Block Cloning flow through ZFS layers. 165*2a58b312SMartin Matuska * 166*2a58b312SMartin Matuska * Note: Block Cloning can be used both for cloning file system blocks and ZVOL 167*2a58b312SMartin Matuska * blocks. 
As of this writing no interface is implemented that allows for block 168*2a58b312SMartin Matuska * cloning within a ZVOL. 169*2a58b312SMartin Matuska * FreeBSD and Linux provides copy_file_range(2) system call and we will use it 170*2a58b312SMartin Matuska * for blocking cloning. 171*2a58b312SMartin Matuska * 172*2a58b312SMartin Matuska * ssize_t 173*2a58b312SMartin Matuska * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, 174*2a58b312SMartin Matuska * size_t len, unsigned int flags); 175*2a58b312SMartin Matuska * 176*2a58b312SMartin Matuska * Even though offsets and length represent bytes, they have to be 177*2a58b312SMartin Matuska * block-aligned or we will return the EXDEV error so the upper layer can 178*2a58b312SMartin Matuska * fallback to the generic mechanism that will just copy the data. 179*2a58b312SMartin Matuska * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. 180*2a58b312SMartin Matuska * This function was implemented based on zfs_write(), but instead of writing 181*2a58b312SMartin Matuska * the given data we first read block pointers using the new dmu_read_l0_bps() 182*2a58b312SMartin Matuska * function from the source file. Once we have BPs from the source file we call 183*2a58b312SMartin Matuska * the dmu_brt_clone() function on the destination file. This function 184*2a58b312SMartin Matuska * allocates BPs for us. We iterate over all source BPs. If the given BP is 185*2a58b312SMartin Matuska * a hole or an embedded block, we just copy BP as-is. If it points to a real 186*2a58b312SMartin Matuska * data we place this BP on a BRT pending list using the brt_pending_add() 187*2a58b312SMartin Matuska * function. 188*2a58b312SMartin Matuska * 189*2a58b312SMartin Matuska * We use this pending list to keep track of all BPs that got new references 190*2a58b312SMartin Matuska * within this transaction group. 
191*2a58b312SMartin Matuska * 192*2a58b312SMartin Matuska * Some special cases to consider and how we address them: 193*2a58b312SMartin Matuska * - The block we want to clone may have been created within the same 194*2a58b312SMartin Matuska * transaction group that we are trying to clone. Such block has no BP 195*2a58b312SMartin Matuska * allocated yet, so cannot be immediately cloned. We return EXDEV. 196*2a58b312SMartin Matuska * - The block we want to clone may have been modified within the same 197*2a58b312SMartin Matuska * transaction group. We return EXDEV. 198*2a58b312SMartin Matuska * - A block may be cloned multiple times during one transaction group (that's 199*2a58b312SMartin Matuska * why pending list is actually a tree and not an append-only list - this 200*2a58b312SMartin Matuska * way we can figure out faster if this block is cloned for the first time 201*2a58b312SMartin Matuska * in this txg or consecutive time). 202*2a58b312SMartin Matuska * - A block may be cloned and freed within the same transaction group 203*2a58b312SMartin Matuska * (see dbuf_undirty()). 204*2a58b312SMartin Matuska * - A block may be cloned and within the same transaction group the clone 205*2a58b312SMartin Matuska * can be cloned again (see dmu_read_l0_bps()). 206*2a58b312SMartin Matuska * - A file might have been deleted, but the caller still has a file descriptor 207*2a58b312SMartin Matuska * open to this file and clones it. 208*2a58b312SMartin Matuska * 209*2a58b312SMartin Matuska * When we free a block we have an additional step in the ZIO pipeline where we 210*2a58b312SMartin Matuska * call the zio_brt_free() function. We then call the brt_entry_decref() 211*2a58b312SMartin Matuska * that loads the corresponding BRT entry (if one exists) and decreases 212*2a58b312SMartin Matuska * reference counter. If this is not the last reference we will stop ZIO 213*2a58b312SMartin Matuska * pipeline here. 
If this is the last reference or the block is not in the 214*2a58b312SMartin Matuska * BRT, we continue the pipeline and free the block as usual. 215*2a58b312SMartin Matuska * 216*2a58b312SMartin Matuska * At the beginning of spa_sync() where there can be no more block cloning, 217*2a58b312SMartin Matuska * but before issuing frees we call brt_pending_apply(). This function applies 218*2a58b312SMartin Matuska * all the new clones to the BRT table - we load BRT entries and update 219*2a58b312SMartin Matuska * reference counters. To sync new BRT entries to disk, we use brt_sync() 220*2a58b312SMartin Matuska * function. This function will sync all dirty per-top-level-vdev BRTs, 221*2a58b312SMartin Matuska * the entry counters arrays, etc. 222*2a58b312SMartin Matuska * 223*2a58b312SMartin Matuska * Block Cloning and ZIL. 224*2a58b312SMartin Matuska * 225*2a58b312SMartin Matuska * Every clone operation is divided into chunks (similar to write) and each 226*2a58b312SMartin Matuska * chunk is cloned in a separate transaction. The chunk size is determined by 227*2a58b312SMartin Matuska * how many BPs we can fit into a single ZIL entry. 228*2a58b312SMartin Matuska * Replaying clone operation is different from the regular clone operation, 229*2a58b312SMartin Matuska * as when we log clone operations we cannot use the source object - it may 230*2a58b312SMartin Matuska * reside on a different dataset, so we log BPs we want to clone. 231*2a58b312SMartin Matuska * The ZIL is replayed when we mount the given dataset, not when the pool is 232*2a58b312SMartin Matuska * imported. Taking this into account it is possible that the pool is imported 233*2a58b312SMartin Matuska * without mounting datasets and the source dataset is destroyed before the 234*2a58b312SMartin Matuska * destination dataset is mounted and its ZIL replayed. 
235*2a58b312SMartin Matuska * To address this situation we leverage zil_claim() mechanism where ZFS will 236*2a58b312SMartin Matuska * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE 237*2a58b312SMartin Matuska * entries, we will bump reference counters for their BPs in the BRT and then 238*2a58b312SMartin Matuska * on mount and ZIL replay we will just attach BPs to the file without 239*2a58b312SMartin Matuska * bumping reference counters. 240*2a58b312SMartin Matuska * Note it is still possible that after zil_claim() we never mount the 241*2a58b312SMartin Matuska * destination, so we never replay its ZIL and we destroy it. This way we would 242*2a58b312SMartin Matuska * end up with leaked references in BRT. We address that too as ZFS gives us 243*2a58b312SMartin Matuska * a chance to clean this up on dataset destroy (see zil_free_clone_range()). 244*2a58b312SMartin Matuska */ 245*2a58b312SMartin Matuska 246*2a58b312SMartin Matuska /* 247*2a58b312SMartin Matuska * BRT - Block Reference Table. 248*2a58b312SMartin Matuska */ 249*2a58b312SMartin Matuska #define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" 250*2a58b312SMartin Matuska 251*2a58b312SMartin Matuska /* 252*2a58b312SMartin Matuska * We divide each VDEV into 16MB chunks. Each chunk is represented in memory 253*2a58b312SMartin Matuska * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B 254*2a58b312SMartin Matuska * Each element in this array represents how many BRT entries do we have in this 255*2a58b312SMartin Matuska * chunk of storage. We always load this entire array into memory and update as 256*2a58b312SMartin Matuska * needed. By having it in memory we can quickly tell (during zio_free()) if 257*2a58b312SMartin Matuska * there are any BRT entries that we might need to update. 
258*2a58b312SMartin Matuska * 259*2a58b312SMartin Matuska * This value cannot be larger than 16MB, at least as long as we support 260*2a58b312SMartin Matuska * 512 byte block sizes. With 512 byte block size we can have exactly 261*2a58b312SMartin Matuska * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too 262*2a58b312SMartin Matuska * many for a 16bit counter. 263*2a58b312SMartin Matuska */ 264*2a58b312SMartin Matuska #define BRT_RANGESIZE (16 * 1024 * 1024) 265*2a58b312SMartin Matuska _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, 266*2a58b312SMartin Matuska "BRT_RANGESIZE is too large."); 267*2a58b312SMartin Matuska /* 268*2a58b312SMartin Matuska * We don't want to update the whole structure every time. Maintain bitmap 269*2a58b312SMartin Matuska * of dirty blocks within the regions, so that a single bit represents a 270*2a58b312SMartin Matuska * block size of entcounts. For example if we have a 1PB vdev then all 271*2a58b312SMartin Matuska * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this 272*2a58b312SMartin Matuska * 128MB array of entcounts into 32kB disk blocks, as we don't want to update 273*2a58b312SMartin Matuska * the whole 128MB on disk when we have updated only a single entcount. 274*2a58b312SMartin Matuska * We maintain a bitmap where each 32kB disk block within 128MB entcounts array 275*2a58b312SMartin Matuska * is represented by a single bit. This gives us 4096 bits. A set bit in the 276*2a58b312SMartin Matuska * bitmap means that we had a change in at least one of the 16384 entcounts 277*2a58b312SMartin Matuska * that reside on a 32kB disk block (32kB / sizeof (uint16_t)). 
278*2a58b312SMartin Matuska */ 279*2a58b312SMartin Matuska #define BRT_BLOCKSIZE (32 * 1024) 280*2a58b312SMartin Matuska #define BRT_RANGESIZE_TO_NBLOCKS(size) \ 281*2a58b312SMartin Matuska (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) 282*2a58b312SMartin Matuska 283*2a58b312SMartin Matuska #define BRT_LITTLE_ENDIAN 0 284*2a58b312SMartin Matuska #define BRT_BIG_ENDIAN 1 285*2a58b312SMartin Matuska #ifdef _ZFS_LITTLE_ENDIAN 286*2a58b312SMartin Matuska #define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN 287*2a58b312SMartin Matuska #define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN 288*2a58b312SMartin Matuska #else 289*2a58b312SMartin Matuska #define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN 290*2a58b312SMartin Matuska #define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN 291*2a58b312SMartin Matuska #endif 292*2a58b312SMartin Matuska 293*2a58b312SMartin Matuska typedef struct brt_vdev_phys { 294*2a58b312SMartin Matuska uint64_t bvp_mos_entries; 295*2a58b312SMartin Matuska uint64_t bvp_size; 296*2a58b312SMartin Matuska uint64_t bvp_byteorder; 297*2a58b312SMartin Matuska uint64_t bvp_totalcount; 298*2a58b312SMartin Matuska uint64_t bvp_rangesize; 299*2a58b312SMartin Matuska uint64_t bvp_usedspace; 300*2a58b312SMartin Matuska uint64_t bvp_savedspace; 301*2a58b312SMartin Matuska } brt_vdev_phys_t; 302*2a58b312SMartin Matuska 303*2a58b312SMartin Matuska typedef struct brt_vdev { 304*2a58b312SMartin Matuska /* 305*2a58b312SMartin Matuska * VDEV id. 306*2a58b312SMartin Matuska */ 307*2a58b312SMartin Matuska uint64_t bv_vdevid; 308*2a58b312SMartin Matuska /* 309*2a58b312SMartin Matuska * Is the structure initiated? 310*2a58b312SMartin Matuska * (bv_entcount and bv_bitmap are allocated?) 311*2a58b312SMartin Matuska */ 312*2a58b312SMartin Matuska boolean_t bv_initiated; 313*2a58b312SMartin Matuska /* 314*2a58b312SMartin Matuska * Object number in the MOS for the entcount array and brt_vdev_phys. 
315*2a58b312SMartin Matuska */ 316*2a58b312SMartin Matuska uint64_t bv_mos_brtvdev; 317*2a58b312SMartin Matuska /* 318*2a58b312SMartin Matuska * Object number in the MOS for the entries table. 319*2a58b312SMartin Matuska */ 320*2a58b312SMartin Matuska uint64_t bv_mos_entries; 321*2a58b312SMartin Matuska /* 322*2a58b312SMartin Matuska * Entries to sync. 323*2a58b312SMartin Matuska */ 324*2a58b312SMartin Matuska avl_tree_t bv_tree; 325*2a58b312SMartin Matuska /* 326*2a58b312SMartin Matuska * Does the bv_entcount[] array needs byte swapping? 327*2a58b312SMartin Matuska */ 328*2a58b312SMartin Matuska boolean_t bv_need_byteswap; 329*2a58b312SMartin Matuska /* 330*2a58b312SMartin Matuska * Number of entries in the bv_entcount[] array. 331*2a58b312SMartin Matuska */ 332*2a58b312SMartin Matuska uint64_t bv_size; 333*2a58b312SMartin Matuska /* 334*2a58b312SMartin Matuska * This is the array with BRT entry count per BRT_RANGESIZE. 335*2a58b312SMartin Matuska */ 336*2a58b312SMartin Matuska uint16_t *bv_entcount; 337*2a58b312SMartin Matuska /* 338*2a58b312SMartin Matuska * Sum of all bv_entcount[]s. 339*2a58b312SMartin Matuska */ 340*2a58b312SMartin Matuska uint64_t bv_totalcount; 341*2a58b312SMartin Matuska /* 342*2a58b312SMartin Matuska * Space on disk occupied by cloned blocks (without compression). 343*2a58b312SMartin Matuska */ 344*2a58b312SMartin Matuska uint64_t bv_usedspace; 345*2a58b312SMartin Matuska /* 346*2a58b312SMartin Matuska * How much additional space would be occupied without block cloning. 347*2a58b312SMartin Matuska */ 348*2a58b312SMartin Matuska uint64_t bv_savedspace; 349*2a58b312SMartin Matuska /* 350*2a58b312SMartin Matuska * brt_vdev_phys needs updating on disk. 351*2a58b312SMartin Matuska */ 352*2a58b312SMartin Matuska boolean_t bv_meta_dirty; 353*2a58b312SMartin Matuska /* 354*2a58b312SMartin Matuska * bv_entcount[] needs updating on disk. 
355*2a58b312SMartin Matuska */ 356*2a58b312SMartin Matuska boolean_t bv_entcount_dirty; 357*2a58b312SMartin Matuska /* 358*2a58b312SMartin Matuska * bv_entcount[] potentially can be a bit too big to sychronize it all 359*2a58b312SMartin Matuska * when we just changed few entcounts. The fields below allow us to 360*2a58b312SMartin Matuska * track updates to bv_entcount[] array since the last sync. 361*2a58b312SMartin Matuska * A single bit in the bv_bitmap represents as many entcounts as can 362*2a58b312SMartin Matuska * fit into a single BRT_BLOCKSIZE. 363*2a58b312SMartin Matuska * For example we have 65536 entcounts in the bv_entcount array 364*2a58b312SMartin Matuska * (so the whole array is 128kB). We updated bv_entcount[2] and 365*2a58b312SMartin Matuska * bv_entcount[5]. In that case only first bit in the bv_bitmap will 366*2a58b312SMartin Matuska * be set and we will write only first BRT_BLOCKSIZE out of 128kB. 367*2a58b312SMartin Matuska */ 368*2a58b312SMartin Matuska ulong_t *bv_bitmap; 369*2a58b312SMartin Matuska uint64_t bv_nblocks; 370*2a58b312SMartin Matuska } brt_vdev_t; 371*2a58b312SMartin Matuska 372*2a58b312SMartin Matuska /* 373*2a58b312SMartin Matuska * In-core brt 374*2a58b312SMartin Matuska */ 375*2a58b312SMartin Matuska typedef struct brt { 376*2a58b312SMartin Matuska krwlock_t brt_lock; 377*2a58b312SMartin Matuska spa_t *brt_spa; 378*2a58b312SMartin Matuska #define brt_mos brt_spa->spa_meta_objset 379*2a58b312SMartin Matuska uint64_t brt_rangesize; 380*2a58b312SMartin Matuska uint64_t brt_usedspace; 381*2a58b312SMartin Matuska uint64_t brt_savedspace; 382*2a58b312SMartin Matuska avl_tree_t brt_pending_tree[TXG_SIZE]; 383*2a58b312SMartin Matuska kmutex_t brt_pending_lock[TXG_SIZE]; 384*2a58b312SMartin Matuska /* Sum of all entries across all bv_trees. 
 */
	uint64_t	brt_nentries;
	brt_vdev_t	*brt_vdevs;
	uint64_t	brt_nvdevs;
} brt_t;

/* Size of bre_offset / sizeof (uint64_t). */
#define	BRT_KEY_WORDS	(1)

/*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
typedef struct brt_entry {
	uint64_t	bre_offset;	/* ZAP key: DVA offset on the vdev */
	uint64_t	bre_refcount;	/* ZAP value: number of references */
	avl_node_t	bre_node;	/* linkage in brt_vdev_t's bv_tree */
} brt_entry_t;

/*
 * A clone request recorded in open context and applied to the BRT later;
 * bpe_count is how many times bpe_bp was cloned in this batch.
 */
typedef struct brt_pending_entry {
	blkptr_t	bpe_bp;
	int		bpe_count;
	avl_node_t	bpe_node;
} brt_pending_entry_t;

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

/* Leaf/indirect block shifts for the per-vdev BRT entries ZAP objects. */
int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t	*brt_ksp;

/* Named kstat counters exported for BRT addref/decref activity. */
typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

/* Write-mostly sums backing the kstat counters above. */
struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

/* Acquire the global BRT lock as reader. */
static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

/* Acquire the global BRT lock as writer. */
static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

/* Drop the global BRT lock. */
static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

/*
 * Return the entry count for range idx, converting from the on-disk byte
 * order if the array was loaded byte-swapped.
 */
static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

/* Store entcnt for range idx, preserving the array's byte order. */
static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

/* Increment the entry count for range idx; must not already be UINT16_MAX. */
static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

/* Decrement the entry count for range idx; must be non-zero on entry. */
static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
/*
 * Dump all per-vdev BRT state (counters, non-zero entcounts, dirty-block
 * bitmap) to the debug log.  No-op unless ZFS_DEBUG_BRT is enabled.
 */
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			/* Render the dirty-block bitmap as 'x'/'.' string. */
			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("  bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif

/* Translate a vdev id to its brt_vdev_t, or NULL if out of range. */
static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}

/*
 * Create the on-disk representation of this vdev's BRT: the entries ZAP
 * (64-bit keys), the entcount array object whose bonus buffer holds
 * brt_vdev_phys_t, and the pool-directory pointer to the latter.  Also
 * bumps the block-cloning feature refcount.
 */
static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate DMU buffer to store the bv_entcount[] array.
	 * We will keep array size (bv_size) and cumulative count for all
	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

/*
 * Allocate (first use) or grow the in-core bv_entcount[] array and
 * dirty-block bitmap to match the vdev's current minimum asize.  Existing
 * contents, if any, are copied into the new arrays.  Shrinking is not
 * supported yet (see TODO below).
 */
static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	/* One entcount slot per brt_rangesize bytes of the vdev. */
	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
		 *     size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		kmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

/*
 * Load this vdev's BRT from the MOS, if it exists: look up the
 * pool-directory entry, read brt_vdev_phys_t from the bonus buffer and the
 * entcount array from the object, then fold the per-vdev space accounting
 * into the pool-wide totals.  A missing directory entry simply means the
 * vdev has no BRT yet and is not an error.
 */
static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	/* All vdevs must agree on one range size; adopt the first seen. */
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

/* Free this vdev's in-core BRT state (entcount array, bitmap, AVL tree). */
static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

/*
 * Tear down an empty vdev BRT: destroy the (verified empty) entries ZAP
 * and the entcount object, remove the pool-directory entry, free in-core
 * state and drop the block-cloning feature refcount.
 */
static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

/*
 * Grow the brt_vdevs[] array to nvdevs slots, preserving existing entries
 * and zero-initializing the new ones.
 */
static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}

/*
 * Cheap membership test: TRUE only if a BRT entry may exist for bre's
 * range on this vdev, i.e. the range's entcount is non-zero.
 */
static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}

/*
 * Account one new dsize-byte reference to bre on brtvd.  Every reference
 * adds to the saved-space totals; only the first reference to a block
 * (bre_refcount == 1) also counts as used space, bumps the range's
 * entcount and dirties the corresponding entcount block.
 */
static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	/*
	 * NOTE(review): maps the entcount index to a dirty-block bitmap bit;
	 * assumes BRT_BLOCKSIZE*8 entcounts per bit -- confirm against
	 * BRT_RANGESIZE_TO_NBLOCKS() (not visible here).
	 */
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

/*
 * Account the removal of one dsize-byte reference to bre on brtvd.  The
 * saved-space totals always shrink; only when the last reference went away
 * (bre_refcount == 0) do the used-space totals and the range's entcount
 * shrink, dirtying the corresponding entcount block.
 */
static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	/* NOTE(review): same bitmap-index mapping as in brt_vdev_addref(). */
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

/*
 * Write this vdev's dirty BRT state to the MOS in syncing context: the
 * entcount array (currently rewritten whole, see TODO) and the
 * brt_vdev_phys_t bonus buffer.  Clears the dirty flags.
 */
static void
brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
		 */
		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = brt->brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}

/*
 * Size brt_vdevs[] to the pool's current top-level vdev count and, when
 * load is set, read each vdev's BRT from disk.  Falls back to the default
 * range size if no on-disk BRT supplied one.
 */
static void
brt_vdevs_alloc(brt_t *brt, boolean_t load)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);

	if (load) {
		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brtvd = &brt->brt_vdevs[vdevid];
			ASSERT(brtvd->bv_entcount == NULL);

			brt_vdev_load(brt, brtvd);
		}
	}

	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = BRT_RANGESIZE;
	}

	brt_unlock(brt);
}

/* Release all per-vdev in-core BRT state and the brt_vdevs[] array itself. */
static void
brt_vdevs_free(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brt, brtvd);
	}
	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);

	brt_unlock(brt);
}

/*
 * Initialize a BRT entry key from bp's first DVA and report which vdev it
 * lives on.  The refcount starts at zero.
 */
static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{

	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
	bre->bre_refcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}
/*
 * AVL comparator for brt_entry_t nodes; orders entries by their offset key.
 */
static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1;
	const brt_entry_t *bre2 = x2;

	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
}

/*
 * Look up an entry's refcount in the on-disk ZAP for the given vdev.
 * On success bre->bre_refcount is filled in.  Returns ENOENT when the
 * entry cannot exist (per the in-core entcount) or is not in the ZAP.
 *
 * NOTE: this function drops the BRT lock around the ZAP I/O and
 * reacquires it as a WRITE lock before returning, so callers must
 * revalidate any brt_vdevs[] pointers afterwards.
 */
static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t mos_entries;
	uint64_t one, physsize;
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	/* Quick negative check against the in-core entcount array. */
	if (!brt_vdev_lookup(brt, brtvd, bre))
		return (SET_ERROR(ENOENT));

	/*
	 * Remember mos_entries object number. After we reacquire the BRT lock,
	 * the brtvd pointer may be invalid.
	 */
	mos_entries = brtvd->bv_mos_entries;
	if (mos_entries == 0)
		return (SET_ERROR(ENOENT));

	brt_unlock(brt);

	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
	    BRT_KEY_WORDS, &one, &physsize);
	if (error == 0) {
		/* Each key maps to exactly one uint64 refcount. */
		ASSERT3U(one, ==, 1);
		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));

		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
		    &bre->bre_offset, BRT_KEY_WORDS, 1,
		    sizeof (bre->bre_refcount), &bre->bre_refcount);
		/*
		 * NOTE(review): brtvd is dereferenced here for debug output
		 * after the lock was dropped above, where the comment says
		 * the pointer may be invalid — confirm this is benign.
		 */
		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
		    "count=%llu error=%d", (u_longlong_t)mos_entries,
		    (u_longlong_t)brtvd->bv_vdevid,
		    (u_longlong_t)bre->bre_offset,
		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
	}

	brt_wlock(brt);

	return (error);
}

/*
 * Issue an asynchronous prefetch of the ZAP entry for the given key so
 * it is likely cached by the time the syncing context needs it.
 * Takes and drops the BRT read lock only to snapshot bv_mos_entries.
 */
static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
	    (u_longlong_t)bre->bre_offset);
	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

/*
 * Write (create or overwrite) the entry's refcount in the vdev's ZAP.
 * Caller holds the BRT lock; the refcount must be positive.
 */
static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT(bre->bre_refcount > 0);

	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

/*
 * Remove the entry from the vdev's ZAP.  Caller holds the BRT lock and
 * the refcount must already have dropped to zero.
 */
static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT0(bre->bre_refcount);

	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

/*
 * Return TRUE if we _can_ have BRT entry for this bp.
It might be false 1184*2a58b312SMartin Matuska * positive, but gives us quick answer if we should look into BRT, which 1185*2a58b312SMartin Matuska * may require reads and thus will be more expensive. 1186*2a58b312SMartin Matuska */ 1187*2a58b312SMartin Matuska boolean_t 1188*2a58b312SMartin Matuska brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 1189*2a58b312SMartin Matuska { 1190*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1191*2a58b312SMartin Matuska brt_vdev_t *brtvd; 1192*2a58b312SMartin Matuska brt_entry_t bre_search; 1193*2a58b312SMartin Matuska boolean_t mayexists = FALSE; 1194*2a58b312SMartin Matuska uint64_t vdevid; 1195*2a58b312SMartin Matuska 1196*2a58b312SMartin Matuska brt_entry_fill(bp, &bre_search, &vdevid); 1197*2a58b312SMartin Matuska 1198*2a58b312SMartin Matuska brt_rlock(brt); 1199*2a58b312SMartin Matuska 1200*2a58b312SMartin Matuska brtvd = brt_vdev(brt, vdevid); 1201*2a58b312SMartin Matuska if (brtvd != NULL && brtvd->bv_initiated) { 1202*2a58b312SMartin Matuska if (!avl_is_empty(&brtvd->bv_tree) || 1203*2a58b312SMartin Matuska brt_vdev_lookup(brt, brtvd, &bre_search)) { 1204*2a58b312SMartin Matuska mayexists = TRUE; 1205*2a58b312SMartin Matuska } 1206*2a58b312SMartin Matuska } 1207*2a58b312SMartin Matuska 1208*2a58b312SMartin Matuska brt_unlock(brt); 1209*2a58b312SMartin Matuska 1210*2a58b312SMartin Matuska return (mayexists); 1211*2a58b312SMartin Matuska } 1212*2a58b312SMartin Matuska 1213*2a58b312SMartin Matuska uint64_t 1214*2a58b312SMartin Matuska brt_get_dspace(spa_t *spa) 1215*2a58b312SMartin Matuska { 1216*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1217*2a58b312SMartin Matuska 1218*2a58b312SMartin Matuska if (brt == NULL) 1219*2a58b312SMartin Matuska return (0); 1220*2a58b312SMartin Matuska 1221*2a58b312SMartin Matuska return (brt->brt_savedspace); 1222*2a58b312SMartin Matuska } 1223*2a58b312SMartin Matuska 1224*2a58b312SMartin Matuska uint64_t 1225*2a58b312SMartin Matuska brt_get_used(spa_t *spa) 1226*2a58b312SMartin 
Matuska { 1227*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1228*2a58b312SMartin Matuska 1229*2a58b312SMartin Matuska if (brt == NULL) 1230*2a58b312SMartin Matuska return (0); 1231*2a58b312SMartin Matuska 1232*2a58b312SMartin Matuska return (brt->brt_usedspace); 1233*2a58b312SMartin Matuska } 1234*2a58b312SMartin Matuska 1235*2a58b312SMartin Matuska uint64_t 1236*2a58b312SMartin Matuska brt_get_saved(spa_t *spa) 1237*2a58b312SMartin Matuska { 1238*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1239*2a58b312SMartin Matuska 1240*2a58b312SMartin Matuska if (brt == NULL) 1241*2a58b312SMartin Matuska return (0); 1242*2a58b312SMartin Matuska 1243*2a58b312SMartin Matuska return (brt->brt_savedspace); 1244*2a58b312SMartin Matuska } 1245*2a58b312SMartin Matuska 1246*2a58b312SMartin Matuska uint64_t 1247*2a58b312SMartin Matuska brt_get_ratio(spa_t *spa) 1248*2a58b312SMartin Matuska { 1249*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1250*2a58b312SMartin Matuska 1251*2a58b312SMartin Matuska if (brt->brt_usedspace == 0) 1252*2a58b312SMartin Matuska return (100); 1253*2a58b312SMartin Matuska 1254*2a58b312SMartin Matuska return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / 1255*2a58b312SMartin Matuska brt->brt_usedspace); 1256*2a58b312SMartin Matuska } 1257*2a58b312SMartin Matuska 1258*2a58b312SMartin Matuska static int 1259*2a58b312SMartin Matuska brt_kstats_update(kstat_t *ksp, int rw) 1260*2a58b312SMartin Matuska { 1261*2a58b312SMartin Matuska brt_stats_t *bs = ksp->ks_data; 1262*2a58b312SMartin Matuska 1263*2a58b312SMartin Matuska if (rw == KSTAT_WRITE) 1264*2a58b312SMartin Matuska return (EACCES); 1265*2a58b312SMartin Matuska 1266*2a58b312SMartin Matuska bs->brt_addref_entry_in_memory.value.ui64 = 1267*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_in_memory); 1268*2a58b312SMartin Matuska bs->brt_addref_entry_not_on_disk.value.ui64 = 1269*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 1270*2a58b312SMartin 
Matuska bs->brt_addref_entry_on_disk.value.ui64 = 1271*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_on_disk); 1272*2a58b312SMartin Matuska bs->brt_addref_entry_read_lost_race.value.ui64 = 1273*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); 1274*2a58b312SMartin Matuska bs->brt_decref_entry_in_memory.value.ui64 = 1275*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_in_memory); 1276*2a58b312SMartin Matuska bs->brt_decref_entry_loaded_from_disk.value.ui64 = 1277*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 1278*2a58b312SMartin Matuska bs->brt_decref_entry_not_in_memory.value.ui64 = 1279*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); 1280*2a58b312SMartin Matuska bs->brt_decref_entry_not_on_disk.value.ui64 = 1281*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); 1282*2a58b312SMartin Matuska bs->brt_decref_entry_read_lost_race.value.ui64 = 1283*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 1284*2a58b312SMartin Matuska bs->brt_decref_entry_still_referenced.value.ui64 = 1285*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 1286*2a58b312SMartin Matuska bs->brt_decref_free_data_later.value.ui64 = 1287*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_later); 1288*2a58b312SMartin Matuska bs->brt_decref_free_data_now.value.ui64 = 1289*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_now); 1290*2a58b312SMartin Matuska bs->brt_decref_no_entry.value.ui64 = 1291*2a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_no_entry); 1292*2a58b312SMartin Matuska 1293*2a58b312SMartin Matuska return (0); 1294*2a58b312SMartin Matuska } 1295*2a58b312SMartin Matuska 1296*2a58b312SMartin Matuska static void 1297*2a58b312SMartin Matuska brt_stat_init(void) 1298*2a58b312SMartin Matuska { 1299*2a58b312SMartin Matuska 
	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	/* Register the "brtstats" kstat; failure just disables stats. */
	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

/*
 * Tear down BRT statistics: delete the kstat (if registered), then
 * finalize every wmsum counter.
 */
static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

/*
 * Module-wide BRT initialization: create the kmem caches for BRT
 * entries and pending entries, then set up statistics.
 */
void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

/*
 * Module-wide BRT teardown; mirrors brt_init() in reverse order.
 */
void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}

/*
 * Allocate a BRT entry from the cache, copying key and refcount from
 * the template.  Caller owns the returned entry (free via
 * brt_entry_free()).
 */
static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

/* Return a BRT entry to its kmem cache. */
static void
brt_entry_free(brt_entry_t *bre)
{

	kmem_cache_free(brt_entry_cache, bre);
}

/*
 * Increase the reference count on the BRT entry for the given block,
 * creating the in-core entry (and loading the on-disk refcount) if
 * needed.  Called from syncing context via brt_pending_apply().
 */
static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* New VDEV was added.
 */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	/* Lazily set up this vdev's in-core BRT structures on first use. */
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT (read) lock and
		 * reacquire it (write).
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/* bre_search now contains correct bre_refcount */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * When the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated, we need to update brtvd's pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		/* Re-check the tree: another thread may have raced us. */
		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped in
			 * brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}

/* Return TRUE if block should be freed immediately.
 */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* bre_search now contains correct bre_refcount */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated, we need to update brtvd's pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		/* No BRT entry anywhere: not a cloned block. */
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	/* Cache the on-disk entry in core so the decrement can be synced. */
	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	if (bre->bre_refcount == 0) {
		/* Last reference is being dropped: free the data now. */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}

/*
 * Prefetch the BRT ZAP entry for this block pointer, if prefetching is
 * enabled via the zfs_brt_prefetch tunable.
 */
static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{
	brt_entry_t bre;
	uint64_t vdevid;

	ASSERT(bp != NULL);

	if (!zfs_brt_prefetch)
		return;

	brt_entry_fill(bp, &bre, &vdevid);

	brt_entry_prefetch(brt, vdevid, &bre);
}

/*
 * AVL comparator for pending entries: orders by physical birth txg,
 * then by vdev, then by offset of the first DVA.
 */
static int
brt_pending_entry_compare(const void *x1, const void *x2)
{
	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
	int cmp;

	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
	if (cmp == 0) {
		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
		    DVA_GET_VDEV(&bp2->blk_dva[0]));
		if (cmp == 0) {
			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
		}
	}

	return (cmp);
}

/*
 * Record a pending reference for this block in the per-txg pending tree;
 * the actual refcount bump happens later, in syncing context, via
 * brt_pending_apply().  Duplicate adds for the same bp in the same txg
 * just increment bpe_count on the existing node.
 */
void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, *newbpe;
	avl_index_t where;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	/* Allocate outside the lock; freed below if we lose the race. */
	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
	newbpe->bpe_bp = *bp;
	newbpe->bpe_count = 1;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, newbpe, &where);
	if (bpe == NULL) {
		avl_insert(pending_tree, newbpe, where);
		newbpe = NULL;
	} else {
		bpe->bpe_count++;
	}

	mutex_exit(pending_lock);

	if (newbpe != NULL) {
		ASSERT(bpe != NULL);
		ASSERT(bpe != newbpe);
		kmem_cache_free(brt_pending_entry_cache, newbpe);
	} else {
		ASSERT(bpe == NULL);
	}

	/* Prefetch BRT entry, as we will need it in the syncing context. */
	brt_prefetch(brt, bp);
}

/*
 * Undo a brt_pending_add() for this block in the same txg (e.g. when the
 * clone operation is rolled back before sync).
 */
void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;
	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}

/*
 * Syncing context: drain the per-txg pending tree and apply each queued
 * reference, preferring a DDT refcount bump for dedup blocks and falling
 * back to a BRT entry otherwise.  The pending lock is dropped around each
 * node's processing since addref/ddt_addref may block.
 */
void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	void *c;

	ASSERT3U(txg, !=, 0);

	brt = spa->spa_brt;
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	mutex_enter(pending_lock);

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		/* Node is detached; safe to drop the lock while processing. */
		mutex_exit(pending_lock);

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has DEDUP bit set, it means that it
			 * already exists in the DEDUP table, so we can just
			 * use that instead of creating new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
		mutex_enter(pending_lock);
	}

	mutex_exit(pending_lock);
}

/*
 * Persist a single in-core BRT entry to its vdev's ZAP: remove it when
 * the refcount dropped to zero, otherwise write the updated refcount.
 */
static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);

	if (bre->bre_refcount == 0) {
		int error;

		error = brt_entry_remove(brt, brtvd, bre, tx);
		ASSERT(error == 0 || error == ENOENT);
		/*
		 * If error == ENOENT then zfs_clone_range() was done from a
		 * removed (but opened) file (open(), unlink()).
		 */
		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
	} else {
		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
	}
}

/*
 * Syncing context: flush every dirty per-vdev BRT (entries and metadata)
 * to disk for this txg.
 */
static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		/* First sync for this vdev: create its MOS objects. */
		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brt, brtvd, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
brt->brt_nentries--; 1763*2a58b312SMartin Matuska } 1764*2a58b312SMartin Matuska 1765*2a58b312SMartin Matuska brt_vdev_sync(brt, brtvd, tx); 1766*2a58b312SMartin Matuska 1767*2a58b312SMartin Matuska if (brtvd->bv_totalcount == 0) 1768*2a58b312SMartin Matuska brt_vdev_destroy(brt, brtvd, tx); 1769*2a58b312SMartin Matuska } 1770*2a58b312SMartin Matuska 1771*2a58b312SMartin Matuska ASSERT0(brt->brt_nentries); 1772*2a58b312SMartin Matuska 1773*2a58b312SMartin Matuska brt_unlock(brt); 1774*2a58b312SMartin Matuska } 1775*2a58b312SMartin Matuska 1776*2a58b312SMartin Matuska void 1777*2a58b312SMartin Matuska brt_sync(spa_t *spa, uint64_t txg) 1778*2a58b312SMartin Matuska { 1779*2a58b312SMartin Matuska dmu_tx_t *tx; 1780*2a58b312SMartin Matuska brt_t *brt; 1781*2a58b312SMartin Matuska 1782*2a58b312SMartin Matuska ASSERT(spa_syncing_txg(spa) == txg); 1783*2a58b312SMartin Matuska 1784*2a58b312SMartin Matuska brt = spa->spa_brt; 1785*2a58b312SMartin Matuska brt_rlock(brt); 1786*2a58b312SMartin Matuska if (brt->brt_nentries == 0) { 1787*2a58b312SMartin Matuska /* No changes. 
*/ 1788*2a58b312SMartin Matuska brt_unlock(brt); 1789*2a58b312SMartin Matuska return; 1790*2a58b312SMartin Matuska } 1791*2a58b312SMartin Matuska brt_unlock(brt); 1792*2a58b312SMartin Matuska 1793*2a58b312SMartin Matuska tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1794*2a58b312SMartin Matuska 1795*2a58b312SMartin Matuska brt_sync_table(brt, tx); 1796*2a58b312SMartin Matuska 1797*2a58b312SMartin Matuska dmu_tx_commit(tx); 1798*2a58b312SMartin Matuska } 1799*2a58b312SMartin Matuska 1800*2a58b312SMartin Matuska static void 1801*2a58b312SMartin Matuska brt_table_alloc(brt_t *brt) 1802*2a58b312SMartin Matuska { 1803*2a58b312SMartin Matuska 1804*2a58b312SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) { 1805*2a58b312SMartin Matuska avl_create(&brt->brt_pending_tree[i], 1806*2a58b312SMartin Matuska brt_pending_entry_compare, 1807*2a58b312SMartin Matuska sizeof (brt_pending_entry_t), 1808*2a58b312SMartin Matuska offsetof(brt_pending_entry_t, bpe_node)); 1809*2a58b312SMartin Matuska mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, 1810*2a58b312SMartin Matuska NULL); 1811*2a58b312SMartin Matuska } 1812*2a58b312SMartin Matuska } 1813*2a58b312SMartin Matuska 1814*2a58b312SMartin Matuska static void 1815*2a58b312SMartin Matuska brt_table_free(brt_t *brt) 1816*2a58b312SMartin Matuska { 1817*2a58b312SMartin Matuska 1818*2a58b312SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) { 1819*2a58b312SMartin Matuska ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); 1820*2a58b312SMartin Matuska 1821*2a58b312SMartin Matuska avl_destroy(&brt->brt_pending_tree[i]); 1822*2a58b312SMartin Matuska mutex_destroy(&brt->brt_pending_lock[i]); 1823*2a58b312SMartin Matuska } 1824*2a58b312SMartin Matuska } 1825*2a58b312SMartin Matuska 1826*2a58b312SMartin Matuska static void 1827*2a58b312SMartin Matuska brt_alloc(spa_t *spa) 1828*2a58b312SMartin Matuska { 1829*2a58b312SMartin Matuska brt_t *brt; 1830*2a58b312SMartin Matuska 1831*2a58b312SMartin Matuska ASSERT(spa->spa_brt == NULL); 
1832*2a58b312SMartin Matuska 1833*2a58b312SMartin Matuska brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); 1834*2a58b312SMartin Matuska rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); 1835*2a58b312SMartin Matuska brt->brt_spa = spa; 1836*2a58b312SMartin Matuska brt->brt_rangesize = 0; 1837*2a58b312SMartin Matuska brt->brt_nentries = 0; 1838*2a58b312SMartin Matuska brt->brt_vdevs = NULL; 1839*2a58b312SMartin Matuska brt->brt_nvdevs = 0; 1840*2a58b312SMartin Matuska brt_table_alloc(brt); 1841*2a58b312SMartin Matuska 1842*2a58b312SMartin Matuska spa->spa_brt = brt; 1843*2a58b312SMartin Matuska } 1844*2a58b312SMartin Matuska 1845*2a58b312SMartin Matuska void 1846*2a58b312SMartin Matuska brt_create(spa_t *spa) 1847*2a58b312SMartin Matuska { 1848*2a58b312SMartin Matuska 1849*2a58b312SMartin Matuska brt_alloc(spa); 1850*2a58b312SMartin Matuska brt_vdevs_alloc(spa->spa_brt, B_FALSE); 1851*2a58b312SMartin Matuska } 1852*2a58b312SMartin Matuska 1853*2a58b312SMartin Matuska int 1854*2a58b312SMartin Matuska brt_load(spa_t *spa) 1855*2a58b312SMartin Matuska { 1856*2a58b312SMartin Matuska 1857*2a58b312SMartin Matuska brt_alloc(spa); 1858*2a58b312SMartin Matuska brt_vdevs_alloc(spa->spa_brt, B_TRUE); 1859*2a58b312SMartin Matuska 1860*2a58b312SMartin Matuska return (0); 1861*2a58b312SMartin Matuska } 1862*2a58b312SMartin Matuska 1863*2a58b312SMartin Matuska void 1864*2a58b312SMartin Matuska brt_unload(spa_t *spa) 1865*2a58b312SMartin Matuska { 1866*2a58b312SMartin Matuska brt_t *brt = spa->spa_brt; 1867*2a58b312SMartin Matuska 1868*2a58b312SMartin Matuska if (brt == NULL) 1869*2a58b312SMartin Matuska return; 1870*2a58b312SMartin Matuska 1871*2a58b312SMartin Matuska brt_vdevs_free(brt); 1872*2a58b312SMartin Matuska brt_table_free(brt); 1873*2a58b312SMartin Matuska rw_destroy(&brt->brt_lock); 1874*2a58b312SMartin Matuska kmem_free(brt, sizeof (*brt)); 1875*2a58b312SMartin Matuska spa->spa_brt = NULL; 1876*2a58b312SMartin Matuska } 1877*2a58b312SMartin Matuska 1878*2a58b312SMartin 
/* Module tunables: zfs_brt_prefetch, and zfs_brt_debug on debug builds. */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif
/* END CSTYLED */