/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second-to-last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so a hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (e.g. by setting something
 *   similar to the D bit), thus we have no hint whether the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for the BRT we only
 *   store a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data
 *   blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately this is not currently (if at all)
 *   possible with Block Cloning, so a block referenced multiple times will be
 *   scrubbed multiple times. The new, sorted scrub should be able to
 *   eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum, or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry size
 * further by maintaining one BRT for each top-level VDEV, so we can then have
 * only the offset and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the block
 * was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson
 * and try not to repeat the same mistake here. How do we do that? We divide
 * each top-level VDEV into 16MB regions. For each region we maintain a
 * counter that is a sum of all the BRT entries that have offsets within the
 * region. This creates an entry counters array of 16bit numbers for each
 * top-level VDEV. The entry counters array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates to keep
 * everything in sync. We can keep the array in memory because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array. If
 * the counter for the whole region is zero, there is no need to look for the
 * BRT entry, as there isn't one for sure. Only if the counter for the region
 * is greater than zero will we do a BRT lookup, and if an entry is found we
 * will decrease the reference counter in the BRT entry and in the entry
 * counters array.
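 *
 * A minimal sketch of that fast path (this is essentially what
 * brt_vdev_lookup() below implements; names abbreviated for illustration):
 *
 *	idx = bre->bre_offset / brt->brt_rangesize;
 *	if (idx >= brtvd->bv_size || brt_vdev_entcount_get(brtvd, idx) == 0)
 *		return (B_FALSE);	(certainly not cloned - free as usual)
 *	return (B_TRUE);		(worth doing the real BRT lookup)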
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks and
 * keep a bitmap of dirty blocks within a transaction group. When we sync the
 * transaction group we can update only the parts of the entry counters array
 * that were modified. Note: Keeping track of the dirty parts of the entry
 * counters array is implemented, but updating only parts of the array on disk
 * is not yet implemented - for now we will update the entire array if there
 * was any change.
 *
 * The implementation tries to be economic: if the BRT is not used, or no
 * longer used, there will be no entries in the MOS and no additional memory
 * used (e.g. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both the DDT and the BRT. When we free one of
 * the references we cannot tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma the BRT cooperates with the DDT - if a
 * given block is being cloned using the BRT and the BP has the D (dedup) bit
 * set, the BRT will look up the DDT entry instead and increase the counter
 * there. No BRT entry will be created for a block which has the D (dedup)
 * bit set. The BRT may be more efficient for manual deduplication, but if the
 * block is already in the DDT, then creating an additional BRT entry would be
 * less efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different datasets.
 * One use case is recovering files from snapshots. By cloning the files into
 * the dataset we need no additional storage. Without Block Cloning we would
 * need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source file).
 * In that case Block Cloning will only be used briefly, because the BRT
 * entries will be removed when the source is removed.
 * Note: currently it is not possible to clone blocks between encrypted
 * datasets, even if those datasets use the same encryption key (this includes
 * snapshots of encrypted datasets). Cloning blocks between datasets that use
 * the same keys should be possible and should be implemented in the future.
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows for
 * block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	                size_t len, unsigned int flags);
 *
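 * A simplified sketch of how a caller might use it (illustrative only; the
 * file descriptors and length below are placeholders and error handling is
 * omitted):
 *
 *	off_t inoff = 0, outoff = 0;
 *	ssize_t done;
 *
 *	done = copy_file_range(srcfd, &inoff, dstfd, &outoff, len, 0);
 *	if (done < 0 && errno == EXDEV)
 *		(fall back to a regular read/write copy)
 *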
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return the EXDEV error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from the
 * source file we call the dmu_brt_clone() function on the destination file.
 * This function allocates BPs for us. We iterate over all source BPs. If the
 * given BP is a hole or an embedded block, we just copy the BP as-is. If it
 * points to real data we place this BP on a BRT pending list using the
 * brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group in which we are trying to clone it. Such a block has no
 *   BP allocated yet, so it cannot be immediately cloned. We return EXDEV.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EXDEV.
 * - A block may be cloned multiple times during one transaction group (that's
 *   why the pending list is actually a tree and not an append-only list - this
 *   way we can figure out faster whether this block is cloned for the first
 *   time in this txg or a subsequent time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file descriptor
 *   open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(), which
 * loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
 * At the beginning of spa_sync(), when there can be no more block cloning but
 * before issuing frees, we call brt_pending_apply(). This function applies
 * all the new clones to the BRT table - we load BRT entries and update
 * reference counters. To sync new BRT entries to disk, we use the brt_sync()
 * function. This function will sync all dirty per-top-level-vdev BRTs,
 * the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from the regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is imported
 * without mounting datasets and the source dataset is destroyed before the
 * destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
 * entries, we will bump reference counters for their BPs in the BRT and then
 * on mount and ZIL replay we will just attach BPs to the file without
 * bumping reference counters.
 * Note it is still possible that after zil_claim() we never mount the
 * destination, so we never replay its ZIL and we destroy it. This way we
 * would end up with leaked references in the BRT. We address that too, as ZFS
 * gives us a chance to clean this up on dataset destroy (see
 * zil_free_clone_range()).
 */

/*
 * BRT - Block Reference Table.
 */
#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"

/*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
 * (1TB / 16MB) * 2B. Each element in this array represents how many BRT
 * entries we have in this chunk of storage. We always load this entire array
 * into memory and update it as needed. By having it in memory we can quickly
 * tell (during zio_free()) if there are any BRT entries that we might need to
 * update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With a 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
#define	BRT_RANGESIZE	(16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
	"BRT_RANGESIZE is too large.");
/*
 * We don't want to update the whole structure every time. Maintain a bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
 * the whole 128MB on disk when we have updated only a single entcount.
 * We maintain a bitmap where each 32kB disk block within the 128MB entcounts
 * array is represented by a single bit. This gives us 4096 bits. A set bit in
 * the bitmap means that we had a change in at least one of the 16384 entcounts
 * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 */
#define	BRT_BLOCKSIZE	(32 * 1024)
#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)

#define	BRT_LITTLE_ENDIAN	0
#define	BRT_BIG_ENDIAN		1
#ifdef _ZFS_LITTLE_ENDIAN
#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
#else
#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
#endif

typedef struct brt_vdev_phys {
	uint64_t	bvp_mos_entries;
	uint64_t	bvp_size;
	uint64_t	bvp_byteorder;
	uint64_t	bvp_totalcount;
	uint64_t	bvp_rangesize;
	uint64_t	bvp_usedspace;
	uint64_t	bvp_savedspace;
} brt_vdev_phys_t;

typedef struct brt_vdev {
	/*
	 * VDEV id.
	 */
	uint64_t	bv_vdevid;
	/*
	 * Is the structure initiated?
	 * (bv_entcount and bv_bitmap are allocated?)
	 */
	boolean_t	bv_initiated;
	/*
	 * Object number in the MOS for the entcount array and brt_vdev_phys.
	 */
	uint64_t	bv_mos_brtvdev;
	/*
	 * Object number in the MOS for the entries table.
	 */
	uint64_t	bv_mos_entries;
	/*
	 * Entries to sync.
	 */
	avl_tree_t	bv_tree;
	/*
	 * Does the bv_entcount[] array need byte swapping?
	 */
	boolean_t	bv_need_byteswap;
	/*
	 * Number of entries in the bv_entcount[] array.
	 */
	uint64_t	bv_size;
	/*
	 * This is the array with BRT entry count per BRT_RANGESIZE.
	 */
	uint16_t	*bv_entcount;
	/*
	 * Sum of all bv_entcount[]s.
	 */
	uint64_t	bv_totalcount;
	/*
	 * Space on disk occupied by cloned blocks (without compression).
	 */
	uint64_t	bv_usedspace;
	/*
	 * How much additional space would be occupied without block cloning.
	 */
	uint64_t	bv_savedspace;
	/*
	 * brt_vdev_phys needs updating on disk.
	 */
	boolean_t	bv_meta_dirty;
	/*
	 * bv_entcount[] needs updating on disk.
	 */
	boolean_t	bv_entcount_dirty;
	/*
	 * bv_entcount[] can potentially be a bit too big to synchronize it all
	 * when we have changed only a few entcounts. The fields below allow us
	 * to track updates to the bv_entcount[] array since the last sync.
	 * A single bit in the bv_bitmap represents as many entcounts as can
	 * fit into a single BRT_BLOCKSIZE.
	 * For example, say we have 65536 entcounts in the bv_entcount array
	 * (so the whole array is 128kB) and we updated bv_entcount[2] and
	 * bv_entcount[5]. In that case only the first bit in bv_bitmap will be
	 * set and we will write only the first BRT_BLOCKSIZE out of 128kB.
	 */
	ulong_t		*bv_bitmap;
	uint64_t	bv_nblocks;
} brt_vdev_t;

/*
 * In-core brt
 */
typedef struct brt {
	krwlock_t	brt_lock;
	spa_t		*brt_spa;
#define	brt_mos		brt_spa->spa_meta_objset
	uint64_t	brt_rangesize;
	uint64_t	brt_usedspace;
	uint64_t	brt_savedspace;
	avl_tree_t	brt_pending_tree[TXG_SIZE];
	kmutex_t	brt_pending_lock[TXG_SIZE];
	/* Sum of all entries across all bv_trees. */
	uint64_t	brt_nentries;
	brt_vdev_t	*brt_vdevs;
	uint64_t	brt_nvdevs;
} brt_t;

/* Size of bre_offset / sizeof (uint64_t). */
#define	BRT_KEY_WORDS	(1)

/*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
typedef struct brt_entry {
	uint64_t	bre_offset;
	uint64_t	bre_refcount;
	avl_node_t	bre_node;
} brt_entry_t;

typedef struct brt_pending_entry {
	blkptr_t	bpe_bp;
	int		bpe_count;
	avl_node_t	bpe_node;
} brt_pending_entry_t;

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t	*brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

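/*
 * Accessors for the in-core bv_entcount[] array. They transparently handle
 * the case where the array was loaded in the non-native byte order and still
 * needs byte swapping (bv_need_byteswap).
 */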
static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("    bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif

static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}

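/*
 * Create the on-disk state for this vdev's BRT: a ZAP object for the BRT
 * entries, a DMU object for the entcount array (with brt_vdev_phys kept in
 * its bonus buffer) and a pool-directory entry pointing at the latter, and
 * bump the block_cloning feature refcount.
 */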
static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

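/*
 * (Re)allocate the in-core bv_entcount[] array and bv_bitmap to match the
 * vdev's current size. If the structure was already initiated (e.g. the vdev
 * has grown), the existing counts and dirty bits are copied over and the old
 * buffers are freed.
 */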
static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
		 *     size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		kmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

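/*
 * Load this vdev's BRT from the MOS: find the BRT VDEV object through the
 * pool directory, read brt_vdev_phys from its bonus buffer, allocate the
 * in-core arrays and read the entcount array from disk. Nothing is done if
 * the pool directory has no entry for this vdev (the BRT was never used).
 */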
static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

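/*
 * Release the in-core state for this vdev's BRT: free the entcount array and
 * the dirty bitmap, destroy the (empty) entries tree and mark the structure
 * as no longer initiated.
 */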
static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

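/*
 * Destroy the on-disk state for this vdev's BRT once it holds no entries:
 * destroy the (verified empty) entries ZAP and the BRT VDEV object, remove
 * the pool-directory entry, release the in-core state and drop the
 * block_cloning feature refcount.
 */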
static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

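/*
 * Grow the brt_vdevs[] array to hold nvdevs elements, copying the existing
 * per-vdev structures over and initializing the new ones as not yet
 * initiated.
 */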
static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}

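/*
 * The cheap pre-check described in "Minimizing free penalty" above: return
 * B_TRUE only if the 16MB region containing bre_offset has a non-zero entry
 * count, i.e. only then is a real BRT lookup worthwhile.
 */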
882*2a58b312SMartin Matuska static boolean_t
883*2a58b312SMartin Matuska brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
884*2a58b312SMartin Matuska {
885*2a58b312SMartin Matuska 	uint64_t idx;
886*2a58b312SMartin Matuska 
887*2a58b312SMartin Matuska 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
888*2a58b312SMartin Matuska 
889*2a58b312SMartin Matuska 	idx = bre->bre_offset / brt->brt_rangesize;
890*2a58b312SMartin Matuska 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
891*2a58b312SMartin Matuska 		/* VDEV wasn't expanded. */
892*2a58b312SMartin Matuska 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
893*2a58b312SMartin Matuska 	}
894*2a58b312SMartin Matuska 
895*2a58b312SMartin Matuska 	return (FALSE);
896*2a58b312SMartin Matuska }
897*2a58b312SMartin Matuska 
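/*
 * Space accounting notes for brt_vdev_addref() and brt_vdev_decref():
 * bv_savedspace (and the pool-wide brt_savedspace) is adjusted on every
 * reference taken or dropped, while bv_usedspace and bv_totalcount only
 * change when a block gains its first clone reference or loses its last one.
 * For illustration: cloning the same block into two destinations adds
 * 2 * dsize to bv_savedspace, but only one dsize to bv_usedspace and a
 * single block to bv_totalcount.
 */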
898*2a58b312SMartin Matuska static void
899*2a58b312SMartin Matuska brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
900*2a58b312SMartin Matuska     uint64_t dsize)
901*2a58b312SMartin Matuska {
902*2a58b312SMartin Matuska 	uint64_t idx;
903*2a58b312SMartin Matuska 
904*2a58b312SMartin Matuska 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
905*2a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
906*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_entcount != NULL);
907*2a58b312SMartin Matuska 
908*2a58b312SMartin Matuska 	brt->brt_savedspace += dsize;
909*2a58b312SMartin Matuska 	brtvd->bv_savedspace += dsize;
910*2a58b312SMartin Matuska 	brtvd->bv_meta_dirty = TRUE;
911*2a58b312SMartin Matuska 
912*2a58b312SMartin Matuska 	if (bre->bre_refcount > 1) {
913*2a58b312SMartin Matuska 		return;
914*2a58b312SMartin Matuska 	}
915*2a58b312SMartin Matuska 
916*2a58b312SMartin Matuska 	brt->brt_usedspace += dsize;
917*2a58b312SMartin Matuska 	brtvd->bv_usedspace += dsize;
918*2a58b312SMartin Matuska 
919*2a58b312SMartin Matuska 	idx = bre->bre_offset / brt->brt_rangesize;
920*2a58b312SMartin Matuska 	if (idx >= brtvd->bv_size) {
921*2a58b312SMartin Matuska 		/* VDEV has been expanded. */
922*2a58b312SMartin Matuska 		brt_vdev_realloc(brt, brtvd);
923*2a58b312SMartin Matuska 	}
924*2a58b312SMartin Matuska 
925*2a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
926*2a58b312SMartin Matuska 
927*2a58b312SMartin Matuska 	brtvd->bv_totalcount++;
928*2a58b312SMartin Matuska 	brt_vdev_entcount_inc(brtvd, idx);
929*2a58b312SMartin Matuska 	brtvd->bv_entcount_dirty = TRUE;
930*2a58b312SMartin Matuska 	idx = idx / BRT_BLOCKSIZE / 8;
931*2a58b312SMartin Matuska 	BT_SET(brtvd->bv_bitmap, idx);
932*2a58b312SMartin Matuska 
933*2a58b312SMartin Matuska #ifdef ZFS_DEBUG
934*2a58b312SMartin Matuska 	brt_vdev_dump(brt);
935*2a58b312SMartin Matuska #endif
936*2a58b312SMartin Matuska }
937*2a58b312SMartin Matuska 
938*2a58b312SMartin Matuska static void
939*2a58b312SMartin Matuska brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
940*2a58b312SMartin Matuska     uint64_t dsize)
941*2a58b312SMartin Matuska {
942*2a58b312SMartin Matuska 	uint64_t idx;
943*2a58b312SMartin Matuska 
944*2a58b312SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
945*2a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
946*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_entcount != NULL);
947*2a58b312SMartin Matuska 
948*2a58b312SMartin Matuska 	brt->brt_savedspace -= dsize;
949*2a58b312SMartin Matuska 	brtvd->bv_savedspace -= dsize;
950*2a58b312SMartin Matuska 	brtvd->bv_meta_dirty = TRUE;
951*2a58b312SMartin Matuska 
952*2a58b312SMartin Matuska 	if (bre->bre_refcount > 0) {
953*2a58b312SMartin Matuska 		return;
954*2a58b312SMartin Matuska 	}
955*2a58b312SMartin Matuska 
956*2a58b312SMartin Matuska 	brt->brt_usedspace -= dsize;
957*2a58b312SMartin Matuska 	brtvd->bv_usedspace -= dsize;
958*2a58b312SMartin Matuska 
959*2a58b312SMartin Matuska 	idx = bre->bre_offset / brt->brt_rangesize;
960*2a58b312SMartin Matuska 	ASSERT3U(idx, <, brtvd->bv_size);
961*2a58b312SMartin Matuska 
962*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_totalcount > 0);
963*2a58b312SMartin Matuska 	brtvd->bv_totalcount--;
964*2a58b312SMartin Matuska 	brt_vdev_entcount_dec(brtvd, idx);
965*2a58b312SMartin Matuska 	brtvd->bv_entcount_dirty = TRUE;
966*2a58b312SMartin Matuska 	idx = idx / BRT_BLOCKSIZE / 8;
967*2a58b312SMartin Matuska 	BT_SET(brtvd->bv_bitmap, idx);
968*2a58b312SMartin Matuska 
969*2a58b312SMartin Matuska #ifdef ZFS_DEBUG
970*2a58b312SMartin Matuska 	brt_vdev_dump(brt);
971*2a58b312SMartin Matuska #endif
972*2a58b312SMartin Matuska }
973*2a58b312SMartin Matuska 
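/*
 * Write out the dirty state of a single BRT VDEV.  If any counters changed,
 * the whole bv_entcount[] array is rewritten into the MOS object (see the
 * TODO below about using bv_bitmap to write only the dirty blocks), and the
 * brt_vdev_phys_t kept in the bonus buffer is updated with the current
 * counters, byte order and range size.
 */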
974*2a58b312SMartin Matuska static void
975*2a58b312SMartin Matuska brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
976*2a58b312SMartin Matuska {
977*2a58b312SMartin Matuska 	dmu_buf_t *db;
978*2a58b312SMartin Matuska 	brt_vdev_phys_t *bvphys;
979*2a58b312SMartin Matuska 
980*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_meta_dirty);
981*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_brtvdev != 0);
982*2a58b312SMartin Matuska 	ASSERT(dmu_tx_is_syncing(tx));
983*2a58b312SMartin Matuska 
984*2a58b312SMartin Matuska 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
985*2a58b312SMartin Matuska 
986*2a58b312SMartin Matuska 	if (brtvd->bv_entcount_dirty) {
987*2a58b312SMartin Matuska 		/*
988*2a58b312SMartin Matuska 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
989*2a58b312SMartin Matuska 		 */
990*2a58b312SMartin Matuska 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
991*2a58b312SMartin Matuska 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
992*2a58b312SMartin Matuska 		    brtvd->bv_entcount, tx);
993*2a58b312SMartin Matuska 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
994*2a58b312SMartin Matuska 		brtvd->bv_entcount_dirty = FALSE;
995*2a58b312SMartin Matuska 	}
996*2a58b312SMartin Matuska 
997*2a58b312SMartin Matuska 	dmu_buf_will_dirty(db, tx);
998*2a58b312SMartin Matuska 	bvphys = db->db_data;
999*2a58b312SMartin Matuska 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
1000*2a58b312SMartin Matuska 	bvphys->bvp_size = brtvd->bv_size;
1001*2a58b312SMartin Matuska 	if (brtvd->bv_need_byteswap) {
1002*2a58b312SMartin Matuska 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
1003*2a58b312SMartin Matuska 	} else {
1004*2a58b312SMartin Matuska 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
1005*2a58b312SMartin Matuska 	}
1006*2a58b312SMartin Matuska 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
1007*2a58b312SMartin Matuska 	bvphys->bvp_rangesize = brt->brt_rangesize;
1008*2a58b312SMartin Matuska 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
1009*2a58b312SMartin Matuska 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
1010*2a58b312SMartin Matuska 	dmu_buf_rele(db, FTAG);
1011*2a58b312SMartin Matuska 
1012*2a58b312SMartin Matuska 	brtvd->bv_meta_dirty = FALSE;
1013*2a58b312SMartin Matuska }
1014*2a58b312SMartin Matuska 
1015*2a58b312SMartin Matuska static void
1016*2a58b312SMartin Matuska brt_vdevs_alloc(brt_t *brt, boolean_t load)
1017*2a58b312SMartin Matuska {
1018*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1019*2a58b312SMartin Matuska 	uint64_t vdevid;
1020*2a58b312SMartin Matuska 
1021*2a58b312SMartin Matuska 	brt_wlock(brt);
1022*2a58b312SMartin Matuska 
1023*2a58b312SMartin Matuska 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
1024*2a58b312SMartin Matuska 
1025*2a58b312SMartin Matuska 	if (load) {
1026*2a58b312SMartin Matuska 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1027*2a58b312SMartin Matuska 			brtvd = &brt->brt_vdevs[vdevid];
1028*2a58b312SMartin Matuska 			ASSERT(brtvd->bv_entcount == NULL);
1029*2a58b312SMartin Matuska 
1030*2a58b312SMartin Matuska 			brt_vdev_load(brt, brtvd);
1031*2a58b312SMartin Matuska 		}
1032*2a58b312SMartin Matuska 	}
1033*2a58b312SMartin Matuska 
1034*2a58b312SMartin Matuska 	if (brt->brt_rangesize == 0) {
1035*2a58b312SMartin Matuska 		brt->brt_rangesize = BRT_RANGESIZE;
1036*2a58b312SMartin Matuska 	}
1037*2a58b312SMartin Matuska 
1038*2a58b312SMartin Matuska 	brt_unlock(brt);
1039*2a58b312SMartin Matuska }
1040*2a58b312SMartin Matuska 
1041*2a58b312SMartin Matuska static void
1042*2a58b312SMartin Matuska brt_vdevs_free(brt_t *brt)
1043*2a58b312SMartin Matuska {
1044*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1045*2a58b312SMartin Matuska 	uint64_t vdevid;
1046*2a58b312SMartin Matuska 
1047*2a58b312SMartin Matuska 	brt_wlock(brt);
1048*2a58b312SMartin Matuska 
1049*2a58b312SMartin Matuska 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1050*2a58b312SMartin Matuska 		brtvd = &brt->brt_vdevs[vdevid];
1051*2a58b312SMartin Matuska 		if (brtvd->bv_initiated)
1052*2a58b312SMartin Matuska 			brt_vdev_dealloc(brt, brtvd);
1053*2a58b312SMartin Matuska 	}
1054*2a58b312SMartin Matuska 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
1055*2a58b312SMartin Matuska 
1056*2a58b312SMartin Matuska 	brt_unlock(brt);
1057*2a58b312SMartin Matuska }
1058*2a58b312SMartin Matuska 
1059*2a58b312SMartin Matuska static void
1060*2a58b312SMartin Matuska brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
1061*2a58b312SMartin Matuska {
1062*2a58b312SMartin Matuska 
1063*2a58b312SMartin Matuska 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
1064*2a58b312SMartin Matuska 	bre->bre_refcount = 0;
1065*2a58b312SMartin Matuska 
1066*2a58b312SMartin Matuska 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
1067*2a58b312SMartin Matuska }
1068*2a58b312SMartin Matuska 
1069*2a58b312SMartin Matuska static int
1070*2a58b312SMartin Matuska brt_entry_compare(const void *x1, const void *x2)
1071*2a58b312SMartin Matuska {
1072*2a58b312SMartin Matuska 	const brt_entry_t *bre1 = x1;
1073*2a58b312SMartin Matuska 	const brt_entry_t *bre2 = x2;
1074*2a58b312SMartin Matuska 
1075*2a58b312SMartin Matuska 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
1076*2a58b312SMartin Matuska }
1077*2a58b312SMartin Matuska 
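/*
 * Look up the on-disk reference count for an entry.  The caller must hold
 * the BRT lock, but since the ZAP lookups may issue reads, the lock is
 * dropped for the duration of the I/O and reacquired as a writer before
 * returning.  Callers therefore cannot rely on brtvd or the bv_tree contents
 * being unchanged across this call (see brt_entry_addref() and
 * brt_entry_decref()).
 */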
1078*2a58b312SMartin Matuska static int
1079*2a58b312SMartin Matuska brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
1080*2a58b312SMartin Matuska {
1081*2a58b312SMartin Matuska 	uint64_t mos_entries;
1082*2a58b312SMartin Matuska 	uint64_t one, physsize;
1083*2a58b312SMartin Matuska 	int error;
1084*2a58b312SMartin Matuska 
1085*2a58b312SMartin Matuska 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1086*2a58b312SMartin Matuska 
1087*2a58b312SMartin Matuska 	if (!brt_vdev_lookup(brt, brtvd, bre))
1088*2a58b312SMartin Matuska 		return (SET_ERROR(ENOENT));
1089*2a58b312SMartin Matuska 
1090*2a58b312SMartin Matuska 	/*
1091*2a58b312SMartin Matuska 	 * Remember the mos_entries object number. After we reacquire the
1092*2a58b312SMartin Matuska 	 * BRT lock, the brtvd pointer may be invalid.
1093*2a58b312SMartin Matuska 	 */
1094*2a58b312SMartin Matuska 	mos_entries = brtvd->bv_mos_entries;
1095*2a58b312SMartin Matuska 	if (mos_entries == 0)
1096*2a58b312SMartin Matuska 		return (SET_ERROR(ENOENT));
1097*2a58b312SMartin Matuska 
1098*2a58b312SMartin Matuska 	brt_unlock(brt);
1099*2a58b312SMartin Matuska 
1100*2a58b312SMartin Matuska 	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
1101*2a58b312SMartin Matuska 	    BRT_KEY_WORDS, &one, &physsize);
1102*2a58b312SMartin Matuska 	if (error == 0) {
1103*2a58b312SMartin Matuska 		ASSERT3U(one, ==, 1);
1104*2a58b312SMartin Matuska 		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
1105*2a58b312SMartin Matuska 
1106*2a58b312SMartin Matuska 		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
1107*2a58b312SMartin Matuska 		    &bre->bre_offset, BRT_KEY_WORDS, 1,
1108*2a58b312SMartin Matuska 		    sizeof (bre->bre_refcount), &bre->bre_refcount);
1109*2a58b312SMartin Matuska 		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
1110*2a58b312SMartin Matuska 		    "count=%llu error=%d", (u_longlong_t)mos_entries,
1111*2a58b312SMartin Matuska 		    (u_longlong_t)brtvd->bv_vdevid,
1112*2a58b312SMartin Matuska 		    (u_longlong_t)bre->bre_offset,
1113*2a58b312SMartin Matuska 		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
1114*2a58b312SMartin Matuska 	}
1115*2a58b312SMartin Matuska 
1116*2a58b312SMartin Matuska 	brt_wlock(brt);
1117*2a58b312SMartin Matuska 
1118*2a58b312SMartin Matuska 	return (error);
1119*2a58b312SMartin Matuska }
1120*2a58b312SMartin Matuska 
1121*2a58b312SMartin Matuska static void
1122*2a58b312SMartin Matuska brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
1123*2a58b312SMartin Matuska {
1124*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1125*2a58b312SMartin Matuska 	uint64_t mos_entries = 0;
1126*2a58b312SMartin Matuska 
1127*2a58b312SMartin Matuska 	brt_rlock(brt);
1128*2a58b312SMartin Matuska 	brtvd = brt_vdev(brt, vdevid);
1129*2a58b312SMartin Matuska 	if (brtvd != NULL)
1130*2a58b312SMartin Matuska 		mos_entries = brtvd->bv_mos_entries;
1131*2a58b312SMartin Matuska 	brt_unlock(brt);
1132*2a58b312SMartin Matuska 
1133*2a58b312SMartin Matuska 	if (mos_entries == 0)
1134*2a58b312SMartin Matuska 		return;
1135*2a58b312SMartin Matuska 
1136*2a58b312SMartin Matuska 	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
1137*2a58b312SMartin Matuska 	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
1138*2a58b312SMartin Matuska 	    (u_longlong_t)bre->bre_offset);
1139*2a58b312SMartin Matuska 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
1140*2a58b312SMartin Matuska 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
1141*2a58b312SMartin Matuska }
1142*2a58b312SMartin Matuska 
1143*2a58b312SMartin Matuska static int
1144*2a58b312SMartin Matuska brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1145*2a58b312SMartin Matuska {
1146*2a58b312SMartin Matuska 	int error;
1147*2a58b312SMartin Matuska 
1148*2a58b312SMartin Matuska 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1149*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_entries != 0);
1150*2a58b312SMartin Matuska 	ASSERT(bre->bre_refcount > 0);
1151*2a58b312SMartin Matuska 
1152*2a58b312SMartin Matuska 	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1153*2a58b312SMartin Matuska 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
1154*2a58b312SMartin Matuska 	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
1155*2a58b312SMartin Matuska 	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
1156*2a58b312SMartin Matuska 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1157*2a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1158*2a58b312SMartin Matuska 	    (u_longlong_t)bre->bre_refcount, error);
1159*2a58b312SMartin Matuska 
1160*2a58b312SMartin Matuska 	return (error);
1161*2a58b312SMartin Matuska }
1162*2a58b312SMartin Matuska 
1163*2a58b312SMartin Matuska static int
1164*2a58b312SMartin Matuska brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1165*2a58b312SMartin Matuska {
1166*2a58b312SMartin Matuska 	int error;
1167*2a58b312SMartin Matuska 
1168*2a58b312SMartin Matuska 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1169*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_entries != 0);
1170*2a58b312SMartin Matuska 	ASSERT0(bre->bre_refcount);
1171*2a58b312SMartin Matuska 
1172*2a58b312SMartin Matuska 	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1173*2a58b312SMartin Matuska 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
1174*2a58b312SMartin Matuska 	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1175*2a58b312SMartin Matuska 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1176*2a58b312SMartin Matuska 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1177*2a58b312SMartin Matuska 	    (u_longlong_t)bre->bre_refcount, error);
1178*2a58b312SMartin Matuska 
1179*2a58b312SMartin Matuska 	return (error);
1180*2a58b312SMartin Matuska }
1181*2a58b312SMartin Matuska 
1182*2a58b312SMartin Matuska /*
1183*2a58b312SMartin Matuska  * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
1184*2a58b312SMartin Matuska  * positive, but it gives us a quick answer as to whether we should look into
1185*2a58b312SMartin Matuska  * the BRT, which may require reads and thus will be more expensive.
1186*2a58b312SMartin Matuska  */
1187*2a58b312SMartin Matuska boolean_t
1188*2a58b312SMartin Matuska brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1189*2a58b312SMartin Matuska {
1190*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1191*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1192*2a58b312SMartin Matuska 	brt_entry_t bre_search;
1193*2a58b312SMartin Matuska 	boolean_t mayexists = FALSE;
1194*2a58b312SMartin Matuska 	uint64_t vdevid;
1195*2a58b312SMartin Matuska 
1196*2a58b312SMartin Matuska 	brt_entry_fill(bp, &bre_search, &vdevid);
1197*2a58b312SMartin Matuska 
1198*2a58b312SMartin Matuska 	brt_rlock(brt);
1199*2a58b312SMartin Matuska 
1200*2a58b312SMartin Matuska 	brtvd = brt_vdev(brt, vdevid);
1201*2a58b312SMartin Matuska 	if (brtvd != NULL && brtvd->bv_initiated) {
1202*2a58b312SMartin Matuska 		if (!avl_is_empty(&brtvd->bv_tree) ||
1203*2a58b312SMartin Matuska 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
1204*2a58b312SMartin Matuska 			mayexists = TRUE;
1205*2a58b312SMartin Matuska 		}
1206*2a58b312SMartin Matuska 	}
1207*2a58b312SMartin Matuska 
1208*2a58b312SMartin Matuska 	brt_unlock(brt);
1209*2a58b312SMartin Matuska 
1210*2a58b312SMartin Matuska 	return (mayexists);
1211*2a58b312SMartin Matuska }
1212*2a58b312SMartin Matuska 
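/*
 * Accessors used for pool-wide space reporting.  brt_get_dspace() and
 * brt_get_saved() both report the space saved by cloning, brt_get_used()
 * reports the space occupied by cloned blocks, and brt_get_ratio() expresses
 * the saving as a percentage.  For illustration: with 1G used and 3G saved
 * the result is (1G + 3G) * 100 / 1G = 400, i.e. a 4.00x ratio.
 */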
1213*2a58b312SMartin Matuska uint64_t
1214*2a58b312SMartin Matuska brt_get_dspace(spa_t *spa)
1215*2a58b312SMartin Matuska {
1216*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1217*2a58b312SMartin Matuska 
1218*2a58b312SMartin Matuska 	if (brt == NULL)
1219*2a58b312SMartin Matuska 		return (0);
1220*2a58b312SMartin Matuska 
1221*2a58b312SMartin Matuska 	return (brt->brt_savedspace);
1222*2a58b312SMartin Matuska }
1223*2a58b312SMartin Matuska 
1224*2a58b312SMartin Matuska uint64_t
1225*2a58b312SMartin Matuska brt_get_used(spa_t *spa)
1226*2a58b312SMartin Matuska {
1227*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1228*2a58b312SMartin Matuska 
1229*2a58b312SMartin Matuska 	if (brt == NULL)
1230*2a58b312SMartin Matuska 		return (0);
1231*2a58b312SMartin Matuska 
1232*2a58b312SMartin Matuska 	return (brt->brt_usedspace);
1233*2a58b312SMartin Matuska }
1234*2a58b312SMartin Matuska 
1235*2a58b312SMartin Matuska uint64_t
1236*2a58b312SMartin Matuska brt_get_saved(spa_t *spa)
1237*2a58b312SMartin Matuska {
1238*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1239*2a58b312SMartin Matuska 
1240*2a58b312SMartin Matuska 	if (brt == NULL)
1241*2a58b312SMartin Matuska 		return (0);
1242*2a58b312SMartin Matuska 
1243*2a58b312SMartin Matuska 	return (brt->brt_savedspace);
1244*2a58b312SMartin Matuska }
1245*2a58b312SMartin Matuska 
1246*2a58b312SMartin Matuska uint64_t
1247*2a58b312SMartin Matuska brt_get_ratio(spa_t *spa)
1248*2a58b312SMartin Matuska {
1249*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1250*2a58b312SMartin Matuska 
1251*2a58b312SMartin Matuska 	if (brt->brt_usedspace == 0)
1252*2a58b312SMartin Matuska 		return (100);
1253*2a58b312SMartin Matuska 
1254*2a58b312SMartin Matuska 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1255*2a58b312SMartin Matuska 	    brt->brt_usedspace);
1256*2a58b312SMartin Matuska }
1257*2a58b312SMartin Matuska 
1258*2a58b312SMartin Matuska static int
1259*2a58b312SMartin Matuska brt_kstats_update(kstat_t *ksp, int rw)
1260*2a58b312SMartin Matuska {
1261*2a58b312SMartin Matuska 	brt_stats_t *bs = ksp->ks_data;
1262*2a58b312SMartin Matuska 
1263*2a58b312SMartin Matuska 	if (rw == KSTAT_WRITE)
1264*2a58b312SMartin Matuska 		return (EACCES);
1265*2a58b312SMartin Matuska 
1266*2a58b312SMartin Matuska 	bs->brt_addref_entry_in_memory.value.ui64 =
1267*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1268*2a58b312SMartin Matuska 	bs->brt_addref_entry_not_on_disk.value.ui64 =
1269*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1270*2a58b312SMartin Matuska 	bs->brt_addref_entry_on_disk.value.ui64 =
1271*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1272*2a58b312SMartin Matuska 	bs->brt_addref_entry_read_lost_race.value.ui64 =
1273*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1274*2a58b312SMartin Matuska 	bs->brt_decref_entry_in_memory.value.ui64 =
1275*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1276*2a58b312SMartin Matuska 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1277*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1278*2a58b312SMartin Matuska 	bs->brt_decref_entry_not_in_memory.value.ui64 =
1279*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1280*2a58b312SMartin Matuska 	bs->brt_decref_entry_not_on_disk.value.ui64 =
1281*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1282*2a58b312SMartin Matuska 	bs->brt_decref_entry_read_lost_race.value.ui64 =
1283*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1284*2a58b312SMartin Matuska 	bs->brt_decref_entry_still_referenced.value.ui64 =
1285*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1286*2a58b312SMartin Matuska 	bs->brt_decref_free_data_later.value.ui64 =
1287*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
1288*2a58b312SMartin Matuska 	bs->brt_decref_free_data_now.value.ui64 =
1289*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
1290*2a58b312SMartin Matuska 	bs->brt_decref_no_entry.value.ui64 =
1291*2a58b312SMartin Matuska 	    wmsum_value(&brt_sums.brt_decref_no_entry);
1292*2a58b312SMartin Matuska 
1293*2a58b312SMartin Matuska 	return (0);
1294*2a58b312SMartin Matuska }
1295*2a58b312SMartin Matuska 
1296*2a58b312SMartin Matuska static void
1297*2a58b312SMartin Matuska brt_stat_init(void)
1298*2a58b312SMartin Matuska {
1299*2a58b312SMartin Matuska 
1300*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1301*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1302*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1303*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1304*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1305*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1306*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1307*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1308*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1309*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1310*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1311*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1312*2a58b312SMartin Matuska 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1313*2a58b312SMartin Matuska 
1314*2a58b312SMartin Matuska 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1315*2a58b312SMartin Matuska 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1316*2a58b312SMartin Matuska 	if (brt_ksp != NULL) {
1317*2a58b312SMartin Matuska 		brt_ksp->ks_data = &brt_stats;
1318*2a58b312SMartin Matuska 		brt_ksp->ks_update = brt_kstats_update;
1319*2a58b312SMartin Matuska 		kstat_install(brt_ksp);
1320*2a58b312SMartin Matuska 	}
1321*2a58b312SMartin Matuska }
1322*2a58b312SMartin Matuska 
1323*2a58b312SMartin Matuska static void
1324*2a58b312SMartin Matuska brt_stat_fini(void)
1325*2a58b312SMartin Matuska {
1326*2a58b312SMartin Matuska 	if (brt_ksp != NULL) {
1327*2a58b312SMartin Matuska 		kstat_delete(brt_ksp);
1328*2a58b312SMartin Matuska 		brt_ksp = NULL;
1329*2a58b312SMartin Matuska 	}
1330*2a58b312SMartin Matuska 
1331*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1332*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1333*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1334*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1335*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1336*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1337*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1338*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1339*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1340*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1341*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1342*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1343*2a58b312SMartin Matuska 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1344*2a58b312SMartin Matuska }
1345*2a58b312SMartin Matuska 
1346*2a58b312SMartin Matuska void
1347*2a58b312SMartin Matuska brt_init(void)
1348*2a58b312SMartin Matuska {
1349*2a58b312SMartin Matuska 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1350*2a58b312SMartin Matuska 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1351*2a58b312SMartin Matuska 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1352*2a58b312SMartin Matuska 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1353*2a58b312SMartin Matuska 
1354*2a58b312SMartin Matuska 	brt_stat_init();
1355*2a58b312SMartin Matuska }
1356*2a58b312SMartin Matuska 
1357*2a58b312SMartin Matuska void
1358*2a58b312SMartin Matuska brt_fini(void)
1359*2a58b312SMartin Matuska {
1360*2a58b312SMartin Matuska 	brt_stat_fini();
1361*2a58b312SMartin Matuska 
1362*2a58b312SMartin Matuska 	kmem_cache_destroy(brt_entry_cache);
1363*2a58b312SMartin Matuska 	kmem_cache_destroy(brt_pending_entry_cache);
1364*2a58b312SMartin Matuska }
1365*2a58b312SMartin Matuska 
1366*2a58b312SMartin Matuska static brt_entry_t *
1367*2a58b312SMartin Matuska brt_entry_alloc(const brt_entry_t *bre_init)
1368*2a58b312SMartin Matuska {
1369*2a58b312SMartin Matuska 	brt_entry_t *bre;
1370*2a58b312SMartin Matuska 
1371*2a58b312SMartin Matuska 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1372*2a58b312SMartin Matuska 	bre->bre_offset = bre_init->bre_offset;
1373*2a58b312SMartin Matuska 	bre->bre_refcount = bre_init->bre_refcount;
1374*2a58b312SMartin Matuska 
1375*2a58b312SMartin Matuska 	return (bre);
1376*2a58b312SMartin Matuska }
1377*2a58b312SMartin Matuska 
1378*2a58b312SMartin Matuska static void
1379*2a58b312SMartin Matuska brt_entry_free(brt_entry_t *bre)
1380*2a58b312SMartin Matuska {
1381*2a58b312SMartin Matuska 
1382*2a58b312SMartin Matuska 	kmem_cache_free(brt_entry_cache, bre);
1383*2a58b312SMartin Matuska }
1384*2a58b312SMartin Matuska 
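/*
 * Take one more clone reference on bp.  The entry is first looked up in the
 * in-memory bv_tree; if it is not there, the current reference count is read
 * from the ZAP by brt_entry_lookup(), which may drop and reacquire the BRT
 * lock, so both brtvd and the tree are re-checked afterwards before the new
 * entry is inserted.  Within this file it is called only from
 * brt_pending_apply(), i.e. in syncing context.
 */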
1385*2a58b312SMartin Matuska static void
1386*2a58b312SMartin Matuska brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1387*2a58b312SMartin Matuska {
1388*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1389*2a58b312SMartin Matuska 	brt_entry_t *bre, *racebre;
1390*2a58b312SMartin Matuska 	brt_entry_t bre_search;
1391*2a58b312SMartin Matuska 	avl_index_t where;
1392*2a58b312SMartin Matuska 	uint64_t vdevid;
1393*2a58b312SMartin Matuska 	int error;
1394*2a58b312SMartin Matuska 
1395*2a58b312SMartin Matuska 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1396*2a58b312SMartin Matuska 
1397*2a58b312SMartin Matuska 	brt_entry_fill(bp, &bre_search, &vdevid);
1398*2a58b312SMartin Matuska 
1399*2a58b312SMartin Matuska 	brt_wlock(brt);
1400*2a58b312SMartin Matuska 
1401*2a58b312SMartin Matuska 	brtvd = brt_vdev(brt, vdevid);
1402*2a58b312SMartin Matuska 	if (brtvd == NULL) {
1403*2a58b312SMartin Matuska 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1404*2a58b312SMartin Matuska 
1405*2a58b312SMartin Matuska 		/* New VDEV was added. */
1406*2a58b312SMartin Matuska 		brt_vdevs_expand(brt, vdevid + 1);
1407*2a58b312SMartin Matuska 		brtvd = brt_vdev(brt, vdevid);
1408*2a58b312SMartin Matuska 	}
1409*2a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
1410*2a58b312SMartin Matuska 	if (!brtvd->bv_initiated)
1411*2a58b312SMartin Matuska 		brt_vdev_realloc(brt, brtvd);
1412*2a58b312SMartin Matuska 
1413*2a58b312SMartin Matuska 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1414*2a58b312SMartin Matuska 	if (bre != NULL) {
1415*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
1416*2a58b312SMartin Matuska 	} else {
1417*2a58b312SMartin Matuska 		/*
1418*2a58b312SMartin Matuska 		 * brt_entry_lookup() may drop the BRT (read) lock and
1419*2a58b312SMartin Matuska 		 * brt_entry_lookup() may drop the BRT lock and reacquire
1420*2a58b312SMartin Matuska 		 * it as a writer.
1421*2a58b312SMartin Matuska 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1422*2a58b312SMartin Matuska 		/* bre_search now contains correct bre_refcount */
1423*2a58b312SMartin Matuska 		ASSERT(error == 0 || error == ENOENT);
1424*2a58b312SMartin Matuska 		if (error == 0)
1425*2a58b312SMartin Matuska 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
1426*2a58b312SMartin Matuska 		else
1427*2a58b312SMartin Matuska 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1428*2a58b312SMartin Matuska 		/*
1429*2a58b312SMartin Matuska 		 * When the BRT lock was dropped, brt_vdevs[] may have been
1430*2a58b312SMartin Matuska 		 * expanded and reallocated, so we need to update brtvd's pointer.
1431*2a58b312SMartin Matuska 		 */
1432*2a58b312SMartin Matuska 		brtvd = brt_vdev(brt, vdevid);
1433*2a58b312SMartin Matuska 		ASSERT(brtvd != NULL);
1434*2a58b312SMartin Matuska 
1435*2a58b312SMartin Matuska 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1436*2a58b312SMartin Matuska 		if (racebre == NULL) {
1437*2a58b312SMartin Matuska 			bre = brt_entry_alloc(&bre_search);
1438*2a58b312SMartin Matuska 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1439*2a58b312SMartin Matuska 			avl_insert(&brtvd->bv_tree, bre, where);
1440*2a58b312SMartin Matuska 			brt->brt_nentries++;
1441*2a58b312SMartin Matuska 		} else {
1442*2a58b312SMartin Matuska 			/*
1443*2a58b312SMartin Matuska 			 * The entry was added when the BRT lock was dropped in
1444*2a58b312SMartin Matuska 			 * brt_entry_lookup().
1445*2a58b312SMartin Matuska 			 */
1446*2a58b312SMartin Matuska 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1447*2a58b312SMartin Matuska 			bre = racebre;
1448*2a58b312SMartin Matuska 		}
1449*2a58b312SMartin Matuska 	}
1450*2a58b312SMartin Matuska 	bre->bre_refcount++;
1451*2a58b312SMartin Matuska 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1452*2a58b312SMartin Matuska 
1453*2a58b312SMartin Matuska 	brt_unlock(brt);
1454*2a58b312SMartin Matuska }
1455*2a58b312SMartin Matuska 
1456*2a58b312SMartin Matuska /* Return TRUE if block should be freed immediately. */
1457*2a58b312SMartin Matuska boolean_t
1458*2a58b312SMartin Matuska brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1459*2a58b312SMartin Matuska {
1460*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1461*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1462*2a58b312SMartin Matuska 	brt_entry_t *bre, *racebre;
1463*2a58b312SMartin Matuska 	brt_entry_t bre_search;
1464*2a58b312SMartin Matuska 	avl_index_t where;
1465*2a58b312SMartin Matuska 	uint64_t vdevid;
1466*2a58b312SMartin Matuska 	int error;
1467*2a58b312SMartin Matuska 
1468*2a58b312SMartin Matuska 	brt_entry_fill(bp, &bre_search, &vdevid);
1469*2a58b312SMartin Matuska 
1470*2a58b312SMartin Matuska 	brt_wlock(brt);
1471*2a58b312SMartin Matuska 
1472*2a58b312SMartin Matuska 	brtvd = brt_vdev(brt, vdevid);
1473*2a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
1474*2a58b312SMartin Matuska 
1475*2a58b312SMartin Matuska 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1476*2a58b312SMartin Matuska 	if (bre != NULL) {
1477*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1478*2a58b312SMartin Matuska 		goto out;
1479*2a58b312SMartin Matuska 	} else {
1480*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1481*2a58b312SMartin Matuska 	}
1482*2a58b312SMartin Matuska 
1483*2a58b312SMartin Matuska 	/*
1484*2a58b312SMartin Matuska 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1485*2a58b312SMartin Matuska 	 */
1486*2a58b312SMartin Matuska 	error = brt_entry_lookup(brt, brtvd, &bre_search);
1487*2a58b312SMartin Matuska 	/* bre_search now contains correct bre_refcount */
1488*2a58b312SMartin Matuska 	ASSERT(error == 0 || error == ENOENT);
1489*2a58b312SMartin Matuska 	/*
1490*2a58b312SMartin Matuska 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1491*2a58b312SMartin Matuska 	 * and reallocated, so we need to update brtvd's pointer.
1492*2a58b312SMartin Matuska 	 */
1493*2a58b312SMartin Matuska 	brtvd = brt_vdev(brt, vdevid);
1494*2a58b312SMartin Matuska 	ASSERT(brtvd != NULL);
1495*2a58b312SMartin Matuska 
1496*2a58b312SMartin Matuska 	if (error == ENOENT) {
1497*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1498*2a58b312SMartin Matuska 		bre = NULL;
1499*2a58b312SMartin Matuska 		goto out;
1500*2a58b312SMartin Matuska 	}
1501*2a58b312SMartin Matuska 
1502*2a58b312SMartin Matuska 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1503*2a58b312SMartin Matuska 	if (racebre != NULL) {
1504*2a58b312SMartin Matuska 		/*
1505*2a58b312SMartin Matuska 		 * The entry was added when the BRT lock was dropped in
1506*2a58b312SMartin Matuska 		 * brt_entry_lookup().
1507*2a58b312SMartin Matuska 		 */
1508*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1509*2a58b312SMartin Matuska 		bre = racebre;
1510*2a58b312SMartin Matuska 		goto out;
1511*2a58b312SMartin Matuska 	}
1512*2a58b312SMartin Matuska 
1513*2a58b312SMartin Matuska 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1514*2a58b312SMartin Matuska 	bre = brt_entry_alloc(&bre_search);
1515*2a58b312SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1516*2a58b312SMartin Matuska 	avl_insert(&brtvd->bv_tree, bre, where);
1517*2a58b312SMartin Matuska 	brt->brt_nentries++;
1518*2a58b312SMartin Matuska 
1519*2a58b312SMartin Matuska out:
1520*2a58b312SMartin Matuska 	if (bre == NULL) {
1521*2a58b312SMartin Matuska 		/*
1522*2a58b312SMartin Matuska 		 * This is a free of a regular (not cloned) block.
1523*2a58b312SMartin Matuska 		 */
1524*2a58b312SMartin Matuska 		brt_unlock(brt);
1525*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_no_entry);
1526*2a58b312SMartin Matuska 		return (B_TRUE);
1527*2a58b312SMartin Matuska 	}
1528*2a58b312SMartin Matuska 	if (bre->bre_refcount == 0) {
1529*2a58b312SMartin Matuska 		brt_unlock(brt);
1530*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_free_data_now);
1531*2a58b312SMartin Matuska 		return (B_TRUE);
1532*2a58b312SMartin Matuska 	}
1533*2a58b312SMartin Matuska 
1534*2a58b312SMartin Matuska 	ASSERT(bre->bre_refcount > 0);
1535*2a58b312SMartin Matuska 	bre->bre_refcount--;
1536*2a58b312SMartin Matuska 	if (bre->bre_refcount == 0)
1537*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_free_data_later);
1538*2a58b312SMartin Matuska 	else
1539*2a58b312SMartin Matuska 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1540*2a58b312SMartin Matuska 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1541*2a58b312SMartin Matuska 
1542*2a58b312SMartin Matuska 	brt_unlock(brt);
1543*2a58b312SMartin Matuska 
1544*2a58b312SMartin Matuska 	return (B_FALSE);
1545*2a58b312SMartin Matuska }
1546*2a58b312SMartin Matuska 
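/*
 * Issue a prefetch for the BRT ZAP entry of the given block pointer, so that
 * the later lookup done in syncing context (see brt_entry_addref()) does not
 * have to wait for reads.  Prefetching can be disabled with the
 * zfs_brt_prefetch tunable.
 */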
1547*2a58b312SMartin Matuska static void
1548*2a58b312SMartin Matuska brt_prefetch(brt_t *brt, const blkptr_t *bp)
1549*2a58b312SMartin Matuska {
1550*2a58b312SMartin Matuska 	brt_entry_t bre;
1551*2a58b312SMartin Matuska 	uint64_t vdevid;
1552*2a58b312SMartin Matuska 
1553*2a58b312SMartin Matuska 	ASSERT(bp != NULL);
1554*2a58b312SMartin Matuska 
1555*2a58b312SMartin Matuska 	if (!zfs_brt_prefetch)
1556*2a58b312SMartin Matuska 		return;
1557*2a58b312SMartin Matuska 
1558*2a58b312SMartin Matuska 	brt_entry_fill(bp, &bre, &vdevid);
1559*2a58b312SMartin Matuska 
1560*2a58b312SMartin Matuska 	brt_entry_prefetch(brt, vdevid, &bre);
1561*2a58b312SMartin Matuska }
1562*2a58b312SMartin Matuska 
1563*2a58b312SMartin Matuska static int
1564*2a58b312SMartin Matuska brt_pending_entry_compare(const void *x1, const void *x2)
1565*2a58b312SMartin Matuska {
1566*2a58b312SMartin Matuska 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1567*2a58b312SMartin Matuska 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1568*2a58b312SMartin Matuska 	int cmp;
1569*2a58b312SMartin Matuska 
1570*2a58b312SMartin Matuska 	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1571*2a58b312SMartin Matuska 	if (cmp == 0) {
1572*2a58b312SMartin Matuska 		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1573*2a58b312SMartin Matuska 		    DVA_GET_VDEV(&bp2->blk_dva[0]));
1574*2a58b312SMartin Matuska 		if (cmp == 0) {
1575*2a58b312SMartin Matuska 			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1576*2a58b312SMartin Matuska 			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
1577*2a58b312SMartin Matuska 		}
1578*2a58b312SMartin Matuska 	}
1579*2a58b312SMartin Matuska 
1580*2a58b312SMartin Matuska 	return (cmp);
1581*2a58b312SMartin Matuska }
1582*2a58b312SMartin Matuska 
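/*
 * Cloning a block only records the intent in a per-txg pending tree in open
 * context; the BRT itself is modified later, in syncing context, when
 * brt_pending_apply() walks the tree for that txg and calls
 * brt_entry_addref() (or ddt_addref() for deduplicated blocks).  A rough
 * sketch of the expected calling pattern (the callers live outside of this
 * file):
 *
 *	brt_pending_add(spa, bp, tx);		open context, per cloned bp
 *	brt_pending_remove(spa, bp, tx);	open context, only if the
 *						pending clone is backed out
 *						in the same txg
 *	brt_pending_apply(spa, txg);		syncing context
 *	brt_sync(spa, txg);			syncing context, writes the
 *						tables out
 */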
1583*2a58b312SMartin Matuska void
1584*2a58b312SMartin Matuska brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1585*2a58b312SMartin Matuska {
1586*2a58b312SMartin Matuska 	brt_t *brt;
1587*2a58b312SMartin Matuska 	avl_tree_t *pending_tree;
1588*2a58b312SMartin Matuska 	kmutex_t *pending_lock;
1589*2a58b312SMartin Matuska 	brt_pending_entry_t *bpe, *newbpe;
1590*2a58b312SMartin Matuska 	avl_index_t where;
1591*2a58b312SMartin Matuska 	uint64_t txg;
1592*2a58b312SMartin Matuska 
1593*2a58b312SMartin Matuska 	brt = spa->spa_brt;
1594*2a58b312SMartin Matuska 	txg = dmu_tx_get_txg(tx);
1595*2a58b312SMartin Matuska 	ASSERT3U(txg, !=, 0);
1596*2a58b312SMartin Matuska 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1597*2a58b312SMartin Matuska 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1598*2a58b312SMartin Matuska 
1599*2a58b312SMartin Matuska 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1600*2a58b312SMartin Matuska 	newbpe->bpe_bp = *bp;
1601*2a58b312SMartin Matuska 	newbpe->bpe_count = 1;
1602*2a58b312SMartin Matuska 
1603*2a58b312SMartin Matuska 	mutex_enter(pending_lock);
1604*2a58b312SMartin Matuska 
1605*2a58b312SMartin Matuska 	bpe = avl_find(pending_tree, newbpe, &where);
1606*2a58b312SMartin Matuska 	if (bpe == NULL) {
1607*2a58b312SMartin Matuska 		avl_insert(pending_tree, newbpe, where);
1608*2a58b312SMartin Matuska 		newbpe = NULL;
1609*2a58b312SMartin Matuska 	} else {
1610*2a58b312SMartin Matuska 		bpe->bpe_count++;
1611*2a58b312SMartin Matuska 	}
1612*2a58b312SMartin Matuska 
1613*2a58b312SMartin Matuska 	mutex_exit(pending_lock);
1614*2a58b312SMartin Matuska 
1615*2a58b312SMartin Matuska 	if (newbpe != NULL) {
1616*2a58b312SMartin Matuska 		ASSERT(bpe != NULL);
1617*2a58b312SMartin Matuska 		ASSERT(bpe != newbpe);
1618*2a58b312SMartin Matuska 		kmem_cache_free(brt_pending_entry_cache, newbpe);
1619*2a58b312SMartin Matuska 	} else {
1620*2a58b312SMartin Matuska 		ASSERT(bpe == NULL);
1621*2a58b312SMartin Matuska 	}
1622*2a58b312SMartin Matuska 
1623*2a58b312SMartin Matuska 	/* Prefetch BRT entry, as we will need it in the syncing context. */
1624*2a58b312SMartin Matuska 	brt_prefetch(brt, bp);
1625*2a58b312SMartin Matuska }
1626*2a58b312SMartin Matuska 
1627*2a58b312SMartin Matuska void
1628*2a58b312SMartin Matuska brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1629*2a58b312SMartin Matuska {
1630*2a58b312SMartin Matuska 	brt_t *brt;
1631*2a58b312SMartin Matuska 	avl_tree_t *pending_tree;
1632*2a58b312SMartin Matuska 	kmutex_t *pending_lock;
1633*2a58b312SMartin Matuska 	brt_pending_entry_t *bpe, bpe_search;
1634*2a58b312SMartin Matuska 	uint64_t txg;
1635*2a58b312SMartin Matuska 
1636*2a58b312SMartin Matuska 	brt = spa->spa_brt;
1637*2a58b312SMartin Matuska 	txg = dmu_tx_get_txg(tx);
1638*2a58b312SMartin Matuska 	ASSERT3U(txg, !=, 0);
1639*2a58b312SMartin Matuska 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1640*2a58b312SMartin Matuska 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1641*2a58b312SMartin Matuska 
1642*2a58b312SMartin Matuska 	bpe_search.bpe_bp = *bp;
1643*2a58b312SMartin Matuska 
1644*2a58b312SMartin Matuska 	mutex_enter(pending_lock);
1645*2a58b312SMartin Matuska 
1646*2a58b312SMartin Matuska 	bpe = avl_find(pending_tree, &bpe_search, NULL);
1647*2a58b312SMartin Matuska 	/* I believe we should always find bpe when this function is called. */
1648*2a58b312SMartin Matuska 	if (bpe != NULL) {
1649*2a58b312SMartin Matuska 		ASSERT(bpe->bpe_count > 0);
1650*2a58b312SMartin Matuska 
1651*2a58b312SMartin Matuska 		bpe->bpe_count--;
1652*2a58b312SMartin Matuska 		if (bpe->bpe_count == 0) {
1653*2a58b312SMartin Matuska 			avl_remove(pending_tree, bpe);
1654*2a58b312SMartin Matuska 			kmem_cache_free(brt_pending_entry_cache, bpe);
1655*2a58b312SMartin Matuska 		}
1656*2a58b312SMartin Matuska 	}
1657*2a58b312SMartin Matuska 
1658*2a58b312SMartin Matuska 	mutex_exit(pending_lock);
1659*2a58b312SMartin Matuska }
1660*2a58b312SMartin Matuska 
1661*2a58b312SMartin Matuska void
1662*2a58b312SMartin Matuska brt_pending_apply(spa_t *spa, uint64_t txg)
1663*2a58b312SMartin Matuska {
1664*2a58b312SMartin Matuska 	brt_t *brt;
1665*2a58b312SMartin Matuska 	brt_pending_entry_t *bpe;
1666*2a58b312SMartin Matuska 	avl_tree_t *pending_tree;
1667*2a58b312SMartin Matuska 	kmutex_t *pending_lock;
1668*2a58b312SMartin Matuska 	void *c;
1669*2a58b312SMartin Matuska 
1670*2a58b312SMartin Matuska 	ASSERT3U(txg, !=, 0);
1671*2a58b312SMartin Matuska 
1672*2a58b312SMartin Matuska 	brt = spa->spa_brt;
1673*2a58b312SMartin Matuska 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1674*2a58b312SMartin Matuska 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1675*2a58b312SMartin Matuska 
1676*2a58b312SMartin Matuska 	mutex_enter(pending_lock);
1677*2a58b312SMartin Matuska 
1678*2a58b312SMartin Matuska 	c = NULL;
1679*2a58b312SMartin Matuska 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1680*2a58b312SMartin Matuska 		boolean_t added_to_ddt;
1681*2a58b312SMartin Matuska 
1682*2a58b312SMartin Matuska 		mutex_exit(pending_lock);
1683*2a58b312SMartin Matuska 
1684*2a58b312SMartin Matuska 		for (int i = 0; i < bpe->bpe_count; i++) {
1685*2a58b312SMartin Matuska 			/*
1686*2a58b312SMartin Matuska 			 * If the block has the DEDUP bit set, it already
1687*2a58b312SMartin Matuska 			 * exists in the DDT, so we can take a reference
1688*2a58b312SMartin Matuska 			 * there instead of creating a new entry in the
1689*2a58b312SMartin Matuska 			 * BRT table.
1690*2a58b312SMartin Matuska 			 */
1691*2a58b312SMartin Matuska 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1692*2a58b312SMartin Matuska 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1693*2a58b312SMartin Matuska 			} else {
1694*2a58b312SMartin Matuska 				added_to_ddt = B_FALSE;
1695*2a58b312SMartin Matuska 			}
1696*2a58b312SMartin Matuska 			if (!added_to_ddt)
1697*2a58b312SMartin Matuska 				brt_entry_addref(brt, &bpe->bpe_bp);
1698*2a58b312SMartin Matuska 		}
1699*2a58b312SMartin Matuska 
1700*2a58b312SMartin Matuska 		kmem_cache_free(brt_pending_entry_cache, bpe);
1701*2a58b312SMartin Matuska 		mutex_enter(pending_lock);
1702*2a58b312SMartin Matuska 	}
1703*2a58b312SMartin Matuska 
1704*2a58b312SMartin Matuska 	mutex_exit(pending_lock);
1705*2a58b312SMartin Matuska }
1706*2a58b312SMartin Matuska 
1707*2a58b312SMartin Matuska static void
1708*2a58b312SMartin Matuska brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1709*2a58b312SMartin Matuska {
1710*2a58b312SMartin Matuska 
1711*2a58b312SMartin Matuska 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1712*2a58b312SMartin Matuska 	ASSERT(brtvd->bv_mos_entries != 0);
1713*2a58b312SMartin Matuska 
1714*2a58b312SMartin Matuska 	if (bre->bre_refcount == 0) {
1715*2a58b312SMartin Matuska 		int error;
1716*2a58b312SMartin Matuska 
1717*2a58b312SMartin Matuska 		error = brt_entry_remove(brt, brtvd, bre, tx);
1718*2a58b312SMartin Matuska 		ASSERT(error == 0 || error == ENOENT);
1719*2a58b312SMartin Matuska 		/*
1720*2a58b312SMartin Matuska 		 * If error == ENOENT then zfs_clone_range() was done from a
1721*2a58b312SMartin Matuska 		 * removed (but still open) file, i.e. open() followed by unlink().
1722*2a58b312SMartin Matuska 		 */
1723*2a58b312SMartin Matuska 		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1724*2a58b312SMartin Matuska 	} else {
1725*2a58b312SMartin Matuska 		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1726*2a58b312SMartin Matuska 	}
1727*2a58b312SMartin Matuska }
1728*2a58b312SMartin Matuska 
1729*2a58b312SMartin Matuska static void
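/*
 * Push every in-memory entry of every dirty BRT VDEV out to the MOS: the
 * per-VDEV MOS objects are created on first use, each entry is either
 * updated in or removed from the entries ZAP depending on its reference
 * count, the per-VDEV header and entcount array are written by
 * brt_vdev_sync(), and the MOS objects are destroyed again once the VDEV
 * holds no BRT entries.
 */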
1730*2a58b312SMartin Matuska brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1731*2a58b312SMartin Matuska {
1732*2a58b312SMartin Matuska 	brt_vdev_t *brtvd;
1733*2a58b312SMartin Matuska 	brt_entry_t *bre;
1734*2a58b312SMartin Matuska 	uint64_t vdevid;
1735*2a58b312SMartin Matuska 	void *c;
1736*2a58b312SMartin Matuska 
1737*2a58b312SMartin Matuska 	brt_wlock(brt);
1738*2a58b312SMartin Matuska 
1739*2a58b312SMartin Matuska 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1740*2a58b312SMartin Matuska 		brtvd = &brt->brt_vdevs[vdevid];
1741*2a58b312SMartin Matuska 
1742*2a58b312SMartin Matuska 		if (!brtvd->bv_initiated)
1743*2a58b312SMartin Matuska 			continue;
1744*2a58b312SMartin Matuska 
1745*2a58b312SMartin Matuska 		if (!brtvd->bv_meta_dirty) {
1746*2a58b312SMartin Matuska 			ASSERT(!brtvd->bv_entcount_dirty);
1747*2a58b312SMartin Matuska 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1748*2a58b312SMartin Matuska 			continue;
1749*2a58b312SMartin Matuska 		}
1750*2a58b312SMartin Matuska 
1751*2a58b312SMartin Matuska 		ASSERT(!brtvd->bv_entcount_dirty ||
1752*2a58b312SMartin Matuska 		    avl_numnodes(&brtvd->bv_tree) != 0);
1753*2a58b312SMartin Matuska 
1754*2a58b312SMartin Matuska 		if (brtvd->bv_mos_brtvdev == 0)
1755*2a58b312SMartin Matuska 			brt_vdev_create(brt, brtvd, tx);
1756*2a58b312SMartin Matuska 
1757*2a58b312SMartin Matuska 		c = NULL;
1758*2a58b312SMartin Matuska 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1759*2a58b312SMartin Matuska 			brt_sync_entry(brt, brtvd, bre, tx);
1760*2a58b312SMartin Matuska 			brt_entry_free(bre);
1761*2a58b312SMartin Matuska 			ASSERT(brt->brt_nentries > 0);
1762*2a58b312SMartin Matuska 			brt->brt_nentries--;
1763*2a58b312SMartin Matuska 		}
1764*2a58b312SMartin Matuska 
1765*2a58b312SMartin Matuska 		brt_vdev_sync(brt, brtvd, tx);
1766*2a58b312SMartin Matuska 
1767*2a58b312SMartin Matuska 		if (brtvd->bv_totalcount == 0)
1768*2a58b312SMartin Matuska 			brt_vdev_destroy(brt, brtvd, tx);
1769*2a58b312SMartin Matuska 	}
1770*2a58b312SMartin Matuska 
1771*2a58b312SMartin Matuska 	ASSERT0(brt->brt_nentries);
1772*2a58b312SMartin Matuska 
1773*2a58b312SMartin Matuska 	brt_unlock(brt);
1774*2a58b312SMartin Matuska }
1775*2a58b312SMartin Matuska 
1776*2a58b312SMartin Matuska void
1777*2a58b312SMartin Matuska brt_sync(spa_t *spa, uint64_t txg)
1778*2a58b312SMartin Matuska {
1779*2a58b312SMartin Matuska 	dmu_tx_t *tx;
1780*2a58b312SMartin Matuska 	brt_t *brt;
1781*2a58b312SMartin Matuska 
1782*2a58b312SMartin Matuska 	ASSERT(spa_syncing_txg(spa) == txg);
1783*2a58b312SMartin Matuska 
1784*2a58b312SMartin Matuska 	brt = spa->spa_brt;
1785*2a58b312SMartin Matuska 	brt_rlock(brt);
1786*2a58b312SMartin Matuska 	if (brt->brt_nentries == 0) {
1787*2a58b312SMartin Matuska 		/* No changes. */
1788*2a58b312SMartin Matuska 		brt_unlock(brt);
1789*2a58b312SMartin Matuska 		return;
1790*2a58b312SMartin Matuska 	}
1791*2a58b312SMartin Matuska 	brt_unlock(brt);
1792*2a58b312SMartin Matuska 
1793*2a58b312SMartin Matuska 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1794*2a58b312SMartin Matuska 
1795*2a58b312SMartin Matuska 	brt_sync_table(brt, tx);
1796*2a58b312SMartin Matuska 
1797*2a58b312SMartin Matuska 	dmu_tx_commit(tx);
1798*2a58b312SMartin Matuska }
1799*2a58b312SMartin Matuska 
1800*2a58b312SMartin Matuska static void
1801*2a58b312SMartin Matuska brt_table_alloc(brt_t *brt)
1802*2a58b312SMartin Matuska {
1803*2a58b312SMartin Matuska 
1804*2a58b312SMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++) {
1805*2a58b312SMartin Matuska 		avl_create(&brt->brt_pending_tree[i],
1806*2a58b312SMartin Matuska 		    brt_pending_entry_compare,
1807*2a58b312SMartin Matuska 		    sizeof (brt_pending_entry_t),
1808*2a58b312SMartin Matuska 		    offsetof(brt_pending_entry_t, bpe_node));
1809*2a58b312SMartin Matuska 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1810*2a58b312SMartin Matuska 		    NULL);
1811*2a58b312SMartin Matuska 	}
1812*2a58b312SMartin Matuska }
1813*2a58b312SMartin Matuska 
1814*2a58b312SMartin Matuska static void
1815*2a58b312SMartin Matuska brt_table_free(brt_t *brt)
1816*2a58b312SMartin Matuska {
1817*2a58b312SMartin Matuska 
1818*2a58b312SMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++) {
1819*2a58b312SMartin Matuska 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1820*2a58b312SMartin Matuska 
1821*2a58b312SMartin Matuska 		avl_destroy(&brt->brt_pending_tree[i]);
1822*2a58b312SMartin Matuska 		mutex_destroy(&brt->brt_pending_lock[i]);
1823*2a58b312SMartin Matuska 	}
1824*2a58b312SMartin Matuska }
1825*2a58b312SMartin Matuska 
1826*2a58b312SMartin Matuska static void
1827*2a58b312SMartin Matuska brt_alloc(spa_t *spa)
1828*2a58b312SMartin Matuska {
1829*2a58b312SMartin Matuska 	brt_t *brt;
1830*2a58b312SMartin Matuska 
1831*2a58b312SMartin Matuska 	ASSERT(spa->spa_brt == NULL);
1832*2a58b312SMartin Matuska 
1833*2a58b312SMartin Matuska 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1834*2a58b312SMartin Matuska 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1835*2a58b312SMartin Matuska 	brt->brt_spa = spa;
1836*2a58b312SMartin Matuska 	brt->brt_rangesize = 0;
1837*2a58b312SMartin Matuska 	brt->brt_nentries = 0;
1838*2a58b312SMartin Matuska 	brt->brt_vdevs = NULL;
1839*2a58b312SMartin Matuska 	brt->brt_nvdevs = 0;
1840*2a58b312SMartin Matuska 	brt_table_alloc(brt);
1841*2a58b312SMartin Matuska 
1842*2a58b312SMartin Matuska 	spa->spa_brt = brt;
1843*2a58b312SMartin Matuska }
1844*2a58b312SMartin Matuska 
1845*2a58b312SMartin Matuska void
1846*2a58b312SMartin Matuska brt_create(spa_t *spa)
1847*2a58b312SMartin Matuska {
1848*2a58b312SMartin Matuska 
1849*2a58b312SMartin Matuska 	brt_alloc(spa);
1850*2a58b312SMartin Matuska 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1851*2a58b312SMartin Matuska }
1852*2a58b312SMartin Matuska 
1853*2a58b312SMartin Matuska int
1854*2a58b312SMartin Matuska brt_load(spa_t *spa)
1855*2a58b312SMartin Matuska {
1856*2a58b312SMartin Matuska 
1857*2a58b312SMartin Matuska 	brt_alloc(spa);
1858*2a58b312SMartin Matuska 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1859*2a58b312SMartin Matuska 
1860*2a58b312SMartin Matuska 	return (0);
1861*2a58b312SMartin Matuska }
1862*2a58b312SMartin Matuska 
1863*2a58b312SMartin Matuska void
1864*2a58b312SMartin Matuska brt_unload(spa_t *spa)
1865*2a58b312SMartin Matuska {
1866*2a58b312SMartin Matuska 	brt_t *brt = spa->spa_brt;
1867*2a58b312SMartin Matuska 
1868*2a58b312SMartin Matuska 	if (brt == NULL)
1869*2a58b312SMartin Matuska 		return;
1870*2a58b312SMartin Matuska 
1871*2a58b312SMartin Matuska 	brt_vdevs_free(brt);
1872*2a58b312SMartin Matuska 	brt_table_free(brt);
1873*2a58b312SMartin Matuska 	rw_destroy(&brt->brt_lock);
1874*2a58b312SMartin Matuska 	kmem_free(brt, sizeof (*brt));
1875*2a58b312SMartin Matuska 	spa->spa_brt = NULL;
1876*2a58b312SMartin Matuska }
1877*2a58b312SMartin Matuska 
1878*2a58b312SMartin Matuska /* BEGIN CSTYLED */
1879*2a58b312SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1880*2a58b312SMartin Matuska     "Enable prefetching of BRT entries");
1881*2a58b312SMartin Matuska #ifdef ZFS_BRT_DEBUG
1882*2a58b312SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1883*2a58b312SMartin Matuska #endif
1884*2a58b312SMartin Matuska /* END CSTYLED */
1885