xref: /freebsd-src/sys/contrib/openzfs/module/zfs/ddt_log.c (revision e2df9bb44109577475aeb186e7186ac040f9bde1)
1*e2df9bb4SMartin Matuska /*
2*e2df9bb4SMartin Matuska  * CDDL HEADER START
3*e2df9bb4SMartin Matuska  *
4*e2df9bb4SMartin Matuska  * The contents of this file are subject to the terms of the
5*e2df9bb4SMartin Matuska  * Common Development and Distribution License (the "License").
6*e2df9bb4SMartin Matuska  * You may not use this file except in compliance with the License.
7*e2df9bb4SMartin Matuska  *
8*e2df9bb4SMartin Matuska  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*e2df9bb4SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10*e2df9bb4SMartin Matuska  * See the License for the specific language governing permissions
11*e2df9bb4SMartin Matuska  * and limitations under the License.
12*e2df9bb4SMartin Matuska  *
13*e2df9bb4SMartin Matuska  * When distributing Covered Code, include this CDDL HEADER in each
14*e2df9bb4SMartin Matuska  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*e2df9bb4SMartin Matuska  * If applicable, add the following below this CDDL HEADER, with the
16*e2df9bb4SMartin Matuska  * fields enclosed by brackets "[]" replaced with your own identifying
17*e2df9bb4SMartin Matuska  * information: Portions Copyright [yyyy] [name of copyright owner]
18*e2df9bb4SMartin Matuska  *
19*e2df9bb4SMartin Matuska  * CDDL HEADER END
20*e2df9bb4SMartin Matuska  */
21*e2df9bb4SMartin Matuska 
22*e2df9bb4SMartin Matuska /*
23*e2df9bb4SMartin Matuska  * Copyright (c) 2023, Klara Inc.
24*e2df9bb4SMartin Matuska  */
25*e2df9bb4SMartin Matuska 
26*e2df9bb4SMartin Matuska #include <sys/zfs_context.h>
27*e2df9bb4SMartin Matuska #include <sys/spa.h>
28*e2df9bb4SMartin Matuska #include <sys/ddt.h>
29*e2df9bb4SMartin Matuska #include <sys/dmu_tx.h>
30*e2df9bb4SMartin Matuska #include <sys/dmu.h>
31*e2df9bb4SMartin Matuska #include <sys/ddt_impl.h>
32*e2df9bb4SMartin Matuska #include <sys/dnode.h>
33*e2df9bb4SMartin Matuska #include <sys/dbuf.h>
34*e2df9bb4SMartin Matuska #include <sys/zap.h>
35*e2df9bb4SMartin Matuska #include <sys/zio_checksum.h>
36*e2df9bb4SMartin Matuska 
37*e2df9bb4SMartin Matuska /*
38*e2df9bb4SMartin Matuska  * No more than this many txgs before swapping logs.
39*e2df9bb4SMartin Matuska  */
40*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_txg_max = 8;
41*e2df9bb4SMartin Matuska 
42*e2df9bb4SMartin Matuska /*
43*e2df9bb4SMartin Matuska  * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
44*e2df9bb4SMartin Matuska  * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
45*e2df9bb4SMartin Matuska  */
46*e2df9bb4SMartin Matuska uint64_t zfs_dedup_log_mem_max = 0;
47*e2df9bb4SMartin Matuska uint_t zfs_dedup_log_mem_max_percent = 1;
48*e2df9bb4SMartin Matuska 
49*e2df9bb4SMartin Matuska 
50*e2df9bb4SMartin Matuska static kmem_cache_t *ddt_log_entry_flat_cache;
51*e2df9bb4SMartin Matuska static kmem_cache_t *ddt_log_entry_trad_cache;
52*e2df9bb4SMartin Matuska 
53*e2df9bb4SMartin Matuska #define	DDT_LOG_ENTRY_FLAT_SIZE	\
54*e2df9bb4SMartin Matuska 	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
55*e2df9bb4SMartin Matuska #define	DDT_LOG_ENTRY_TRAD_SIZE	\
56*e2df9bb4SMartin Matuska 	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
57*e2df9bb4SMartin Matuska 
58*e2df9bb4SMartin Matuska #define	DDT_LOG_ENTRY_SIZE(ddt)	\
59*e2df9bb4SMartin Matuska 	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
60*e2df9bb4SMartin Matuska 
61*e2df9bb4SMartin Matuska void
62*e2df9bb4SMartin Matuska ddt_log_init(void)
63*e2df9bb4SMartin Matuska {
64*e2df9bb4SMartin Matuska 	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
65*e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
66*e2df9bb4SMartin Matuska 	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
67*e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
68*e2df9bb4SMartin Matuska 
69*e2df9bb4SMartin Matuska 	/*
70*e2df9bb4SMartin Matuska 	 * Max memory for log AVL entries. At least 1M, because we need
71*e2df9bb4SMartin Matuska 	 * something (that's ~3800 entries per tree). They can say 100% if they
72*e2df9bb4SMartin Matuska 	 * want; it just means they're at the mercy of the the txg flush limit.
73*e2df9bb4SMartin Matuska 	 */
74*e2df9bb4SMartin Matuska 	if (zfs_dedup_log_mem_max == 0) {
75*e2df9bb4SMartin Matuska 		zfs_dedup_log_mem_max_percent =
76*e2df9bb4SMartin Matuska 		    MIN(zfs_dedup_log_mem_max_percent, 100);
77*e2df9bb4SMartin Matuska 		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
78*e2df9bb4SMartin Matuska 		    zfs_dedup_log_mem_max_percent / 100;
79*e2df9bb4SMartin Matuska 	}
80*e2df9bb4SMartin Matuska 	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
81*e2df9bb4SMartin Matuska }
82*e2df9bb4SMartin Matuska 
83*e2df9bb4SMartin Matuska void
84*e2df9bb4SMartin Matuska ddt_log_fini(void)
85*e2df9bb4SMartin Matuska {
86*e2df9bb4SMartin Matuska 	kmem_cache_destroy(ddt_log_entry_trad_cache);
87*e2df9bb4SMartin Matuska 	kmem_cache_destroy(ddt_log_entry_flat_cache);
88*e2df9bb4SMartin Matuska }
89*e2df9bb4SMartin Matuska 
90*e2df9bb4SMartin Matuska static void
91*e2df9bb4SMartin Matuska ddt_log_name(ddt_t *ddt, char *name, uint_t n)
92*e2df9bb4SMartin Matuska {
93*e2df9bb4SMartin Matuska 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
94*e2df9bb4SMartin Matuska 	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
95*e2df9bb4SMartin Matuska }
96*e2df9bb4SMartin Matuska 
97*e2df9bb4SMartin Matuska static void
98*e2df9bb4SMartin Matuska ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
99*e2df9bb4SMartin Matuska {
100*e2df9bb4SMartin Matuska 	dmu_buf_t *db;
101*e2df9bb4SMartin Matuska 	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
102*e2df9bb4SMartin Matuska 	dmu_buf_will_dirty(db, tx);
103*e2df9bb4SMartin Matuska 
104*e2df9bb4SMartin Matuska 	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
105*e2df9bb4SMartin Matuska 	DLH_SET_VERSION(hdr, 1);
106*e2df9bb4SMartin Matuska 	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
107*e2df9bb4SMartin Matuska 	hdr->dlh_length = ddl->ddl_length;
108*e2df9bb4SMartin Matuska 	hdr->dlh_first_txg = ddl->ddl_first_txg;
109*e2df9bb4SMartin Matuska 	hdr->dlh_checkpoint = ddl->ddl_checkpoint;
110*e2df9bb4SMartin Matuska 
111*e2df9bb4SMartin Matuska 	dmu_buf_rele(db, FTAG);
112*e2df9bb4SMartin Matuska }
113*e2df9bb4SMartin Matuska 
114*e2df9bb4SMartin Matuska static void
115*e2df9bb4SMartin Matuska ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
116*e2df9bb4SMartin Matuska {
117*e2df9bb4SMartin Matuska 	ASSERT3U(ddt->ddt_dir_object, >, 0);
118*e2df9bb4SMartin Matuska 	ASSERT3U(ddl->ddl_object, ==, 0);
119*e2df9bb4SMartin Matuska 
120*e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
121*e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
122*e2df9bb4SMartin Matuska 
123*e2df9bb4SMartin Matuska 	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
124*e2df9bb4SMartin Matuska 	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
125*e2df9bb4SMartin Matuska 	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
126*e2df9bb4SMartin Matuska 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
127*e2df9bb4SMartin Matuska 	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
128*e2df9bb4SMartin Matuska 	ddl->ddl_length = 0;
129*e2df9bb4SMartin Matuska 	ddl->ddl_first_txg = tx->tx_txg;
130*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
131*e2df9bb4SMartin Matuska }
132*e2df9bb4SMartin Matuska 
133*e2df9bb4SMartin Matuska static void
134*e2df9bb4SMartin Matuska ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
135*e2df9bb4SMartin Matuska {
136*e2df9bb4SMartin Matuska 	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
137*e2df9bb4SMartin Matuska 	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
138*e2df9bb4SMartin Matuska }
139*e2df9bb4SMartin Matuska 
140*e2df9bb4SMartin Matuska static void
141*e2df9bb4SMartin Matuska ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
142*e2df9bb4SMartin Matuska {
143*e2df9bb4SMartin Matuska 	ASSERT3U(ddt->ddt_dir_object, >, 0);
144*e2df9bb4SMartin Matuska 
145*e2df9bb4SMartin Matuska 	if (ddl->ddl_object == 0)
146*e2df9bb4SMartin Matuska 		return;
147*e2df9bb4SMartin Matuska 
148*e2df9bb4SMartin Matuska 	ASSERT0(ddl->ddl_length);
149*e2df9bb4SMartin Matuska 
150*e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
151*e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
152*e2df9bb4SMartin Matuska 
153*e2df9bb4SMartin Matuska 	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
154*e2df9bb4SMartin Matuska 	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
155*e2df9bb4SMartin Matuska 
156*e2df9bb4SMartin Matuska 	ddl->ddl_object = 0;
157*e2df9bb4SMartin Matuska }
158*e2df9bb4SMartin Matuska 
159*e2df9bb4SMartin Matuska void
160*e2df9bb4SMartin Matuska ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
161*e2df9bb4SMartin Matuska {
162*e2df9bb4SMartin Matuska 	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
163*e2df9bb4SMartin Matuska 	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
164*e2df9bb4SMartin Matuska }
165*e2df9bb4SMartin Matuska 
166*e2df9bb4SMartin Matuska static void
167*e2df9bb4SMartin Matuska ddt_log_update_stats(ddt_t *ddt)
168*e2df9bb4SMartin Matuska {
169*e2df9bb4SMartin Matuska 	/*
170*e2df9bb4SMartin Matuska 	 * Log object stats. We count the number of live entries in the log
171*e2df9bb4SMartin Matuska 	 * tree, even if there are more than on disk, and even if the same
172*e2df9bb4SMartin Matuska 	 * entry is on both append and flush trees, because that's more what
173*e2df9bb4SMartin Matuska 	 * the user expects to see. This does mean the on-disk size is not
174*e2df9bb4SMartin Matuska 	 * really correlated with the number of entries, but I don't think
175*e2df9bb4SMartin Matuska 	 * that's reasonable to expect anyway.
176*e2df9bb4SMartin Matuska 	 */
177*e2df9bb4SMartin Matuska 	dmu_object_info_t doi;
178*e2df9bb4SMartin Matuska 	uint64_t nblocks;
179*e2df9bb4SMartin Matuska 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
180*e2df9bb4SMartin Matuska 	nblocks = doi.doi_physical_blocks_512;
181*e2df9bb4SMartin Matuska 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
182*e2df9bb4SMartin Matuska 	nblocks += doi.doi_physical_blocks_512;
183*e2df9bb4SMartin Matuska 
184*e2df9bb4SMartin Matuska 	ddt_object_t *ddo = &ddt->ddt_log_stats;
185*e2df9bb4SMartin Matuska 	ddo->ddo_count =
186*e2df9bb4SMartin Matuska 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
187*e2df9bb4SMartin Matuska 	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
188*e2df9bb4SMartin Matuska 	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
189*e2df9bb4SMartin Matuska 	ddo->ddo_dspace = nblocks << 9;
190*e2df9bb4SMartin Matuska }
191*e2df9bb4SMartin Matuska 
192*e2df9bb4SMartin Matuska void
193*e2df9bb4SMartin Matuska ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
194*e2df9bb4SMartin Matuska {
195*e2df9bb4SMartin Matuska 	ASSERT3U(nentries, >, 0);
196*e2df9bb4SMartin Matuska 	ASSERT3P(dlu->dlu_dbp, ==, NULL);
197*e2df9bb4SMartin Matuska 
198*e2df9bb4SMartin Matuska 	if (ddt->ddt_log_active->ddl_object == 0)
199*e2df9bb4SMartin Matuska 		ddt_log_create(ddt, tx);
200*e2df9bb4SMartin Matuska 
201*e2df9bb4SMartin Matuska 	/*
202*e2df9bb4SMartin Matuska 	 * We want to store as many entries as we can in a block, but never
203*e2df9bb4SMartin Matuska 	 * split an entry across block boundaries.
204*e2df9bb4SMartin Matuska 	 */
205*e2df9bb4SMartin Matuska 	size_t reclen = P2ALIGN_TYPED(
206*e2df9bb4SMartin Matuska 	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
207*e2df9bb4SMartin Matuska 	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
208*e2df9bb4SMartin Matuska 	ASSERT3U(reclen, <=, UINT16_MAX);
209*e2df9bb4SMartin Matuska 	dlu->dlu_reclen = reclen;
210*e2df9bb4SMartin Matuska 
211*e2df9bb4SMartin Matuska 	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
212*e2df9bb4SMartin Matuska 	    &dlu->dlu_dn));
213*e2df9bb4SMartin Matuska 	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
214*e2df9bb4SMartin Matuska 
215*e2df9bb4SMartin Matuska 	uint64_t nblocks = howmany(nentries,
216*e2df9bb4SMartin Matuska 	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
217*e2df9bb4SMartin Matuska 	uint64_t offset = ddt->ddt_log_active->ddl_length;
218*e2df9bb4SMartin Matuska 	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
219*e2df9bb4SMartin Matuska 
220*e2df9bb4SMartin Matuska 	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
221*e2df9bb4SMartin Matuska 	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
222*e2df9bb4SMartin Matuska 	    DMU_READ_NO_PREFETCH));
223*e2df9bb4SMartin Matuska 
224*e2df9bb4SMartin Matuska 	dlu->dlu_tx = tx;
225*e2df9bb4SMartin Matuska 	dlu->dlu_block = dlu->dlu_offset = 0;
226*e2df9bb4SMartin Matuska }
227*e2df9bb4SMartin Matuska 
228*e2df9bb4SMartin Matuska static ddt_log_entry_t *
229*e2df9bb4SMartin Matuska ddt_log_alloc_entry(ddt_t *ddt)
230*e2df9bb4SMartin Matuska {
231*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle;
232*e2df9bb4SMartin Matuska 
233*e2df9bb4SMartin Matuska 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
234*e2df9bb4SMartin Matuska 		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
235*e2df9bb4SMartin Matuska 		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
236*e2df9bb4SMartin Matuska 	} else {
237*e2df9bb4SMartin Matuska 		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
238*e2df9bb4SMartin Matuska 		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
239*e2df9bb4SMartin Matuska 	}
240*e2df9bb4SMartin Matuska 
241*e2df9bb4SMartin Matuska 	return (ddle);
242*e2df9bb4SMartin Matuska }
243*e2df9bb4SMartin Matuska 
244*e2df9bb4SMartin Matuska static void
245*e2df9bb4SMartin Matuska ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
246*e2df9bb4SMartin Matuska {
247*e2df9bb4SMartin Matuska 	/* Create the log tree entry from a live or stored entry */
248*e2df9bb4SMartin Matuska 	avl_index_t where;
249*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle =
250*e2df9bb4SMartin Matuska 	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
251*e2df9bb4SMartin Matuska 	if (ddle == NULL) {
252*e2df9bb4SMartin Matuska 		ddle = ddt_log_alloc_entry(ddt);
253*e2df9bb4SMartin Matuska 		ddle->ddle_key = ddlwe->ddlwe_key;
254*e2df9bb4SMartin Matuska 		avl_insert(&ddl->ddl_tree, ddle, where);
255*e2df9bb4SMartin Matuska 	}
256*e2df9bb4SMartin Matuska 	ddle->ddle_type = ddlwe->ddlwe_type;
257*e2df9bb4SMartin Matuska 	ddle->ddle_class = ddlwe->ddlwe_class;
258*e2df9bb4SMartin Matuska 	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
259*e2df9bb4SMartin Matuska }
260*e2df9bb4SMartin Matuska 
261*e2df9bb4SMartin Matuska void
262*e2df9bb4SMartin Matuska ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
263*e2df9bb4SMartin Matuska {
264*e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
265*e2df9bb4SMartin Matuska 
266*e2df9bb4SMartin Matuska 	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
267*e2df9bb4SMartin Matuska 	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
268*e2df9bb4SMartin Matuska 
269*e2df9bb4SMartin Matuska 	/* Get our block */
270*e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
271*e2df9bb4SMartin Matuska 	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
272*e2df9bb4SMartin Matuska 
273*e2df9bb4SMartin Matuska 	/*
274*e2df9bb4SMartin Matuska 	 * If this would take us past the end of the block, finish it and
275*e2df9bb4SMartin Matuska 	 * move to the next one.
276*e2df9bb4SMartin Matuska 	 */
277*e2df9bb4SMartin Matuska 	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
278*e2df9bb4SMartin Matuska 		ASSERT3U(dlu->dlu_offset, >, 0);
279*e2df9bb4SMartin Matuska 		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
280*e2df9bb4SMartin Matuska 		dlu->dlu_block++;
281*e2df9bb4SMartin Matuska 		dlu->dlu_offset = 0;
282*e2df9bb4SMartin Matuska 		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
283*e2df9bb4SMartin Matuska 		db = dlu->dlu_dbp[dlu->dlu_block];
284*e2df9bb4SMartin Matuska 	}
285*e2df9bb4SMartin Matuska 
286*e2df9bb4SMartin Matuska 	/*
287*e2df9bb4SMartin Matuska 	 * If this is the first time touching the block, inform the DMU that
288*e2df9bb4SMartin Matuska 	 * we will fill it, and zero it out.
289*e2df9bb4SMartin Matuska 	 */
290*e2df9bb4SMartin Matuska 	if (dlu->dlu_offset == 0) {
291*e2df9bb4SMartin Matuska 		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
292*e2df9bb4SMartin Matuska 		memset(db->db_data, 0, db->db_size);
293*e2df9bb4SMartin Matuska 	}
294*e2df9bb4SMartin Matuska 
295*e2df9bb4SMartin Matuska 	/* Create the log record directly in the buffer */
296*e2df9bb4SMartin Matuska 	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
297*e2df9bb4SMartin Matuska 	DLR_SET_TYPE(dlr, DLR_ENTRY);
298*e2df9bb4SMartin Matuska 	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
299*e2df9bb4SMartin Matuska 	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
300*e2df9bb4SMartin Matuska 	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
301*e2df9bb4SMartin Matuska 
302*e2df9bb4SMartin Matuska 	ddt_log_record_entry_t *dlre =
303*e2df9bb4SMartin Matuska 	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
304*e2df9bb4SMartin Matuska 	dlre->dlre_key = ddlwe->ddlwe_key;
305*e2df9bb4SMartin Matuska 	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
306*e2df9bb4SMartin Matuska 
307*e2df9bb4SMartin Matuska 	/* Advance offset for next record. */
308*e2df9bb4SMartin Matuska 	dlu->dlu_offset += dlu->dlu_reclen;
309*e2df9bb4SMartin Matuska }
310*e2df9bb4SMartin Matuska 
311*e2df9bb4SMartin Matuska void
312*e2df9bb4SMartin Matuska ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
313*e2df9bb4SMartin Matuska {
314*e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
315*e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
316*e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_offset, >, 0);
317*e2df9bb4SMartin Matuska 
318*e2df9bb4SMartin Matuska 	/*
319*e2df9bb4SMartin Matuska 	 * Close out the last block. Whatever we haven't used will be zeroed,
320*e2df9bb4SMartin Matuska 	 * which matches DLR_INVALID, so we can detect this during load.
321*e2df9bb4SMartin Matuska 	 */
322*e2df9bb4SMartin Matuska 	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
323*e2df9bb4SMartin Matuska 
324*e2df9bb4SMartin Matuska 	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
325*e2df9bb4SMartin Matuska 
326*e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_length +=
327*e2df9bb4SMartin Matuska 	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
328*e2df9bb4SMartin Matuska 	dnode_rele(dlu->dlu_dn, FTAG);
329*e2df9bb4SMartin Matuska 
330*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
331*e2df9bb4SMartin Matuska 
332*e2df9bb4SMartin Matuska 	memset(dlu, 0, sizeof (ddt_log_update_t));
333*e2df9bb4SMartin Matuska 
334*e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
335*e2df9bb4SMartin Matuska }
336*e2df9bb4SMartin Matuska 
337*e2df9bb4SMartin Matuska boolean_t
338*e2df9bb4SMartin Matuska ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
339*e2df9bb4SMartin Matuska {
340*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
341*e2df9bb4SMartin Matuska 	if (ddle == NULL)
342*e2df9bb4SMartin Matuska 		return (B_FALSE);
343*e2df9bb4SMartin Matuska 
344*e2df9bb4SMartin Matuska 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
345*e2df9bb4SMartin Matuska 
346*e2df9bb4SMartin Matuska 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
347*e2df9bb4SMartin Matuska 
348*e2df9bb4SMartin Matuska 	avl_remove(&ddl->ddl_tree, ddle);
349*e2df9bb4SMartin Matuska 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
350*e2df9bb4SMartin Matuska 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
351*e2df9bb4SMartin Matuska 
352*e2df9bb4SMartin Matuska 	return (B_TRUE);
353*e2df9bb4SMartin Matuska }
354*e2df9bb4SMartin Matuska 
355*e2df9bb4SMartin Matuska boolean_t
356*e2df9bb4SMartin Matuska ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
357*e2df9bb4SMartin Matuska {
358*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
359*e2df9bb4SMartin Matuska 	if (ddle == NULL)
360*e2df9bb4SMartin Matuska 		return (B_FALSE);
361*e2df9bb4SMartin Matuska 
362*e2df9bb4SMartin Matuska 	ddt_lightweight_entry_t ddlwe;
363*e2df9bb4SMartin Matuska 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
364*e2df9bb4SMartin Matuska 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
365*e2df9bb4SMartin Matuska 
366*e2df9bb4SMartin Matuska 	avl_remove(&ddl->ddl_tree, ddle);
367*e2df9bb4SMartin Matuska 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
368*e2df9bb4SMartin Matuska 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
369*e2df9bb4SMartin Matuska 
370*e2df9bb4SMartin Matuska 	return (B_TRUE);
371*e2df9bb4SMartin Matuska }
372*e2df9bb4SMartin Matuska 
373*e2df9bb4SMartin Matuska boolean_t
374*e2df9bb4SMartin Matuska ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
375*e2df9bb4SMartin Matuska     ddt_lightweight_entry_t *ddlwe)
376*e2df9bb4SMartin Matuska {
377*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle =
378*e2df9bb4SMartin Matuska 	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
379*e2df9bb4SMartin Matuska 	if (!ddle)
380*e2df9bb4SMartin Matuska 		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
381*e2df9bb4SMartin Matuska 	if (!ddle)
382*e2df9bb4SMartin Matuska 		return (B_FALSE);
383*e2df9bb4SMartin Matuska 	if (ddlwe)
384*e2df9bb4SMartin Matuska 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
385*e2df9bb4SMartin Matuska 	return (B_TRUE);
386*e2df9bb4SMartin Matuska }
387*e2df9bb4SMartin Matuska 
388*e2df9bb4SMartin Matuska void
389*e2df9bb4SMartin Matuska ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
390*e2df9bb4SMartin Matuska {
391*e2df9bb4SMartin Matuska 	ddt_log_t *ddl = ddt->ddt_log_flushing;
392*e2df9bb4SMartin Matuska 
393*e2df9bb4SMartin Matuska 	ASSERT3U(ddl->ddl_object, !=, 0);
394*e2df9bb4SMartin Matuska 
395*e2df9bb4SMartin Matuska #ifdef ZFS_DEBUG
396*e2df9bb4SMartin Matuska 	/*
397*e2df9bb4SMartin Matuska 	 * There should not be any entries on the log tree before the given
398*e2df9bb4SMartin Matuska 	 * checkpoint. Assert that this is the case.
399*e2df9bb4SMartin Matuska 	 */
400*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
401*e2df9bb4SMartin Matuska 	if (ddle != NULL)
402*e2df9bb4SMartin Matuska 		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
403*e2df9bb4SMartin Matuska 		    >, 0);
404*e2df9bb4SMartin Matuska #endif
405*e2df9bb4SMartin Matuska 
406*e2df9bb4SMartin Matuska 	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
407*e2df9bb4SMartin Matuska 	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
408*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
409*e2df9bb4SMartin Matuska 
410*e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
411*e2df9bb4SMartin Matuska }
412*e2df9bb4SMartin Matuska 
413*e2df9bb4SMartin Matuska void
414*e2df9bb4SMartin Matuska ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
415*e2df9bb4SMartin Matuska {
416*e2df9bb4SMartin Matuska 	ddt_log_t *ddl = ddt->ddt_log_flushing;
417*e2df9bb4SMartin Matuska 
418*e2df9bb4SMartin Matuska 	if (ddl->ddl_object == 0)
419*e2df9bb4SMartin Matuska 		return;
420*e2df9bb4SMartin Matuska 
421*e2df9bb4SMartin Matuska 	ASSERT(avl_is_empty(&ddl->ddl_tree));
422*e2df9bb4SMartin Matuska 
423*e2df9bb4SMartin Matuska 	/* Eject the entire object */
424*e2df9bb4SMartin Matuska 	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
425*e2df9bb4SMartin Matuska 
426*e2df9bb4SMartin Matuska 	ddl->ddl_length = 0;
427*e2df9bb4SMartin Matuska 	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
428*e2df9bb4SMartin Matuska 	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
429*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
430*e2df9bb4SMartin Matuska 
431*e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
432*e2df9bb4SMartin Matuska }
433*e2df9bb4SMartin Matuska 
434*e2df9bb4SMartin Matuska boolean_t
435*e2df9bb4SMartin Matuska ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
436*e2df9bb4SMartin Matuska {
437*e2df9bb4SMartin Matuska 	/* Swap the logs. The old flushing one must be empty */
438*e2df9bb4SMartin Matuska 	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
439*e2df9bb4SMartin Matuska 
440*e2df9bb4SMartin Matuska 	/*
441*e2df9bb4SMartin Matuska 	 * If there are still blocks on the flushing log, truncate it first.
442*e2df9bb4SMartin Matuska 	 * This can happen if there were entries on the flushing log that were
443*e2df9bb4SMartin Matuska 	 * removed in memory via ddt_lookup(); their vestigal remains are
444*e2df9bb4SMartin Matuska 	 * on disk.
445*e2df9bb4SMartin Matuska 	 */
446*e2df9bb4SMartin Matuska 	if (ddt->ddt_log_flushing->ddl_length > 0)
447*e2df9bb4SMartin Matuska 		ddt_log_truncate(ddt, tx);
448*e2df9bb4SMartin Matuska 
449*e2df9bb4SMartin Matuska 	/*
450*e2df9bb4SMartin Matuska 	 * Swap policy. We swap the logs (and so begin flushing) when the
451*e2df9bb4SMartin Matuska 	 * active tree grows too large, or when we haven't swapped it in
452*e2df9bb4SMartin Matuska 	 * some amount of time, or if something has requested the logs be
453*e2df9bb4SMartin Matuska 	 * flushed ASAP (see ddt_walk_init()).
454*e2df9bb4SMartin Matuska 	 */
455*e2df9bb4SMartin Matuska 
456*e2df9bb4SMartin Matuska 	/*
457*e2df9bb4SMartin Matuska 	 * The log tree is too large if the memory usage of its entries is over
458*e2df9bb4SMartin Matuska 	 * half of the memory limit. This effectively gives each log tree half
459*e2df9bb4SMartin Matuska 	 * the available memory.
460*e2df9bb4SMartin Matuska 	 */
461*e2df9bb4SMartin Matuska 	const boolean_t too_large =
462*e2df9bb4SMartin Matuska 	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
463*e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
464*e2df9bb4SMartin Matuska 
465*e2df9bb4SMartin Matuska 	const boolean_t too_old =
466*e2df9bb4SMartin Matuska 	    tx->tx_txg >=
467*e2df9bb4SMartin Matuska 	    (ddt->ddt_log_active->ddl_first_txg +
468*e2df9bb4SMartin Matuska 	    MAX(1, zfs_dedup_log_txg_max));
469*e2df9bb4SMartin Matuska 
470*e2df9bb4SMartin Matuska 	const boolean_t force =
471*e2df9bb4SMartin Matuska 	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
472*e2df9bb4SMartin Matuska 
473*e2df9bb4SMartin Matuska 	if (!(too_large || too_old || force))
474*e2df9bb4SMartin Matuska 		return (B_FALSE);
475*e2df9bb4SMartin Matuska 
476*e2df9bb4SMartin Matuska 	ddt_log_t *swap = ddt->ddt_log_active;
477*e2df9bb4SMartin Matuska 	ddt->ddt_log_active = ddt->ddt_log_flushing;
478*e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing = swap;
479*e2df9bb4SMartin Matuska 
480*e2df9bb4SMartin Matuska 	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
481*e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_flags &=
482*e2df9bb4SMartin Matuska 	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
483*e2df9bb4SMartin Matuska 
484*e2df9bb4SMartin Matuska 	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
485*e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
486*e2df9bb4SMartin Matuska 
487*e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
488*e2df9bb4SMartin Matuska 
489*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
490*e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
491*e2df9bb4SMartin Matuska 
492*e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
493*e2df9bb4SMartin Matuska 
494*e2df9bb4SMartin Matuska 	return (B_TRUE);
495*e2df9bb4SMartin Matuska }
496*e2df9bb4SMartin Matuska 
497*e2df9bb4SMartin Matuska static inline void
498*e2df9bb4SMartin Matuska ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
499*e2df9bb4SMartin Matuska     const ddt_key_t *checkpoint)
500*e2df9bb4SMartin Matuska {
501*e2df9bb4SMartin Matuska 	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
502*e2df9bb4SMartin Matuska 
503*e2df9bb4SMartin Matuska 	ddt_log_record_entry_t *dlre =
504*e2df9bb4SMartin Matuska 	    (ddt_log_record_entry_t *)dlr->dlr_payload;
505*e2df9bb4SMartin Matuska 	if (checkpoint != NULL &&
506*e2df9bb4SMartin Matuska 	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
507*e2df9bb4SMartin Matuska 		/* Skip pre-checkpoint entries; they're already flushed. */
508*e2df9bb4SMartin Matuska 		return;
509*e2df9bb4SMartin Matuska 	}
510*e2df9bb4SMartin Matuska 
511*e2df9bb4SMartin Matuska 	ddt_lightweight_entry_t ddlwe;
512*e2df9bb4SMartin Matuska 	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
513*e2df9bb4SMartin Matuska 	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
514*e2df9bb4SMartin Matuska 
515*e2df9bb4SMartin Matuska 	ddlwe.ddlwe_key = dlre->dlre_key;
516*e2df9bb4SMartin Matuska 	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
517*e2df9bb4SMartin Matuska 
518*e2df9bb4SMartin Matuska 	ddt_log_update_entry(ddt, ddl, &ddlwe);
519*e2df9bb4SMartin Matuska }
520*e2df9bb4SMartin Matuska 
521*e2df9bb4SMartin Matuska static void
522*e2df9bb4SMartin Matuska ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
523*e2df9bb4SMartin Matuska {
524*e2df9bb4SMartin Matuska 	void *cookie = NULL;
525*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle;
526*e2df9bb4SMartin Matuska 	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
527*e2df9bb4SMartin Matuska 	while ((ddle =
528*e2df9bb4SMartin Matuska 	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
529*e2df9bb4SMartin Matuska 		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
530*e2df9bb4SMartin Matuska 		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
531*e2df9bb4SMartin Matuska 	}
532*e2df9bb4SMartin Matuska 	ASSERT(avl_is_empty(&ddl->ddl_tree));
533*e2df9bb4SMartin Matuska }
534*e2df9bb4SMartin Matuska 
535*e2df9bb4SMartin Matuska static int
536*e2df9bb4SMartin Matuska ddt_log_load_one(ddt_t *ddt, uint_t n)
537*e2df9bb4SMartin Matuska {
538*e2df9bb4SMartin Matuska 	ASSERT3U(n, <, 2);
539*e2df9bb4SMartin Matuska 
540*e2df9bb4SMartin Matuska 	ddt_log_t *ddl = &ddt->ddt_log[n];
541*e2df9bb4SMartin Matuska 
542*e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
543*e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
544*e2df9bb4SMartin Matuska 
545*e2df9bb4SMartin Matuska 	uint64_t obj;
546*e2df9bb4SMartin Matuska 	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
547*e2df9bb4SMartin Matuska 	    sizeof (uint64_t), 1, &obj);
548*e2df9bb4SMartin Matuska 	if (err == ENOENT)
549*e2df9bb4SMartin Matuska 		return (0);
550*e2df9bb4SMartin Matuska 	if (err != 0)
551*e2df9bb4SMartin Matuska 		return (err);
552*e2df9bb4SMartin Matuska 
553*e2df9bb4SMartin Matuska 	dnode_t *dn;
554*e2df9bb4SMartin Matuska 	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
555*e2df9bb4SMartin Matuska 	if (err != 0)
556*e2df9bb4SMartin Matuska 		return (err);
557*e2df9bb4SMartin Matuska 
558*e2df9bb4SMartin Matuska 	ddt_log_header_t hdr;
559*e2df9bb4SMartin Matuska 	dmu_buf_t *db;
560*e2df9bb4SMartin Matuska 	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
561*e2df9bb4SMartin Matuska 	if (err != 0) {
562*e2df9bb4SMartin Matuska 		dnode_rele(dn, FTAG);
563*e2df9bb4SMartin Matuska 		return (err);
564*e2df9bb4SMartin Matuska 	}
565*e2df9bb4SMartin Matuska 	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
566*e2df9bb4SMartin Matuska 	dmu_buf_rele(db, FTAG);
567*e2df9bb4SMartin Matuska 
568*e2df9bb4SMartin Matuska 	if (DLH_GET_VERSION(&hdr) != 1) {
569*e2df9bb4SMartin Matuska 		dnode_rele(dn, FTAG);
570*e2df9bb4SMartin Matuska 		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
571*e2df9bb4SMartin Matuska 		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
572*e2df9bb4SMartin Matuska 		    (u_longlong_t)DLH_GET_VERSION(&hdr));
573*e2df9bb4SMartin Matuska 		return (SET_ERROR(EINVAL));
574*e2df9bb4SMartin Matuska 	}
575*e2df9bb4SMartin Matuska 
576*e2df9bb4SMartin Matuska 	ddt_key_t *checkpoint = NULL;
577*e2df9bb4SMartin Matuska 	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
578*e2df9bb4SMartin Matuska 		/*
579*e2df9bb4SMartin Matuska 		 * If the log has a checkpoint, then we can ignore any entries
580*e2df9bb4SMartin Matuska 		 * that have already been flushed.
581*e2df9bb4SMartin Matuska 		 */
582*e2df9bb4SMartin Matuska 		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
583*e2df9bb4SMartin Matuska 		checkpoint = &hdr.dlh_checkpoint;
584*e2df9bb4SMartin Matuska 	}
585*e2df9bb4SMartin Matuska 
586*e2df9bb4SMartin Matuska 	if (hdr.dlh_length > 0) {
587*e2df9bb4SMartin Matuska 		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
588*e2df9bb4SMartin Matuska 		    ZIO_PRIORITY_SYNC_READ);
589*e2df9bb4SMartin Matuska 
590*e2df9bb4SMartin Matuska 		for (uint64_t offset = 0; offset < hdr.dlh_length;
591*e2df9bb4SMartin Matuska 		    offset += dn->dn_datablksz) {
592*e2df9bb4SMartin Matuska 			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
593*e2df9bb4SMartin Matuska 			    DMU_READ_PREFETCH);
594*e2df9bb4SMartin Matuska 			if (err != 0) {
595*e2df9bb4SMartin Matuska 				dnode_rele(dn, FTAG);
596*e2df9bb4SMartin Matuska 				ddt_log_empty(ddt, ddl);
597*e2df9bb4SMartin Matuska 				return (err);
598*e2df9bb4SMartin Matuska 			}
599*e2df9bb4SMartin Matuska 
600*e2df9bb4SMartin Matuska 			uint64_t boffset = 0;
601*e2df9bb4SMartin Matuska 			while (boffset < db->db_size) {
602*e2df9bb4SMartin Matuska 				ddt_log_record_t *dlr =
603*e2df9bb4SMartin Matuska 				    (ddt_log_record_t *)(db->db_data + boffset);
604*e2df9bb4SMartin Matuska 
605*e2df9bb4SMartin Matuska 				/* Partially-filled block, skip the rest */
606*e2df9bb4SMartin Matuska 				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
607*e2df9bb4SMartin Matuska 					break;
608*e2df9bb4SMartin Matuska 
609*e2df9bb4SMartin Matuska 				switch (DLR_GET_TYPE(dlr)) {
610*e2df9bb4SMartin Matuska 				case DLR_ENTRY:
611*e2df9bb4SMartin Matuska 					ddt_log_load_entry(ddt, ddl, dlr,
612*e2df9bb4SMartin Matuska 					    checkpoint);
613*e2df9bb4SMartin Matuska 					break;
614*e2df9bb4SMartin Matuska 
615*e2df9bb4SMartin Matuska 				default:
616*e2df9bb4SMartin Matuska 					dmu_buf_rele(db, FTAG);
617*e2df9bb4SMartin Matuska 					dnode_rele(dn, FTAG);
618*e2df9bb4SMartin Matuska 					ddt_log_empty(ddt, ddl);
619*e2df9bb4SMartin Matuska 					return (SET_ERROR(EINVAL));
620*e2df9bb4SMartin Matuska 				}
621*e2df9bb4SMartin Matuska 
622*e2df9bb4SMartin Matuska 				boffset += DLR_GET_RECLEN(dlr);
623*e2df9bb4SMartin Matuska 			}
624*e2df9bb4SMartin Matuska 
625*e2df9bb4SMartin Matuska 			dmu_buf_rele(db, FTAG);
626*e2df9bb4SMartin Matuska 		}
627*e2df9bb4SMartin Matuska 	}
628*e2df9bb4SMartin Matuska 
629*e2df9bb4SMartin Matuska 	dnode_rele(dn, FTAG);
630*e2df9bb4SMartin Matuska 
631*e2df9bb4SMartin Matuska 	ddl->ddl_object = obj;
632*e2df9bb4SMartin Matuska 	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
633*e2df9bb4SMartin Matuska 	ddl->ddl_length = hdr.dlh_length;
634*e2df9bb4SMartin Matuska 	ddl->ddl_first_txg = hdr.dlh_first_txg;
635*e2df9bb4SMartin Matuska 
636*e2df9bb4SMartin Matuska 	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
637*e2df9bb4SMartin Matuska 		ddt->ddt_log_flushing = ddl;
638*e2df9bb4SMartin Matuska 	else
639*e2df9bb4SMartin Matuska 		ddt->ddt_log_active = ddl;
640*e2df9bb4SMartin Matuska 
641*e2df9bb4SMartin Matuska 	return (0);
642*e2df9bb4SMartin Matuska }
643*e2df9bb4SMartin Matuska 
644*e2df9bb4SMartin Matuska int
645*e2df9bb4SMartin Matuska ddt_log_load(ddt_t *ddt)
646*e2df9bb4SMartin Matuska {
647*e2df9bb4SMartin Matuska 	int err;
648*e2df9bb4SMartin Matuska 
649*e2df9bb4SMartin Matuska 	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
650*e2df9bb4SMartin Matuska 		/*
651*e2df9bb4SMartin Matuska 		 * The DDT is going to be freed again in a moment, so there's
652*e2df9bb4SMartin Matuska 		 * no point loading the log; it'll just slow down import.
653*e2df9bb4SMartin Matuska 		 */
654*e2df9bb4SMartin Matuska 		return (0);
655*e2df9bb4SMartin Matuska 	}
656*e2df9bb4SMartin Matuska 
657*e2df9bb4SMartin Matuska 	ASSERT0(ddt->ddt_log[0].ddl_object);
658*e2df9bb4SMartin Matuska 	ASSERT0(ddt->ddt_log[1].ddl_object);
659*e2df9bb4SMartin Matuska 	if (ddt->ddt_dir_object == 0) {
660*e2df9bb4SMartin Matuska 		/*
661*e2df9bb4SMartin Matuska 		 * If we're configured but the containing dir doesn't exist
662*e2df9bb4SMartin Matuska 		 * yet, then the log object can't possibly exist either.
663*e2df9bb4SMartin Matuska 		 */
664*e2df9bb4SMartin Matuska 		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
665*e2df9bb4SMartin Matuska 		return (SET_ERROR(ENOENT));
666*e2df9bb4SMartin Matuska 	}
667*e2df9bb4SMartin Matuska 
668*e2df9bb4SMartin Matuska 	if ((err = ddt_log_load_one(ddt, 0)) != 0)
669*e2df9bb4SMartin Matuska 		return (err);
670*e2df9bb4SMartin Matuska 	if ((err = ddt_log_load_one(ddt, 1)) != 0)
671*e2df9bb4SMartin Matuska 		return (err);
672*e2df9bb4SMartin Matuska 
673*e2df9bb4SMartin Matuska 	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
674*e2df9bb4SMartin Matuska 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
675*e2df9bb4SMartin Matuska 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
676*e2df9bb4SMartin Matuska 	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
677*e2df9bb4SMartin Matuska 
678*e2df9bb4SMartin Matuska 	/*
679*e2df9bb4SMartin Matuska 	 * We have two finalisation tasks:
680*e2df9bb4SMartin Matuska 	 *
681*e2df9bb4SMartin Matuska 	 * - rebuild the histogram. We do this at the end rather than while
682*e2df9bb4SMartin Matuska 	 *   we're loading so we don't need to uncount and recount entries that
683*e2df9bb4SMartin Matuska 	 *   appear multiple times in the log.
684*e2df9bb4SMartin Matuska 	 *
685*e2df9bb4SMartin Matuska 	 * - remove entries from the flushing tree that are on both trees. This
686*e2df9bb4SMartin Matuska 	 *   happens when ddt_lookup() rehydrates an entry from the flushing
687*e2df9bb4SMartin Matuska 	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
688*e2df9bb4SMartin Matuska 	 *   tree but doesn't remove it from disk.
689*e2df9bb4SMartin Matuska 	 */
690*e2df9bb4SMartin Matuska 
691*e2df9bb4SMartin Matuska 	/*
692*e2df9bb4SMartin Matuska 	 * We don't technically need a config lock here, since there shouldn't
693*e2df9bb4SMartin Matuska 	 * be pool config changes during DDT load. dva_get_dsize_sync() via
694*e2df9bb4SMartin Matuska 	 * ddt_stat_generate() is expecting it though, and it won't hurt
695*e2df9bb4SMartin Matuska 	 * anything, so we take it.
696*e2df9bb4SMartin Matuska 	 */
697*e2df9bb4SMartin Matuska 	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
698*e2df9bb4SMartin Matuska 
699*e2df9bb4SMartin Matuska 	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
700*e2df9bb4SMartin Matuska 	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
701*e2df9bb4SMartin Matuska 	ddt_log_entry_t *ae = avl_first(al);
702*e2df9bb4SMartin Matuska 	ddt_log_entry_t *fe = avl_first(fl);
703*e2df9bb4SMartin Matuska 	while (ae != NULL || fe != NULL) {
704*e2df9bb4SMartin Matuska 		ddt_log_entry_t *ddle;
705*e2df9bb4SMartin Matuska 		if (ae == NULL) {
706*e2df9bb4SMartin Matuska 			/* active exhausted, take flushing */
707*e2df9bb4SMartin Matuska 			ddle = fe;
708*e2df9bb4SMartin Matuska 			fe = AVL_NEXT(fl, fe);
709*e2df9bb4SMartin Matuska 		} else if (fe == NULL) {
710*e2df9bb4SMartin Matuska 			/* flushing exuhausted, take active */
711*e2df9bb4SMartin Matuska 			ddle = ae;
712*e2df9bb4SMartin Matuska 			ae = AVL_NEXT(al, ae);
713*e2df9bb4SMartin Matuska 		} else {
714*e2df9bb4SMartin Matuska 			/* compare active and flushing */
715*e2df9bb4SMartin Matuska 			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
716*e2df9bb4SMartin Matuska 			if (c < 0) {
717*e2df9bb4SMartin Matuska 				/* active behind, take and advance */
718*e2df9bb4SMartin Matuska 				ddle = ae;
719*e2df9bb4SMartin Matuska 				ae = AVL_NEXT(al, ae);
720*e2df9bb4SMartin Matuska 			} else if (c > 0) {
721*e2df9bb4SMartin Matuska 				/* flushing behind, take and advance */
722*e2df9bb4SMartin Matuska 				ddle = fe;
723*e2df9bb4SMartin Matuska 				fe = AVL_NEXT(fl, fe);
724*e2df9bb4SMartin Matuska 			} else {
725*e2df9bb4SMartin Matuska 				/* match. remove from flushing, take active */
726*e2df9bb4SMartin Matuska 				ddle = fe;
727*e2df9bb4SMartin Matuska 				fe = AVL_NEXT(fl, fe);
728*e2df9bb4SMartin Matuska 				avl_remove(fl, ddle);
729*e2df9bb4SMartin Matuska 
730*e2df9bb4SMartin Matuska 				ddle = ae;
731*e2df9bb4SMartin Matuska 				ae = AVL_NEXT(al, ae);
732*e2df9bb4SMartin Matuska 			}
733*e2df9bb4SMartin Matuska 		}
734*e2df9bb4SMartin Matuska 
735*e2df9bb4SMartin Matuska 		ddt_lightweight_entry_t ddlwe;
736*e2df9bb4SMartin Matuska 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
737*e2df9bb4SMartin Matuska 		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
738*e2df9bb4SMartin Matuska 	}
739*e2df9bb4SMartin Matuska 
740*e2df9bb4SMartin Matuska 	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
741*e2df9bb4SMartin Matuska 
742*e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
743*e2df9bb4SMartin Matuska 
744*e2df9bb4SMartin Matuska 	return (0);
745*e2df9bb4SMartin Matuska }
746*e2df9bb4SMartin Matuska 
747*e2df9bb4SMartin Matuska void
748*e2df9bb4SMartin Matuska ddt_log_alloc(ddt_t *ddt)
749*e2df9bb4SMartin Matuska {
750*e2df9bb4SMartin Matuska 	ASSERT3P(ddt->ddt_log_active, ==, NULL);
751*e2df9bb4SMartin Matuska 	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
752*e2df9bb4SMartin Matuska 
753*e2df9bb4SMartin Matuska 	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
754*e2df9bb4SMartin Matuska 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
755*e2df9bb4SMartin Matuska 	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
756*e2df9bb4SMartin Matuska 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
757*e2df9bb4SMartin Matuska 	ddt->ddt_log_active = &ddt->ddt_log[0];
758*e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing = &ddt->ddt_log[1];
759*e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
760*e2df9bb4SMartin Matuska }
761*e2df9bb4SMartin Matuska 
762*e2df9bb4SMartin Matuska void
763*e2df9bb4SMartin Matuska ddt_log_free(ddt_t *ddt)
764*e2df9bb4SMartin Matuska {
765*e2df9bb4SMartin Matuska 	ddt_log_empty(ddt, &ddt->ddt_log[0]);
766*e2df9bb4SMartin Matuska 	ddt_log_empty(ddt, &ddt->ddt_log[1]);
767*e2df9bb4SMartin Matuska 	avl_destroy(&ddt->ddt_log[0].ddl_tree);
768*e2df9bb4SMartin Matuska 	avl_destroy(&ddt->ddt_log[1].ddl_tree);
769*e2df9bb4SMartin Matuska }
770*e2df9bb4SMartin Matuska 
/*
 * Dedup log tunables, registered in the zfs_dedup group with the
 * zfs_dedup_ variable prefix.
 *
 * NOTE(review): log_txg_max is registered ZMOD_RW while the two memory
 * limits are ZMOD_RD; presumably the latter are not meant to be changed at
 * runtime. Confirm against the ZFS_MODULE_PARAM definition.
 */

/* Transaction count threshold for starting a log flush (UINT). */
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

/* Absolute memory limit for the dedup logs (U64). */
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

/* Memory limit for the dedup logs as a percentage of total memory (UINT). */
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");
779