xref: /onnv-gate/usr/src/uts/common/fs/zfs/zap.c (revision 789:b348f31ed315)
1*789Sahrens /*
2*789Sahrens  * CDDL HEADER START
3*789Sahrens  *
4*789Sahrens  * The contents of this file are subject to the terms of the
5*789Sahrens  * Common Development and Distribution License, Version 1.0 only
6*789Sahrens  * (the "License").  You may not use this file except in compliance
7*789Sahrens  * with the License.
8*789Sahrens  *
9*789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*789Sahrens  * or http://www.opensolaris.org/os/licensing.
11*789Sahrens  * See the License for the specific language governing permissions
12*789Sahrens  * and limitations under the License.
13*789Sahrens  *
14*789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*789Sahrens  *
20*789Sahrens  * CDDL HEADER END
21*789Sahrens  */
22*789Sahrens /*
23*789Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*789Sahrens  * Use is subject to license terms.
25*789Sahrens  */
26*789Sahrens 
27*789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*789Sahrens 
29*789Sahrens 
/*
 * This file contains the top half of the zfs directory structure
 * implementation. The bottom half is in zap_leaf.c.
 *
 * The fat zap is an extendable hash data structure. There is a table of
 * pointers to buckets, which starts out embedded in the header block
 * (zap_f.zap_phys->zap_leafs) and is later moved into blocks of its own
 * (described by zap_f.zap_phys->zap_ptrtbl). The buckets are
 * each a constant size and hold a variable number of directory entries.
 * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
 *
 * The pointer table holds a power of 2 number of pointers
 * (1<<zap_f.zap_phys->zap_ptrtbl.zt_shift).  The bucket pointed to
 * by the pointer at index i in the table holds entries whose hash value
 * has a zt_shift-bit prefix equal to i.
 */
44*789Sahrens 
45*789Sahrens #include <sys/spa.h>
46*789Sahrens #include <sys/dmu.h>
47*789Sahrens #include <sys/zfs_context.h>
48*789Sahrens #include <sys/zap.h>
49*789Sahrens #include <sys/zap_impl.h>
50*789Sahrens #include <sys/zap_leaf.h>
51*789Sahrens 
52*789Sahrens #define	MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10)
53*789Sahrens 
54*789Sahrens static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
55*789Sahrens static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
56*789Sahrens static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
57*789Sahrens     dmu_tx_t *tx, krw_t lt);
58*789Sahrens static void zap_put_leaf(zap_leaf_t *l);
59*789Sahrens static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
60*789Sahrens 
61*789Sahrens 
62*789Sahrens void
63*789Sahrens fzap_byteswap(void *vbuf, size_t size)
64*789Sahrens {
65*789Sahrens 	uint64_t block_type;
66*789Sahrens 
67*789Sahrens 	ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
68*789Sahrens 	block_type = *(uint64_t *)vbuf;
69*789Sahrens 
70*789Sahrens 	switch (block_type) {
71*789Sahrens 	case ZBT_LEAF:
72*789Sahrens 	case BSWAP_64(ZBT_LEAF):
73*789Sahrens 		zap_leaf_byteswap(vbuf);
74*789Sahrens 		return;
75*789Sahrens 	case ZBT_HEADER:
76*789Sahrens 	case BSWAP_64(ZBT_HEADER):
77*789Sahrens 	default:
78*789Sahrens 		/* it's a ptrtbl block */
79*789Sahrens 		byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT);
80*789Sahrens 		return;
81*789Sahrens 	}
82*789Sahrens }
83*789Sahrens 
84*789Sahrens void
85*789Sahrens fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
86*789Sahrens {
87*789Sahrens 	dmu_buf_t *db;
88*789Sahrens 	zap_leaf_t *l;
89*789Sahrens 	int i;
90*789Sahrens 	zap_phys_t *zp;
91*789Sahrens 
92*789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
93*789Sahrens 	zap->zap_ismicro = FALSE;
94*789Sahrens 
95*789Sahrens 	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
96*789Sahrens 	    &zap->zap_f.zap_phys, zap_pageout);
97*789Sahrens 
98*789Sahrens 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
99*789Sahrens 
100*789Sahrens 	zp = zap->zap_f.zap_phys;
101*789Sahrens 	/*
102*789Sahrens 	 * explicitly zero it since it might be coming from an
103*789Sahrens 	 * initialized microzap
104*789Sahrens 	 */
105*789Sahrens 	ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
106*789Sahrens 	bzero(zp, sizeof (zap_phys_t));
107*789Sahrens 	zp->zap_block_type = ZBT_HEADER;
108*789Sahrens 	zp->zap_magic = ZAP_MAGIC;
109*789Sahrens 
110*789Sahrens 	zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT;
111*789Sahrens 
112*789Sahrens 	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
113*789Sahrens 	zp->zap_num_leafs = 1;
114*789Sahrens 	zp->zap_num_entries = 0;
115*789Sahrens 	zp->zap_salt = zap->zap_salt;
116*789Sahrens 
117*789Sahrens 	for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++)
118*789Sahrens 		zp->zap_leafs[i] = 1;	/* block 1 will be the first leaf */
119*789Sahrens 
120*789Sahrens 	/*
121*789Sahrens 	 * set up block 1 - the first leaf
122*789Sahrens 	 */
123*789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
124*789Sahrens 	    1<<ZAP_BLOCK_SHIFT);
125*789Sahrens 	dmu_buf_will_dirty(db, tx);
126*789Sahrens 
127*789Sahrens 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
128*789Sahrens 	l->l_dbuf = db;
129*789Sahrens 	l->l_phys = db->db_data;
130*789Sahrens 
131*789Sahrens 	zap_leaf_init(l);
132*789Sahrens 
133*789Sahrens 	kmem_free(l, sizeof (zap_leaf_t));
134*789Sahrens 	dmu_buf_rele(db);
135*789Sahrens }
136*789Sahrens 
137*789Sahrens static int
138*789Sahrens zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
139*789Sahrens {
140*789Sahrens 	if (RW_WRITE_HELD(&zap->zap_rwlock))
141*789Sahrens 		return (1);
142*789Sahrens 	if (rw_tryupgrade(&zap->zap_rwlock)) {
143*789Sahrens 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
144*789Sahrens 		return (1);
145*789Sahrens 	}
146*789Sahrens 	return (0);
147*789Sahrens }
148*789Sahrens 
149*789Sahrens /*
150*789Sahrens  * Generic routines for dealing with the pointer & cookie tables.
151*789Sahrens  */
152*789Sahrens 
153*789Sahrens static void
154*789Sahrens zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
155*789Sahrens     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
156*789Sahrens     dmu_tx_t *tx)
157*789Sahrens {
158*789Sahrens 	uint64_t b, newblk;
159*789Sahrens 	dmu_buf_t *db_old, *db_new;
160*789Sahrens 	int hepb = 1<<(ZAP_BLOCK_SHIFT-4);
161*789Sahrens 	/* hepb = half the number of entries in a block */
162*789Sahrens 
163*789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
164*789Sahrens 	ASSERT(tbl->zt_blk != 0);
165*789Sahrens 	ASSERT(tbl->zt_numblks > 0);
166*789Sahrens 
167*789Sahrens 	if (tbl->zt_nextblk != 0) {
168*789Sahrens 		newblk = tbl->zt_nextblk;
169*789Sahrens 	} else {
170*789Sahrens 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
171*789Sahrens 		tbl->zt_nextblk = newblk;
172*789Sahrens 		ASSERT3U(tbl->zt_blks_copied, ==, 0);
173*789Sahrens 		dmu_prefetch(zap->zap_objset, zap->zap_object,
174*789Sahrens 		    tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks <<
175*789Sahrens 		    ZAP_BLOCK_SHIFT);
176*789Sahrens 	}
177*789Sahrens 
178*789Sahrens 	/*
179*789Sahrens 	 * Copy the ptrtbl from the old to new location, leaving the odd
180*789Sahrens 	 * entries blank as we go.
181*789Sahrens 	 */
182*789Sahrens 
183*789Sahrens 	b = tbl->zt_blks_copied;
184*789Sahrens 	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
185*789Sahrens 	    (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT);
186*789Sahrens 	dmu_buf_read(db_old);
187*789Sahrens 
188*789Sahrens 	/* first half of entries in old[b] go to new[2*b+0] */
189*789Sahrens 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
190*789Sahrens 	    (newblk + 2*b+0) << ZAP_BLOCK_SHIFT);
191*789Sahrens 	dmu_buf_will_dirty(db_new, tx);
192*789Sahrens 	transfer_func(db_old->db_data, db_new->db_data, hepb);
193*789Sahrens 	dmu_buf_rele(db_new);
194*789Sahrens 
195*789Sahrens 	/* second half of entries in old[b] go to new[2*b+1] */
196*789Sahrens 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
197*789Sahrens 	    (newblk + 2*b+1) << ZAP_BLOCK_SHIFT);
198*789Sahrens 	dmu_buf_will_dirty(db_new, tx);
199*789Sahrens 	transfer_func((uint64_t *)db_old->db_data + hepb,
200*789Sahrens 	    db_new->db_data, hepb);
201*789Sahrens 	dmu_buf_rele(db_new);
202*789Sahrens 
203*789Sahrens 	dmu_buf_rele(db_old);
204*789Sahrens 
205*789Sahrens 	tbl->zt_blks_copied++;
206*789Sahrens 
207*789Sahrens 	dprintf("copied block %llu of %llu\n",
208*789Sahrens 	    tbl->zt_blks_copied, tbl->zt_numblks);
209*789Sahrens 
210*789Sahrens 	if (tbl->zt_blks_copied == tbl->zt_numblks) {
211*789Sahrens 		dmu_free_range(zap->zap_objset, zap->zap_object,
212*789Sahrens 		    tbl->zt_blk << ZAP_BLOCK_SHIFT,
213*789Sahrens 		    tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx);
214*789Sahrens 
215*789Sahrens 		tbl->zt_blk = newblk;
216*789Sahrens 		tbl->zt_numblks *= 2;
217*789Sahrens 		tbl->zt_shift++;
218*789Sahrens 		tbl->zt_nextblk = 0;
219*789Sahrens 		tbl->zt_blks_copied = 0;
220*789Sahrens 
221*789Sahrens 		dprintf("finished; numblocks now %llu (%lluk entries)\n",
222*789Sahrens 		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
223*789Sahrens 	}
224*789Sahrens }
225*789Sahrens 
226*789Sahrens static uint64_t
227*789Sahrens zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
228*789Sahrens     dmu_tx_t *tx)
229*789Sahrens {
230*789Sahrens 	uint64_t blk, off, oldval;
231*789Sahrens 	dmu_buf_t *db;
232*789Sahrens 
233*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
234*789Sahrens 	ASSERT(tbl->zt_blk != 0);
235*789Sahrens 
236*789Sahrens 	dprintf("storing %llx at index %llx\n", val, idx);
237*789Sahrens 
238*789Sahrens 	blk = idx >> (ZAP_BLOCK_SHIFT-3);
239*789Sahrens 	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
240*789Sahrens 
241*789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
242*789Sahrens 	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
243*789Sahrens 	dmu_buf_will_dirty(db, tx);
244*789Sahrens 	oldval = ((uint64_t *)db->db_data)[off];
245*789Sahrens 	((uint64_t *)db->db_data)[off] = val;
246*789Sahrens 	dmu_buf_rele(db);
247*789Sahrens 
248*789Sahrens 	if (tbl->zt_nextblk != 0) {
249*789Sahrens 		idx *= 2;
250*789Sahrens 		blk = idx >> (ZAP_BLOCK_SHIFT-3);
251*789Sahrens 		off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
252*789Sahrens 
253*789Sahrens 		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
254*789Sahrens 		    (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT);
255*789Sahrens 		dmu_buf_will_dirty(db, tx);
256*789Sahrens 		((uint64_t *)db->db_data)[off] = val;
257*789Sahrens 		((uint64_t *)db->db_data)[off+1] = val;
258*789Sahrens 		dmu_buf_rele(db);
259*789Sahrens 	}
260*789Sahrens 
261*789Sahrens 	return (oldval);
262*789Sahrens }
263*789Sahrens 
264*789Sahrens static uint64_t
265*789Sahrens zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
266*789Sahrens {
267*789Sahrens 	uint64_t blk, off, val;
268*789Sahrens 	dmu_buf_t *db;
269*789Sahrens 
270*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
271*789Sahrens 
272*789Sahrens 	blk = idx >> (ZAP_BLOCK_SHIFT-3);
273*789Sahrens 	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
274*789Sahrens 
275*789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
276*789Sahrens 	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
277*789Sahrens 	dmu_buf_read(db);
278*789Sahrens 	val = ((uint64_t *)db->db_data)[off];
279*789Sahrens 	dmu_buf_rele(db);
280*789Sahrens 	return (val);
281*789Sahrens }
282*789Sahrens 
283*789Sahrens /*
284*789Sahrens  * Routines for growing the ptrtbl.
285*789Sahrens  */
286*789Sahrens 
/*
 * Expand n pointer-table entries from src into 2*n entries at dst:
 * every source entry is written twice, to dst[2*i] and dst[2*i+1].
 * Nothing is written when n <= 0.
 */
static void
zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
{
	uint64_t *out = dst;
	int k;

	for (k = 0; k < n; k++) {
		const uint64_t leafblk = src[k];

		*out++ = leafblk;
		*out++ = leafblk;
	}
}
297*789Sahrens 
298*789Sahrens static void
299*789Sahrens zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
300*789Sahrens {
301*789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
302*789Sahrens 		return;
303*789Sahrens 
304*789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
305*789Sahrens 		/*
306*789Sahrens 		 * The ptrtbl can no longer be contained in the
307*789Sahrens 		 * header block.  Give it its own entire block, which
308*789Sahrens 		 * will quadruple the size of the ptrtbl.
309*789Sahrens 		 */
310*789Sahrens 		uint64_t newblk;
311*789Sahrens 		dmu_buf_t *db_new;
312*789Sahrens 
313*789Sahrens 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
314*789Sahrens 		    ZAP_PTRTBL_MIN_SHIFT);
315*789Sahrens 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
316*789Sahrens 
317*789Sahrens 		newblk = zap_allocate_blocks(zap, 1, tx);
318*789Sahrens 		db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
319*789Sahrens 		    newblk << ZAP_BLOCK_SHIFT);
320*789Sahrens 
321*789Sahrens 		dmu_buf_will_dirty(db_new, tx);
322*789Sahrens 		zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs,
323*789Sahrens 		    db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT);
324*789Sahrens 		dmu_buf_rele(db_new);
325*789Sahrens 
326*789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
327*789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
328*789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
329*789Sahrens 
330*789Sahrens 		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
331*789Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
332*789Sahrens 		    (ZAP_BLOCK_SHIFT-3));
333*789Sahrens 	} else {
334*789Sahrens 		zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
335*789Sahrens 		    zap_ptrtbl_transfer, tx);
336*789Sahrens 	}
337*789Sahrens }
338*789Sahrens 
339*789Sahrens static void
340*789Sahrens zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
341*789Sahrens {
342*789Sahrens 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
343*789Sahrens 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
344*789Sahrens 
345*789Sahrens 	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
346*789Sahrens 
347*789Sahrens 	zap->zap_f.zap_phys->zap_num_entries += delta;
348*789Sahrens 
349*789Sahrens 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
350*789Sahrens }
351*789Sahrens 
352*789Sahrens uint64_t
353*789Sahrens zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
354*789Sahrens {
355*789Sahrens 	uint64_t newblk;
356*789Sahrens 	ASSERT(tx != NULL);
357*789Sahrens 	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
358*789Sahrens 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
359*789Sahrens 	}
360*789Sahrens 	newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
361*789Sahrens 	    nblocks;
362*789Sahrens 	return (newblk);
363*789Sahrens }
364*789Sahrens 
365*789Sahrens 
366*789Sahrens /*
367*789Sahrens  * This function doesn't increment zap_num_leafs because it's used to
368*789Sahrens  * allocate a leaf chain, which doesn't count against zap_num_leafs.
369*789Sahrens  * The directory must be held exclusively for this tx.
370*789Sahrens  */
371*789Sahrens zap_leaf_t *
372*789Sahrens zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
373*789Sahrens {
374*789Sahrens 	void *winner;
375*789Sahrens 	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
376*789Sahrens 
377*789Sahrens 	ASSERT(tx != NULL);
378*789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
379*789Sahrens 	/* hence we already dirtied zap->zap_dbuf */
380*789Sahrens 
381*789Sahrens 	rw_init(&l->l_rwlock, 0, 0, 0);
382*789Sahrens 	rw_enter(&l->l_rwlock, RW_WRITER);
383*789Sahrens 	l->l_blkid = zap_allocate_blocks(zap, 1, tx);
384*789Sahrens 	l->l_next = NULL;
385*789Sahrens 	l->l_dbuf = NULL;
386*789Sahrens 	l->l_phys = NULL;
387*789Sahrens 
388*789Sahrens 	l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
389*789Sahrens 	    l->l_blkid << ZAP_BLOCK_SHIFT);
390*789Sahrens 	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
391*789Sahrens 	ASSERT(winner == NULL);
392*789Sahrens 	dmu_buf_will_dirty(l->l_dbuf, tx);
393*789Sahrens 
394*789Sahrens 	zap_leaf_init(l);
395*789Sahrens 
396*789Sahrens 	return (l);
397*789Sahrens }
398*789Sahrens 
399*789Sahrens /* ARGSUSED */
400*789Sahrens void
401*789Sahrens zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
402*789Sahrens {
403*789Sahrens 	/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
404*789Sahrens 	rw_exit(&l->l_rwlock);
405*789Sahrens 	dmu_buf_rele(l->l_dbuf);
406*789Sahrens 	/* XXX there are still holds on this block, so we can't free it? */
407*789Sahrens 	/* dmu_free_range(zap->zap_objset, zap->zap_object, */
408*789Sahrens 	    /* offset,  1<<ZAP_BLOCK_SHIFT, tx); */
409*789Sahrens }
410*789Sahrens 
411*789Sahrens int
412*789Sahrens fzap_count(zap_t *zap, uint64_t *count)
413*789Sahrens {
414*789Sahrens 	ASSERT(!zap->zap_ismicro);
415*789Sahrens 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
416*789Sahrens 	*count = zap->zap_f.zap_phys->zap_num_entries;
417*789Sahrens 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
418*789Sahrens 	return (0);
419*789Sahrens }
420*789Sahrens 
421*789Sahrens /*
422*789Sahrens  * Routines for obtaining zap_leaf_t's
423*789Sahrens  */
424*789Sahrens 
425*789Sahrens static void
426*789Sahrens zap_put_leaf(zap_leaf_t *l)
427*789Sahrens {
428*789Sahrens 	zap_leaf_t *nl = l->l_next;
429*789Sahrens 	while (nl) {
430*789Sahrens 		zap_leaf_t *nnl = nl->l_next;
431*789Sahrens 		rw_exit(&nl->l_rwlock);
432*789Sahrens 		dmu_buf_rele(nl->l_dbuf);
433*789Sahrens 		nl = nnl;
434*789Sahrens 	}
435*789Sahrens 	rw_exit(&l->l_rwlock);
436*789Sahrens 	dmu_buf_rele(l->l_dbuf);
437*789Sahrens }
438*789Sahrens 
439*789Sahrens _NOTE(ARGSUSED(0))
440*789Sahrens static void
441*789Sahrens zap_leaf_pageout(dmu_buf_t *db, void *vl)
442*789Sahrens {
443*789Sahrens 	zap_leaf_t *l = vl;
444*789Sahrens 
445*789Sahrens 	rw_destroy(&l->l_rwlock);
446*789Sahrens 	kmem_free(l, sizeof (zap_leaf_t));
447*789Sahrens }
448*789Sahrens 
449*789Sahrens static zap_leaf_t *
450*789Sahrens zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
451*789Sahrens {
452*789Sahrens 	zap_leaf_t *l, *winner;
453*789Sahrens 
454*789Sahrens 	ASSERT(blkid != 0);
455*789Sahrens 
456*789Sahrens 	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
457*789Sahrens 	rw_init(&l->l_rwlock, 0, 0, 0);
458*789Sahrens 	rw_enter(&l->l_rwlock, RW_WRITER);
459*789Sahrens 	l->l_blkid = blkid;
460*789Sahrens 	l->l_next = NULL;
461*789Sahrens 	l->l_dbuf = db;
462*789Sahrens 	l->l_phys = NULL;
463*789Sahrens 
464*789Sahrens 	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
465*789Sahrens 
466*789Sahrens 	rw_exit(&l->l_rwlock);
467*789Sahrens 	if (winner != NULL) {
468*789Sahrens 		/* someone else set it first */
469*789Sahrens 		zap_leaf_pageout(NULL, l);
470*789Sahrens 		l = winner;
471*789Sahrens 	}
472*789Sahrens 
473*789Sahrens 	return (l);
474*789Sahrens }
475*789Sahrens 
476*789Sahrens static zap_leaf_t *
477*789Sahrens zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
478*789Sahrens {
479*789Sahrens 	dmu_buf_t *db;
480*789Sahrens 	zap_leaf_t *l;
481*789Sahrens 
482*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
483*789Sahrens 
484*789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
485*789Sahrens 	    blkid << ZAP_BLOCK_SHIFT);
486*789Sahrens 
487*789Sahrens 	ASSERT3U(db->db_object, ==, zap->zap_object);
488*789Sahrens 	ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
489*789Sahrens 	ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT);
490*789Sahrens 	ASSERT(blkid != 0);
491*789Sahrens 
492*789Sahrens 	dmu_buf_read(db);
493*789Sahrens 	l = dmu_buf_get_user(db);
494*789Sahrens 
495*789Sahrens 	if (l == NULL)
496*789Sahrens 		l = zap_open_leaf(blkid, db);
497*789Sahrens 
498*789Sahrens 	rw_enter(&l->l_rwlock, lt);
499*789Sahrens 	/*
500*789Sahrens 	 * Must lock before dirtying, otherwise l->l_phys could change,
501*789Sahrens 	 * causing ASSERT below to fail.
502*789Sahrens 	 */
503*789Sahrens 	if (lt == RW_WRITER)
504*789Sahrens 		dmu_buf_will_dirty(db, tx);
505*789Sahrens 	ASSERT3U(l->l_blkid, ==, blkid);
506*789Sahrens 	ASSERT3P(l->l_dbuf, ==, db);
507*789Sahrens 	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
508*789Sahrens 	ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
509*789Sahrens 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
510*789Sahrens 
511*789Sahrens 	return (l);
512*789Sahrens }
513*789Sahrens 
514*789Sahrens static zap_leaf_t *
515*789Sahrens zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
516*789Sahrens {
517*789Sahrens 	zap_leaf_t *l, *nl;
518*789Sahrens 
519*789Sahrens 	l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
520*789Sahrens 
521*789Sahrens 	nl = l;
522*789Sahrens 	while (nl->lh_next != 0) {
523*789Sahrens 		zap_leaf_t *nnl;
524*789Sahrens 		nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
525*789Sahrens 		nl->l_next = nnl;
526*789Sahrens 		nl = nnl;
527*789Sahrens 	}
528*789Sahrens 
529*789Sahrens 	return (l);
530*789Sahrens }
531*789Sahrens 
532*789Sahrens static uint64_t
533*789Sahrens zap_idx_to_blk(zap_t *zap, uint64_t idx)
534*789Sahrens {
535*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
536*789Sahrens 
537*789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
538*789Sahrens 		ASSERT3U(idx, <,
539*789Sahrens 		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
540*789Sahrens 		return (zap->zap_f.zap_phys->zap_leafs[idx]);
541*789Sahrens 	} else {
542*789Sahrens 		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
543*789Sahrens 		    idx));
544*789Sahrens 	}
545*789Sahrens }
546*789Sahrens 
547*789Sahrens static void
548*789Sahrens zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
549*789Sahrens {
550*789Sahrens 	ASSERT(tx != NULL);
551*789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
552*789Sahrens 
553*789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
554*789Sahrens 		zap->zap_f.zap_phys->zap_leafs[idx] = blk;
555*789Sahrens 	} else {
556*789Sahrens 		(void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
557*789Sahrens 		    idx, blk, tx);
558*789Sahrens 	}
559*789Sahrens }
560*789Sahrens 
561*789Sahrens static zap_leaf_t *
562*789Sahrens zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
563*789Sahrens {
564*789Sahrens 	uint64_t idx;
565*789Sahrens 	zap_leaf_t *l;
566*789Sahrens 
567*789Sahrens 	ASSERT(zap->zap_dbuf == NULL ||
568*789Sahrens 	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
569*789Sahrens 	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
570*789Sahrens 	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
571*789Sahrens 	l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
572*789Sahrens 
573*789Sahrens 	ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
574*789Sahrens 
575*789Sahrens 	return (l);
576*789Sahrens }
577*789Sahrens 
578*789Sahrens 
579*789Sahrens static zap_leaf_t *
580*789Sahrens zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
581*789Sahrens {
582*789Sahrens 	zap_leaf_t *nl;
583*789Sahrens 	int prefix_diff, i, err;
584*789Sahrens 	uint64_t sibling;
585*789Sahrens 
586*789Sahrens 	ASSERT3U(l->lh_prefix_len, <=,
587*789Sahrens 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
588*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
589*789Sahrens 
590*789Sahrens 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
591*789Sahrens 
592*789Sahrens 	if (zap_tryupgradedir(zap, tx) == 0) {
593*789Sahrens 		/* failed to upgrade */
594*789Sahrens 		int old_prefix_len = l->lh_prefix_len;
595*789Sahrens 		objset_t *os = zap->zap_objset;
596*789Sahrens 		uint64_t object = zap->zap_object;
597*789Sahrens 
598*789Sahrens 		zap_put_leaf(l);
599*789Sahrens 		zap_unlockdir(zap);
600*789Sahrens 		err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
601*789Sahrens 		ASSERT3U(err, ==, 0);
602*789Sahrens 		ASSERT(!zap->zap_ismicro);
603*789Sahrens 		l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
604*789Sahrens 
605*789Sahrens 		if (l->lh_prefix_len != old_prefix_len)
606*789Sahrens 			/* it split while our locks were down */
607*789Sahrens 			return (l);
608*789Sahrens 	}
609*789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
610*789Sahrens 
611*789Sahrens 	if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
612*789Sahrens 		/* There's only one pointer to us. Chain on another leaf blk. */
613*789Sahrens 		(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
614*789Sahrens 		dprintf("chaining leaf %x/%d\n", l->lh_prefix,
615*789Sahrens 		    l->lh_prefix_len);
616*789Sahrens 		return (l);
617*789Sahrens 	}
618*789Sahrens 
619*789Sahrens 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
620*789Sahrens 
621*789Sahrens 	/* There's more than one pointer to us. Split this leaf. */
622*789Sahrens 	nl = zap_leaf_split(zap, l, tx);
623*789Sahrens 
624*789Sahrens 	/* set sibling pointers */
625*789Sahrens 	prefix_diff =
626*789Sahrens 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
627*789Sahrens 	sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
628*789Sahrens 	for (i = 0; i < (1ULL<<prefix_diff); i++) {
629*789Sahrens 		ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
630*789Sahrens 		zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
631*789Sahrens 		/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
632*789Sahrens 	}
633*789Sahrens 
634*789Sahrens 	zap->zap_f.zap_phys->zap_num_leafs++;
635*789Sahrens 
636*789Sahrens 	if (hash & (1ULL << (64 - l->lh_prefix_len))) {
637*789Sahrens 		/* we want the sibling */
638*789Sahrens 		zap_put_leaf(l);
639*789Sahrens 		l = nl;
640*789Sahrens 	} else {
641*789Sahrens 		zap_put_leaf(nl);
642*789Sahrens 	}
643*789Sahrens 
644*789Sahrens 	return (l);
645*789Sahrens }
646*789Sahrens 
647*789Sahrens static void
648*789Sahrens zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap,
649*789Sahrens     zap_leaf_t *l, dmu_tx_t *tx)
650*789Sahrens {
651*789Sahrens 	int shift, err;
652*789Sahrens 
653*789Sahrens again:
654*789Sahrens 	shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
655*789Sahrens 
656*789Sahrens 	if (l->lh_prefix_len == shift &&
657*789Sahrens 	    (l->l_next != NULL || l->lh_nfree < MIN_FREE)) {
658*789Sahrens 		/* this leaf will soon make us grow the pointer table */
659*789Sahrens 
660*789Sahrens 		if (zap_tryupgradedir(zap, tx) == 0) {
661*789Sahrens 			objset_t *os = zap->zap_objset;
662*789Sahrens 			uint64_t zapobj = zap->zap_object;
663*789Sahrens 			uint64_t blkid = l->l_blkid;
664*789Sahrens 
665*789Sahrens 			zap_put_leaf(l);
666*789Sahrens 			zap_unlockdir(zap);
667*789Sahrens 			err = zap_lockdir(os, zapobj, tx,
668*789Sahrens 			    RW_WRITER, FALSE, &zap);
669*789Sahrens 			ASSERT3U(err, ==, 0);
670*789Sahrens 			l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
671*789Sahrens 			goto again;
672*789Sahrens 		}
673*789Sahrens 
674*789Sahrens 		zap_put_leaf(l);
675*789Sahrens 		zap_grow_ptrtbl(zap, tx);
676*789Sahrens 	} else {
677*789Sahrens 		zap_put_leaf(l);
678*789Sahrens 	}
679*789Sahrens }
680*789Sahrens 
681*789Sahrens 
682*789Sahrens static int
683*789Sahrens fzap_checksize(uint64_t integer_size, uint64_t num_integers)
684*789Sahrens {
685*789Sahrens 	/* Only integer sizes supported by C */
686*789Sahrens 	switch (integer_size) {
687*789Sahrens 	case 1:
688*789Sahrens 	case 2:
689*789Sahrens 	case 4:
690*789Sahrens 	case 8:
691*789Sahrens 		break;
692*789Sahrens 	default:
693*789Sahrens 		return (EINVAL);
694*789Sahrens 	}
695*789Sahrens 
696*789Sahrens 	/* Make sure we won't overflow */
697*789Sahrens 	if (integer_size * num_integers < num_integers)
698*789Sahrens 		return (EINVAL);
699*789Sahrens 	if (integer_size * num_integers > DMU_MAX_ACCESS)
700*789Sahrens 		return (EINVAL);
701*789Sahrens 
702*789Sahrens 	return (0);
703*789Sahrens }
704*789Sahrens 
705*789Sahrens /*
706*789Sahrens  * Routines for maniplulating attributes.
707*789Sahrens  */
708*789Sahrens int
709*789Sahrens fzap_lookup(zap_t *zap, const char *name,
710*789Sahrens     uint64_t integer_size, uint64_t num_integers, void *buf)
711*789Sahrens {
712*789Sahrens 	zap_leaf_t *l;
713*789Sahrens 	int err;
714*789Sahrens 	uint64_t hash;
715*789Sahrens 	zap_entry_handle_t zeh;
716*789Sahrens 
717*789Sahrens 	err = fzap_checksize(integer_size, num_integers);
718*789Sahrens 	if (err != 0)
719*789Sahrens 		return (err);
720*789Sahrens 
721*789Sahrens 	hash = zap_hash(zap, name);
722*789Sahrens 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
723*789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
724*789Sahrens 	if (err != 0)
725*789Sahrens 		goto out;
726*789Sahrens 	err = zap_entry_read(&zeh, integer_size, num_integers, buf);
727*789Sahrens out:
728*789Sahrens 	zap_put_leaf(l);
729*789Sahrens 	return (err);
730*789Sahrens }
731*789Sahrens 
732*789Sahrens int
733*789Sahrens fzap_add_cd(zap_t *zap, const char *name,
734*789Sahrens     uint64_t integer_size, uint64_t num_integers,
735*789Sahrens     const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
736*789Sahrens {
737*789Sahrens 	zap_leaf_t *l;
738*789Sahrens 	uint64_t hash;
739*789Sahrens 	int err;
740*789Sahrens 	zap_entry_handle_t zeh;
741*789Sahrens 
742*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
743*789Sahrens 	ASSERT(!zap->zap_ismicro);
744*789Sahrens 	ASSERT(fzap_checksize(integer_size, num_integers) == 0);
745*789Sahrens 
746*789Sahrens 	hash = zap_hash(zap, name);
747*789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
748*789Sahrens retry:
749*789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
750*789Sahrens 	if (err == 0) {
751*789Sahrens 		err = EEXIST;
752*789Sahrens 		goto out;
753*789Sahrens 	}
754*789Sahrens 	ASSERT(err == ENOENT);
755*789Sahrens 
756*789Sahrens 	/* XXX If this leaf is chained, split it if we can. */
757*789Sahrens 	err = zap_entry_create(l, name, hash, cd,
758*789Sahrens 	    integer_size, num_integers, val, &zeh);
759*789Sahrens 
760*789Sahrens 	if (err == 0) {
761*789Sahrens 		zap_increment_num_entries(zap, 1, tx);
762*789Sahrens 	} else if (err == EAGAIN) {
763*789Sahrens 		l = zap_expand_leaf(zap, l, hash, tx);
764*789Sahrens 		goto retry;
765*789Sahrens 	}
766*789Sahrens 
767*789Sahrens out:
768*789Sahrens 	if (lp)
769*789Sahrens 		*lp = l;
770*789Sahrens 	else
771*789Sahrens 		zap_put_leaf(l);
772*789Sahrens 	return (err);
773*789Sahrens }
774*789Sahrens 
775*789Sahrens int
776*789Sahrens fzap_add(zap_t *zap, const char *name,
777*789Sahrens     uint64_t integer_size, uint64_t num_integers,
778*789Sahrens     const void *val, dmu_tx_t *tx)
779*789Sahrens {
780*789Sahrens 	int err;
781*789Sahrens 	zap_leaf_t *l;
782*789Sahrens 
783*789Sahrens 	err = fzap_checksize(integer_size, num_integers);
784*789Sahrens 	if (err != 0)
785*789Sahrens 		return (err);
786*789Sahrens 
787*789Sahrens 	err = fzap_add_cd(zap, name, integer_size, num_integers,
788*789Sahrens 	    val, ZAP_MAXCD, tx, &l);
789*789Sahrens 
790*789Sahrens 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
791*789Sahrens 	return (err);
792*789Sahrens }
793*789Sahrens 
794*789Sahrens int
795*789Sahrens fzap_update(zap_t *zap, const char *name,
796*789Sahrens     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
797*789Sahrens {
798*789Sahrens 	zap_leaf_t *l;
799*789Sahrens 	uint64_t hash;
800*789Sahrens 	int err, create;
801*789Sahrens 	zap_entry_handle_t zeh;
802*789Sahrens 
803*789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
804*789Sahrens 	err = fzap_checksize(integer_size, num_integers);
805*789Sahrens 	if (err != 0)
806*789Sahrens 		return (err);
807*789Sahrens 
808*789Sahrens 	hash = zap_hash(zap, name);
809*789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
810*789Sahrens retry:
811*789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
812*789Sahrens 	create = (err == ENOENT);
813*789Sahrens 	ASSERT(err == 0 || err == ENOENT);
814*789Sahrens 
815*789Sahrens 	/* XXX If this leaf is chained, split it if we can. */
816*789Sahrens 
817*789Sahrens 	if (create) {
818*789Sahrens 		err = zap_entry_create(l, name, hash, ZAP_MAXCD,
819*789Sahrens 		    integer_size, num_integers, val, &zeh);
820*789Sahrens 		if (err == 0)
821*789Sahrens 			zap_increment_num_entries(zap, 1, tx);
822*789Sahrens 	} else {
823*789Sahrens 		err = zap_entry_update(&zeh, integer_size, num_integers, val);
824*789Sahrens 	}
825*789Sahrens 
826*789Sahrens 	if (err == EAGAIN) {
827*789Sahrens 		l = zap_expand_leaf(zap, l, hash, tx);
828*789Sahrens 		goto retry;
829*789Sahrens 	}
830*789Sahrens 
831*789Sahrens 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
832*789Sahrens 	return (err);
833*789Sahrens }
834*789Sahrens 
835*789Sahrens int
836*789Sahrens fzap_length(zap_t *zap, const char *name,
837*789Sahrens     uint64_t *integer_size, uint64_t *num_integers)
838*789Sahrens {
839*789Sahrens 	zap_leaf_t *l;
840*789Sahrens 	int err;
841*789Sahrens 	uint64_t hash;
842*789Sahrens 	zap_entry_handle_t zeh;
843*789Sahrens 
844*789Sahrens 	hash = zap_hash(zap, name);
845*789Sahrens 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
846*789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
847*789Sahrens 	if (err != 0)
848*789Sahrens 		goto out;
849*789Sahrens 
850*789Sahrens 	if (integer_size)
851*789Sahrens 		*integer_size = zeh.zeh_integer_size;
852*789Sahrens 	if (num_integers)
853*789Sahrens 		*num_integers = zeh.zeh_num_integers;
854*789Sahrens out:
855*789Sahrens 	zap_put_leaf(l);
856*789Sahrens 	return (err);
857*789Sahrens }
858*789Sahrens 
859*789Sahrens int
860*789Sahrens fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
861*789Sahrens {
862*789Sahrens 	zap_leaf_t *l;
863*789Sahrens 	uint64_t hash;
864*789Sahrens 	int err;
865*789Sahrens 	zap_entry_handle_t zeh;
866*789Sahrens 
867*789Sahrens 	hash = zap_hash(zap, name);
868*789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
869*789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
870*789Sahrens 	if (err == 0) {
871*789Sahrens 		zap_entry_remove(&zeh);
872*789Sahrens 		zap_increment_num_entries(zap, -1, tx);
873*789Sahrens 	}
874*789Sahrens 	zap_put_leaf(l);
875*789Sahrens 	dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
876*789Sahrens 	    zap->zap_objset, zap->zap_object, name, err);
877*789Sahrens 	return (err);
878*789Sahrens }
879*789Sahrens 
880*789Sahrens int
881*789Sahrens zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
882*789Sahrens {
883*789Sahrens 	zap_cursor_t zc;
884*789Sahrens 	zap_attribute_t *za;
885*789Sahrens 	int err;
886*789Sahrens 
887*789Sahrens 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
888*789Sahrens 	for (zap_cursor_init(&zc, os, zapobj);
889*789Sahrens 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
890*789Sahrens 	    zap_cursor_advance(&zc)) {
891*789Sahrens 		if (za->za_first_integer == value) {
892*789Sahrens 			(void) strcpy(name, za->za_name);
893*789Sahrens 			break;
894*789Sahrens 		}
895*789Sahrens 	}
896*789Sahrens 	kmem_free(za, sizeof (zap_attribute_t));
897*789Sahrens 	return (err);
898*789Sahrens }
899*789Sahrens 
900*789Sahrens 
901*789Sahrens /*
902*789Sahrens  * Routines for iterating over the attributes.
903*789Sahrens  */
904*789Sahrens 
905*789Sahrens int
906*789Sahrens fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
907*789Sahrens {
908*789Sahrens 	int err = ENOENT;
909*789Sahrens 	zap_entry_handle_t zeh;
910*789Sahrens 	zap_leaf_t *l;
911*789Sahrens 
912*789Sahrens 	/* retrieve the next entry at or after zc_hash/zc_cd */
913*789Sahrens 	/* if no entry, return ENOENT */
914*789Sahrens 
915*789Sahrens again:
916*789Sahrens 	l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
917*789Sahrens 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
918*789Sahrens 
919*789Sahrens 	if (err == ENOENT) {
920*789Sahrens 		uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
921*789Sahrens 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
922*789Sahrens 		zc->zc_cd = 0;
923*789Sahrens 		if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
924*789Sahrens 			zc->zc_hash = -1ULL;
925*789Sahrens 		} else {
926*789Sahrens 			zap_put_leaf(l);
927*789Sahrens 			goto again;
928*789Sahrens 		}
929*789Sahrens 	}
930*789Sahrens 
931*789Sahrens 	if (err == 0) {
932*789Sahrens 		zc->zc_hash = zeh.zeh_hash;
933*789Sahrens 		zc->zc_cd = zeh.zeh_cd;
934*789Sahrens 		za->za_integer_length = zeh.zeh_integer_size;
935*789Sahrens 		za->za_num_integers = zeh.zeh_num_integers;
936*789Sahrens 		if (zeh.zeh_num_integers == 0) {
937*789Sahrens 			za->za_first_integer = 0;
938*789Sahrens 		} else {
939*789Sahrens 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
940*789Sahrens 			ASSERT(err == 0 || err == EOVERFLOW);
941*789Sahrens 		}
942*789Sahrens 		err = zap_entry_read_name(&zeh,
943*789Sahrens 		    sizeof (za->za_name), za->za_name);
944*789Sahrens 		ASSERT(err == 0);
945*789Sahrens 	}
946*789Sahrens 	zap_put_leaf(l);
947*789Sahrens 	return (err);
948*789Sahrens }
949*789Sahrens 
950*789Sahrens 
951*789Sahrens static void
952*789Sahrens zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
953*789Sahrens {
954*789Sahrens 	int i;
955*789Sahrens 	uint64_t lastblk = 0;
956*789Sahrens 
957*789Sahrens 	/*
958*789Sahrens 	 * NB: if a leaf has more pointers than an entire ptrtbl block
959*789Sahrens 	 * can hold, then it'll be accounted for more than once, since
960*789Sahrens 	 * we won't have lastblk.
961*789Sahrens 	 */
962*789Sahrens 	for (i = 0; i < len; i++) {
963*789Sahrens 		zap_leaf_t *l;
964*789Sahrens 
965*789Sahrens 		if (tbl[i] == lastblk)
966*789Sahrens 			continue;
967*789Sahrens 		lastblk = tbl[i];
968*789Sahrens 
969*789Sahrens 		l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
970*789Sahrens 
971*789Sahrens 		zap_stats_leaf(zap, l, zs);
972*789Sahrens 		zap_put_leaf(l);
973*789Sahrens 	}
974*789Sahrens }
975*789Sahrens 
976*789Sahrens void
977*789Sahrens fzap_get_stats(zap_t *zap, zap_stats_t *zs)
978*789Sahrens {
979*789Sahrens 	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
980*789Sahrens 	zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT;
981*789Sahrens 	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
982*789Sahrens 	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
983*789Sahrens 	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
984*789Sahrens 
985*789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
986*789Sahrens 		/* the ptrtbl is entirely in the header block. */
987*789Sahrens 		zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs,
988*789Sahrens 		    1 << ZAP_PTRTBL_MIN_SHIFT, zs);
989*789Sahrens 	} else {
990*789Sahrens 		int b;
991*789Sahrens 
992*789Sahrens 		dmu_prefetch(zap->zap_objset, zap->zap_object,
993*789Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT,
994*789Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
995*789Sahrens 			ZAP_BLOCK_SHIFT);
996*789Sahrens 
997*789Sahrens 		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
998*789Sahrens 		    b++) {
999*789Sahrens 			dmu_buf_t *db;
1000*789Sahrens 
1001*789Sahrens 			db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1002*789Sahrens 			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) <<
1003*789Sahrens 			    ZAP_BLOCK_SHIFT);
1004*789Sahrens 			dmu_buf_read(db);
1005*789Sahrens 			zap_stats_ptrtbl(zap, db->db_data,
1006*789Sahrens 			    1<<(ZAP_BLOCK_SHIFT-3), zs);
1007*789Sahrens 			dmu_buf_rele(db);
1008*789Sahrens 		}
1009*789Sahrens 	}
1010*789Sahrens }
1011