xref: /onnv-gate/usr/src/uts/common/fs/zfs/zap.c (revision 1491:bdcb30e07e7d)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
5*1491Sahrens  * Common Development and Distribution License (the "License").
6*1491Sahrens  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21789Sahrens /*
22*1491Sahrens  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23789Sahrens  * Use is subject to license terms.
24789Sahrens  */
25789Sahrens 
26789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27789Sahrens 
28789Sahrens 
/*
 * This file contains the top half of the zfs directory structure
 * implementation. The bottom half is in zap_leaf.c.
 *
 * The zdir is an extendible hash data structure. There is a table of
 * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
 * each a constant size and hold a variable number of directory entries.
 * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
 *
 * The pointer table holds a power of 2 number of pointers
 * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
 * by the pointer at index i in the table holds entries whose hash value
 * has a zd_prefix_len-bit prefix equal to i.
 */
43789Sahrens 
44789Sahrens #include <sys/spa.h>
45789Sahrens #include <sys/dmu.h>
46789Sahrens #include <sys/zfs_context.h>
47789Sahrens #include <sys/zap.h>
48789Sahrens #include <sys/zap_impl.h>
49789Sahrens #include <sys/zap_leaf.h>
50789Sahrens 
51*1491Sahrens #define	MIN_FREE(l) (ZAP_LEAF_NUMCHUNKS(l)*9/10)
52*1491Sahrens 
53*1491Sahrens int fzap_default_block_shift = 14; /* 16k blocksize */
54789Sahrens 
55789Sahrens static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
56789Sahrens static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
57789Sahrens static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
58789Sahrens     dmu_tx_t *tx, krw_t lt);
59789Sahrens static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
60789Sahrens 
61789Sahrens 
62789Sahrens void
63789Sahrens fzap_byteswap(void *vbuf, size_t size)
64789Sahrens {
65789Sahrens 	uint64_t block_type;
66789Sahrens 
67789Sahrens 	block_type = *(uint64_t *)vbuf;
68789Sahrens 
69789Sahrens 	switch (block_type) {
70789Sahrens 	case ZBT_LEAF:
71789Sahrens 	case BSWAP_64(ZBT_LEAF):
72*1491Sahrens 		zap_leaf_byteswap(vbuf, size);
73789Sahrens 		return;
74789Sahrens 	case ZBT_HEADER:
75789Sahrens 	case BSWAP_64(ZBT_HEADER):
76789Sahrens 	default:
77789Sahrens 		/* it's a ptrtbl block */
78*1491Sahrens 		byteswap_uint64_array(vbuf, size);
79789Sahrens 		return;
80789Sahrens 	}
81789Sahrens }
82789Sahrens 
83789Sahrens void
84789Sahrens fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
85789Sahrens {
86789Sahrens 	dmu_buf_t *db;
87789Sahrens 	zap_leaf_t *l;
88789Sahrens 	int i;
89789Sahrens 	zap_phys_t *zp;
90789Sahrens 
91789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
92789Sahrens 	zap->zap_ismicro = FALSE;
93789Sahrens 
94789Sahrens 	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
95789Sahrens 	    &zap->zap_f.zap_phys, zap_pageout);
96789Sahrens 
97789Sahrens 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
98*1491Sahrens 	zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
99789Sahrens 
100789Sahrens 	zp = zap->zap_f.zap_phys;
101789Sahrens 	/*
102789Sahrens 	 * explicitly zero it since it might be coming from an
103789Sahrens 	 * initialized microzap
104789Sahrens 	 */
105*1491Sahrens 	bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
106789Sahrens 	zp->zap_block_type = ZBT_HEADER;
107789Sahrens 	zp->zap_magic = ZAP_MAGIC;
108789Sahrens 
109*1491Sahrens 	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
110789Sahrens 
111789Sahrens 	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
112789Sahrens 	zp->zap_num_leafs = 1;
113789Sahrens 	zp->zap_num_entries = 0;
114789Sahrens 	zp->zap_salt = zap->zap_salt;
115789Sahrens 
116*1491Sahrens 	/* block 1 will be the first leaf */
117*1491Sahrens 	for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
118*1491Sahrens 		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
119789Sahrens 
120789Sahrens 	/*
121789Sahrens 	 * set up block 1 - the first leaf
122789Sahrens 	 */
123789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
124*1491Sahrens 	    1<<FZAP_BLOCK_SHIFT(zap));
125789Sahrens 	dmu_buf_will_dirty(db, tx);
126789Sahrens 
127789Sahrens 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
128789Sahrens 	l->l_dbuf = db;
129789Sahrens 	l->l_phys = db->db_data;
130789Sahrens 
131789Sahrens 	zap_leaf_init(l);
132789Sahrens 
133789Sahrens 	kmem_free(l, sizeof (zap_leaf_t));
134789Sahrens 	dmu_buf_rele(db);
135789Sahrens }
136789Sahrens 
137789Sahrens static int
138789Sahrens zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
139789Sahrens {
140789Sahrens 	if (RW_WRITE_HELD(&zap->zap_rwlock))
141789Sahrens 		return (1);
142789Sahrens 	if (rw_tryupgrade(&zap->zap_rwlock)) {
143789Sahrens 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
144789Sahrens 		return (1);
145789Sahrens 	}
146789Sahrens 	return (0);
147789Sahrens }
148789Sahrens 
149789Sahrens /*
150789Sahrens  * Generic routines for dealing with the pointer & cookie tables.
151789Sahrens  */
152789Sahrens 
153789Sahrens static void
154789Sahrens zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
155789Sahrens     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
156789Sahrens     dmu_tx_t *tx)
157789Sahrens {
158789Sahrens 	uint64_t b, newblk;
159789Sahrens 	dmu_buf_t *db_old, *db_new;
160*1491Sahrens 	int bs = FZAP_BLOCK_SHIFT(zap);
161*1491Sahrens 	int hepb = 1<<(bs-4);
162789Sahrens 	/* hepb = half the number of entries in a block */
163789Sahrens 
164789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
165789Sahrens 	ASSERT(tbl->zt_blk != 0);
166789Sahrens 	ASSERT(tbl->zt_numblks > 0);
167789Sahrens 
168789Sahrens 	if (tbl->zt_nextblk != 0) {
169789Sahrens 		newblk = tbl->zt_nextblk;
170789Sahrens 	} else {
171789Sahrens 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
172789Sahrens 		tbl->zt_nextblk = newblk;
173789Sahrens 		ASSERT3U(tbl->zt_blks_copied, ==, 0);
174789Sahrens 		dmu_prefetch(zap->zap_objset, zap->zap_object,
175*1491Sahrens 		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
176789Sahrens 	}
177789Sahrens 
178789Sahrens 	/*
179789Sahrens 	 * Copy the ptrtbl from the old to new location, leaving the odd
180789Sahrens 	 * entries blank as we go.
181789Sahrens 	 */
182789Sahrens 
183789Sahrens 	b = tbl->zt_blks_copied;
184789Sahrens 	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
185*1491Sahrens 	    (tbl->zt_blk + b) << bs);
186789Sahrens 	dmu_buf_read(db_old);
187789Sahrens 
188789Sahrens 	/* first half of entries in old[b] go to new[2*b+0] */
189789Sahrens 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
190*1491Sahrens 	    (newblk + 2*b+0) << bs);
191789Sahrens 	dmu_buf_will_dirty(db_new, tx);
192789Sahrens 	transfer_func(db_old->db_data, db_new->db_data, hepb);
193789Sahrens 	dmu_buf_rele(db_new);
194789Sahrens 
195789Sahrens 	/* second half of entries in old[b] go to new[2*b+1] */
196789Sahrens 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
197*1491Sahrens 	    (newblk + 2*b+1) << bs);
198789Sahrens 	dmu_buf_will_dirty(db_new, tx);
199789Sahrens 	transfer_func((uint64_t *)db_old->db_data + hepb,
200789Sahrens 	    db_new->db_data, hepb);
201789Sahrens 	dmu_buf_rele(db_new);
202789Sahrens 
203789Sahrens 	dmu_buf_rele(db_old);
204789Sahrens 
205789Sahrens 	tbl->zt_blks_copied++;
206789Sahrens 
207789Sahrens 	dprintf("copied block %llu of %llu\n",
208789Sahrens 	    tbl->zt_blks_copied, tbl->zt_numblks);
209789Sahrens 
210789Sahrens 	if (tbl->zt_blks_copied == tbl->zt_numblks) {
211789Sahrens 		dmu_free_range(zap->zap_objset, zap->zap_object,
212*1491Sahrens 		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
213789Sahrens 
214789Sahrens 		tbl->zt_blk = newblk;
215789Sahrens 		tbl->zt_numblks *= 2;
216789Sahrens 		tbl->zt_shift++;
217789Sahrens 		tbl->zt_nextblk = 0;
218789Sahrens 		tbl->zt_blks_copied = 0;
219789Sahrens 
220789Sahrens 		dprintf("finished; numblocks now %llu (%lluk entries)\n",
221789Sahrens 		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
222789Sahrens 	}
223789Sahrens }
224789Sahrens 
225789Sahrens static uint64_t
226789Sahrens zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
227789Sahrens     dmu_tx_t *tx)
228789Sahrens {
229789Sahrens 	uint64_t blk, off, oldval;
230789Sahrens 	dmu_buf_t *db;
231*1491Sahrens 	int bs = FZAP_BLOCK_SHIFT(zap);
232789Sahrens 
233789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
234789Sahrens 	ASSERT(tbl->zt_blk != 0);
235789Sahrens 
236789Sahrens 	dprintf("storing %llx at index %llx\n", val, idx);
237789Sahrens 
238*1491Sahrens 	blk = idx >> (bs-3);
239*1491Sahrens 	off = idx & ((1<<(bs-3))-1);
240789Sahrens 
241789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
242*1491Sahrens 	    (tbl->zt_blk + blk) << bs);
243789Sahrens 	dmu_buf_will_dirty(db, tx);
244789Sahrens 	oldval = ((uint64_t *)db->db_data)[off];
245789Sahrens 	((uint64_t *)db->db_data)[off] = val;
246789Sahrens 	dmu_buf_rele(db);
247789Sahrens 
248789Sahrens 	if (tbl->zt_nextblk != 0) {
249789Sahrens 		idx *= 2;
250*1491Sahrens 		blk = idx >> (bs-3);
251*1491Sahrens 		off = idx & ((1<<(bs-3))-1);
252789Sahrens 
253789Sahrens 		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
254*1491Sahrens 		    (tbl->zt_nextblk + blk) << bs);
255789Sahrens 		dmu_buf_will_dirty(db, tx);
256789Sahrens 		((uint64_t *)db->db_data)[off] = val;
257789Sahrens 		((uint64_t *)db->db_data)[off+1] = val;
258789Sahrens 		dmu_buf_rele(db);
259789Sahrens 	}
260789Sahrens 
261789Sahrens 	return (oldval);
262789Sahrens }
263789Sahrens 
264789Sahrens static uint64_t
265789Sahrens zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
266789Sahrens {
267789Sahrens 	uint64_t blk, off, val;
268789Sahrens 	dmu_buf_t *db;
269*1491Sahrens 	int bs = FZAP_BLOCK_SHIFT(zap);
270789Sahrens 
271789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
272789Sahrens 
273*1491Sahrens 	blk = idx >> (bs-3);
274*1491Sahrens 	off = idx & ((1<<(bs-3))-1);
275789Sahrens 
276789Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
277*1491Sahrens 	    (tbl->zt_blk + blk) << bs);
278789Sahrens 	dmu_buf_read(db);
279789Sahrens 	val = ((uint64_t *)db->db_data)[off];
280789Sahrens 	dmu_buf_rele(db);
281789Sahrens 	return (val);
282789Sahrens }
283789Sahrens 
284789Sahrens /*
285789Sahrens  * Routines for growing the ptrtbl.
286789Sahrens  */
287789Sahrens 
/*
 * Expand `n` pointer-table entries from `src` into `2 * n` entries at
 * `dst`: source entry k is written to dst[2k] and dst[2k+1].
 * Nothing is written when n <= 0.
 */
static void
zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
{
	int k = 0;

	while (k < n) {
		const uint64_t blk = src[k];

		dst[2*k] = blk;
		dst[2*k + 1] = blk;
		k++;
	}
}
298789Sahrens 
299789Sahrens static void
300789Sahrens zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
301789Sahrens {
302789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
303789Sahrens 		return;
304789Sahrens 
305789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
306789Sahrens 		/*
307*1491Sahrens 		 * We are outgrowing the "embedded" ptrtbl (the one
308*1491Sahrens 		 * stored in the header block).  Give it its own entire
309*1491Sahrens 		 * block, which will double the size of the ptrtbl.
310789Sahrens 		 */
311789Sahrens 		uint64_t newblk;
312789Sahrens 		dmu_buf_t *db_new;
313789Sahrens 
314789Sahrens 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
315*1491Sahrens 		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
316789Sahrens 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
317789Sahrens 
318789Sahrens 		newblk = zap_allocate_blocks(zap, 1, tx);
319789Sahrens 		db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
320*1491Sahrens 		    newblk << FZAP_BLOCK_SHIFT(zap));
321789Sahrens 
322789Sahrens 		dmu_buf_will_dirty(db_new, tx);
323*1491Sahrens 		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
324*1491Sahrens 		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
325789Sahrens 		dmu_buf_rele(db_new);
326789Sahrens 
327789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
328789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
329789Sahrens 		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
330789Sahrens 
331789Sahrens 		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
332789Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
333*1491Sahrens 		    (FZAP_BLOCK_SHIFT(zap)-3));
334789Sahrens 	} else {
335789Sahrens 		zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
336789Sahrens 		    zap_ptrtbl_transfer, tx);
337789Sahrens 	}
338789Sahrens }
339789Sahrens 
340789Sahrens static void
341789Sahrens zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
342789Sahrens {
343789Sahrens 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
344789Sahrens 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
345789Sahrens 
346789Sahrens 	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
347789Sahrens 
348789Sahrens 	zap->zap_f.zap_phys->zap_num_entries += delta;
349789Sahrens 
350789Sahrens 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
351789Sahrens }
352789Sahrens 
353789Sahrens uint64_t
354789Sahrens zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
355789Sahrens {
356789Sahrens 	uint64_t newblk;
357789Sahrens 	ASSERT(tx != NULL);
358789Sahrens 	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
359789Sahrens 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
360789Sahrens 	}
361789Sahrens 	newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
362789Sahrens 	    nblocks;
363789Sahrens 	return (newblk);
364789Sahrens }
365789Sahrens 
366789Sahrens 
367789Sahrens /*
368789Sahrens  * This function doesn't increment zap_num_leafs because it's used to
369789Sahrens  * allocate a leaf chain, which doesn't count against zap_num_leafs.
370789Sahrens  * The directory must be held exclusively for this tx.
371789Sahrens  */
372789Sahrens zap_leaf_t *
373789Sahrens zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
374789Sahrens {
375789Sahrens 	void *winner;
376789Sahrens 	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
377789Sahrens 
378789Sahrens 	ASSERT(tx != NULL);
379789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
380789Sahrens 	/* hence we already dirtied zap->zap_dbuf */
381789Sahrens 
382789Sahrens 	rw_init(&l->l_rwlock, 0, 0, 0);
383789Sahrens 	rw_enter(&l->l_rwlock, RW_WRITER);
384789Sahrens 	l->l_blkid = zap_allocate_blocks(zap, 1, tx);
385789Sahrens 	l->l_next = NULL;
386789Sahrens 	l->l_dbuf = NULL;
387789Sahrens 	l->l_phys = NULL;
388789Sahrens 
389789Sahrens 	l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
390*1491Sahrens 	    l->l_blkid << FZAP_BLOCK_SHIFT(zap));
391789Sahrens 	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
392789Sahrens 	ASSERT(winner == NULL);
393789Sahrens 	dmu_buf_will_dirty(l->l_dbuf, tx);
394789Sahrens 
395789Sahrens 	zap_leaf_init(l);
396789Sahrens 
397789Sahrens 	return (l);
398789Sahrens }
399789Sahrens 
400789Sahrens /* ARGSUSED */
401789Sahrens void
402789Sahrens zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
403789Sahrens {
404789Sahrens 	/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
405789Sahrens 	rw_exit(&l->l_rwlock);
406789Sahrens 	dmu_buf_rele(l->l_dbuf);
407789Sahrens 	/* XXX there are still holds on this block, so we can't free it? */
408789Sahrens 	/* dmu_free_range(zap->zap_objset, zap->zap_object, */
409789Sahrens 	    /* offset,  1<<ZAP_BLOCK_SHIFT, tx); */
410789Sahrens }
411789Sahrens 
412789Sahrens int
413789Sahrens fzap_count(zap_t *zap, uint64_t *count)
414789Sahrens {
415789Sahrens 	ASSERT(!zap->zap_ismicro);
416789Sahrens 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
417789Sahrens 	*count = zap->zap_f.zap_phys->zap_num_entries;
418789Sahrens 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
419789Sahrens 	return (0);
420789Sahrens }
421789Sahrens 
422789Sahrens /*
423789Sahrens  * Routines for obtaining zap_leaf_t's
424789Sahrens  */
425789Sahrens 
426885Sahrens void
427789Sahrens zap_put_leaf(zap_leaf_t *l)
428789Sahrens {
429789Sahrens 	zap_leaf_t *nl = l->l_next;
430789Sahrens 	while (nl) {
431789Sahrens 		zap_leaf_t *nnl = nl->l_next;
432789Sahrens 		rw_exit(&nl->l_rwlock);
433789Sahrens 		dmu_buf_rele(nl->l_dbuf);
434789Sahrens 		nl = nnl;
435789Sahrens 	}
436789Sahrens 	rw_exit(&l->l_rwlock);
437789Sahrens 	dmu_buf_rele(l->l_dbuf);
438789Sahrens }
439789Sahrens 
440789Sahrens _NOTE(ARGSUSED(0))
441789Sahrens static void
442789Sahrens zap_leaf_pageout(dmu_buf_t *db, void *vl)
443789Sahrens {
444789Sahrens 	zap_leaf_t *l = vl;
445789Sahrens 
446789Sahrens 	rw_destroy(&l->l_rwlock);
447789Sahrens 	kmem_free(l, sizeof (zap_leaf_t));
448789Sahrens }
449789Sahrens 
450789Sahrens static zap_leaf_t *
451789Sahrens zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
452789Sahrens {
453789Sahrens 	zap_leaf_t *l, *winner;
454789Sahrens 
455789Sahrens 	ASSERT(blkid != 0);
456789Sahrens 
457789Sahrens 	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
458789Sahrens 	rw_init(&l->l_rwlock, 0, 0, 0);
459789Sahrens 	rw_enter(&l->l_rwlock, RW_WRITER);
460789Sahrens 	l->l_blkid = blkid;
461*1491Sahrens 	l->l_bs = highbit(db->db_size)-1;
462789Sahrens 	l->l_next = NULL;
463789Sahrens 	l->l_dbuf = db;
464789Sahrens 	l->l_phys = NULL;
465789Sahrens 
466789Sahrens 	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
467789Sahrens 
468789Sahrens 	rw_exit(&l->l_rwlock);
469789Sahrens 	if (winner != NULL) {
470789Sahrens 		/* someone else set it first */
471789Sahrens 		zap_leaf_pageout(NULL, l);
472789Sahrens 		l = winner;
473789Sahrens 	}
474789Sahrens 
475*1491Sahrens 	/*
476*1491Sahrens 	 * There should be more hash entries than there can be
477*1491Sahrens 	 * chunks to put in the hash table
478*1491Sahrens 	 */
479*1491Sahrens 	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
480*1491Sahrens 
481*1491Sahrens 	/* The chunks should begin at the end of the hash table */
482*1491Sahrens 	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
483*1491Sahrens 	    &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
484*1491Sahrens 
485*1491Sahrens 	/* The chunks should end at the end of the block */
486*1491Sahrens 	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
487*1491Sahrens 	    (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
488*1491Sahrens 
489789Sahrens 	return (l);
490789Sahrens }
491789Sahrens 
492789Sahrens static zap_leaf_t *
493789Sahrens zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
494789Sahrens {
495789Sahrens 	dmu_buf_t *db;
496789Sahrens 	zap_leaf_t *l;
497*1491Sahrens 	int bs = FZAP_BLOCK_SHIFT(zap);
498789Sahrens 
499789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
500789Sahrens 
501*1491Sahrens 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs);
502789Sahrens 
503789Sahrens 	ASSERT3U(db->db_object, ==, zap->zap_object);
504*1491Sahrens 	ASSERT3U(db->db_offset, ==, blkid << bs);
505*1491Sahrens 	ASSERT3U(db->db_size, ==, 1 << bs);
506789Sahrens 	ASSERT(blkid != 0);
507789Sahrens 
508789Sahrens 	dmu_buf_read(db);
509789Sahrens 	l = dmu_buf_get_user(db);
510789Sahrens 
511789Sahrens 	if (l == NULL)
512789Sahrens 		l = zap_open_leaf(blkid, db);
513789Sahrens 
514789Sahrens 	rw_enter(&l->l_rwlock, lt);
515789Sahrens 	/*
516789Sahrens 	 * Must lock before dirtying, otherwise l->l_phys could change,
517789Sahrens 	 * causing ASSERT below to fail.
518789Sahrens 	 */
519789Sahrens 	if (lt == RW_WRITER)
520789Sahrens 		dmu_buf_will_dirty(db, tx);
521789Sahrens 	ASSERT3U(l->l_blkid, ==, blkid);
522789Sahrens 	ASSERT3P(l->l_dbuf, ==, db);
523789Sahrens 	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
524789Sahrens 	ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
525789Sahrens 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
526789Sahrens 
527789Sahrens 	return (l);
528789Sahrens }
529789Sahrens 
530789Sahrens static zap_leaf_t *
531789Sahrens zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
532789Sahrens {
533789Sahrens 	zap_leaf_t *l, *nl;
534789Sahrens 
535789Sahrens 	l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
536789Sahrens 
537789Sahrens 	nl = l;
538789Sahrens 	while (nl->lh_next != 0) {
539789Sahrens 		zap_leaf_t *nnl;
540789Sahrens 		nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
541789Sahrens 		nl->l_next = nnl;
542789Sahrens 		nl = nnl;
543789Sahrens 	}
544789Sahrens 
545789Sahrens 	return (l);
546789Sahrens }
547789Sahrens 
548789Sahrens static uint64_t
549789Sahrens zap_idx_to_blk(zap_t *zap, uint64_t idx)
550789Sahrens {
551789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
552789Sahrens 
553789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
554789Sahrens 		ASSERT3U(idx, <,
555789Sahrens 		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
556*1491Sahrens 		return (ZAP_EMBEDDED_PTRTBL_ENT(zap, idx));
557789Sahrens 	} else {
558789Sahrens 		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
559789Sahrens 		    idx));
560789Sahrens 	}
561789Sahrens }
562789Sahrens 
563789Sahrens static void
564789Sahrens zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
565789Sahrens {
566789Sahrens 	ASSERT(tx != NULL);
567789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
568789Sahrens 
569789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
570*1491Sahrens 		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
571789Sahrens 	} else {
572789Sahrens 		(void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
573789Sahrens 		    idx, blk, tx);
574789Sahrens 	}
575789Sahrens }
576789Sahrens 
577789Sahrens static zap_leaf_t *
578789Sahrens zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
579789Sahrens {
580789Sahrens 	uint64_t idx;
581789Sahrens 	zap_leaf_t *l;
582789Sahrens 
583789Sahrens 	ASSERT(zap->zap_dbuf == NULL ||
584789Sahrens 	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
585789Sahrens 	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
586789Sahrens 	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
587789Sahrens 	l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
588789Sahrens 
589789Sahrens 	ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
590789Sahrens 
591789Sahrens 	return (l);
592789Sahrens }
593789Sahrens 
594789Sahrens 
595789Sahrens static zap_leaf_t *
596789Sahrens zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
597789Sahrens {
598789Sahrens 	zap_leaf_t *nl;
599789Sahrens 	int prefix_diff, i, err;
600789Sahrens 	uint64_t sibling;
601789Sahrens 
602789Sahrens 	ASSERT3U(l->lh_prefix_len, <=,
603789Sahrens 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
604789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
605789Sahrens 
606789Sahrens 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
607789Sahrens 
608789Sahrens 	if (zap_tryupgradedir(zap, tx) == 0) {
609789Sahrens 		/* failed to upgrade */
610789Sahrens 		int old_prefix_len = l->lh_prefix_len;
611789Sahrens 		objset_t *os = zap->zap_objset;
612789Sahrens 		uint64_t object = zap->zap_object;
613789Sahrens 
614789Sahrens 		zap_put_leaf(l);
615789Sahrens 		zap_unlockdir(zap);
616789Sahrens 		err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
617789Sahrens 		ASSERT3U(err, ==, 0);
618789Sahrens 		ASSERT(!zap->zap_ismicro);
619789Sahrens 		l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
620789Sahrens 
621789Sahrens 		if (l->lh_prefix_len != old_prefix_len)
622789Sahrens 			/* it split while our locks were down */
623789Sahrens 			return (l);
624789Sahrens 	}
625789Sahrens 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
626789Sahrens 
627789Sahrens 	if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
628789Sahrens 		/* There's only one pointer to us. Chain on another leaf blk. */
629789Sahrens 		(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
630789Sahrens 		dprintf("chaining leaf %x/%d\n", l->lh_prefix,
631789Sahrens 		    l->lh_prefix_len);
632789Sahrens 		return (l);
633789Sahrens 	}
634789Sahrens 
635789Sahrens 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
636789Sahrens 
637789Sahrens 	/* There's more than one pointer to us. Split this leaf. */
638789Sahrens 	nl = zap_leaf_split(zap, l, tx);
639789Sahrens 
640789Sahrens 	/* set sibling pointers */
641789Sahrens 	prefix_diff =
642789Sahrens 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
643789Sahrens 	sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
644789Sahrens 	for (i = 0; i < (1ULL<<prefix_diff); i++) {
645789Sahrens 		ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
646789Sahrens 		zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
647789Sahrens 		/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
648789Sahrens 	}
649789Sahrens 
650789Sahrens 	zap->zap_f.zap_phys->zap_num_leafs++;
651789Sahrens 
652789Sahrens 	if (hash & (1ULL << (64 - l->lh_prefix_len))) {
653789Sahrens 		/* we want the sibling */
654789Sahrens 		zap_put_leaf(l);
655789Sahrens 		l = nl;
656789Sahrens 	} else {
657789Sahrens 		zap_put_leaf(nl);
658789Sahrens 	}
659789Sahrens 
660789Sahrens 	return (l);
661789Sahrens }
662789Sahrens 
663789Sahrens static void
664*1491Sahrens zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
665789Sahrens {
666789Sahrens 	int shift, err;
667789Sahrens 
668789Sahrens again:
669789Sahrens 	shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
670789Sahrens 
671789Sahrens 	if (l->lh_prefix_len == shift &&
672*1491Sahrens 	    (l->l_next != NULL || l->lh_nfree < MIN_FREE(l))) {
673789Sahrens 		/* this leaf will soon make us grow the pointer table */
674789Sahrens 
675789Sahrens 		if (zap_tryupgradedir(zap, tx) == 0) {
676789Sahrens 			objset_t *os = zap->zap_objset;
677789Sahrens 			uint64_t zapobj = zap->zap_object;
678789Sahrens 			uint64_t blkid = l->l_blkid;
679789Sahrens 
680789Sahrens 			zap_put_leaf(l);
681789Sahrens 			zap_unlockdir(zap);
682789Sahrens 			err = zap_lockdir(os, zapobj, tx,
683789Sahrens 			    RW_WRITER, FALSE, &zap);
684789Sahrens 			ASSERT3U(err, ==, 0);
685789Sahrens 			l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
686789Sahrens 			goto again;
687789Sahrens 		}
688789Sahrens 
689789Sahrens 		zap_put_leaf(l);
690789Sahrens 		zap_grow_ptrtbl(zap, tx);
691789Sahrens 	} else {
692789Sahrens 		zap_put_leaf(l);
693789Sahrens 	}
694789Sahrens }
695789Sahrens 
696789Sahrens 
697789Sahrens static int
698789Sahrens fzap_checksize(uint64_t integer_size, uint64_t num_integers)
699789Sahrens {
700789Sahrens 	/* Only integer sizes supported by C */
701789Sahrens 	switch (integer_size) {
702789Sahrens 	case 1:
703789Sahrens 	case 2:
704789Sahrens 	case 4:
705789Sahrens 	case 8:
706789Sahrens 		break;
707789Sahrens 	default:
708789Sahrens 		return (EINVAL);
709789Sahrens 	}
710789Sahrens 
711789Sahrens 	/* Make sure we won't overflow */
712789Sahrens 	if (integer_size * num_integers < num_integers)
713789Sahrens 		return (EINVAL);
714*1491Sahrens 	if (integer_size * num_integers > (1<<fzap_default_block_shift))
715789Sahrens 		return (EINVAL);
716789Sahrens 
717789Sahrens 	return (0);
718789Sahrens }
719789Sahrens 
/*
 * Routines for manipulating attributes.
 */
723789Sahrens int
724789Sahrens fzap_lookup(zap_t *zap, const char *name,
725789Sahrens     uint64_t integer_size, uint64_t num_integers, void *buf)
726789Sahrens {
727789Sahrens 	zap_leaf_t *l;
728789Sahrens 	int err;
729789Sahrens 	uint64_t hash;
730789Sahrens 	zap_entry_handle_t zeh;
731789Sahrens 
732789Sahrens 	err = fzap_checksize(integer_size, num_integers);
733789Sahrens 	if (err != 0)
734789Sahrens 		return (err);
735789Sahrens 
736789Sahrens 	hash = zap_hash(zap, name);
737789Sahrens 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
738789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
739789Sahrens 	if (err != 0)
740789Sahrens 		goto out;
741789Sahrens 	err = zap_entry_read(&zeh, integer_size, num_integers, buf);
742789Sahrens out:
743789Sahrens 	zap_put_leaf(l);
744789Sahrens 	return (err);
745789Sahrens }
746789Sahrens 
747789Sahrens int
748789Sahrens fzap_add_cd(zap_t *zap, const char *name,
749789Sahrens     uint64_t integer_size, uint64_t num_integers,
750789Sahrens     const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
751789Sahrens {
752789Sahrens 	zap_leaf_t *l;
753789Sahrens 	uint64_t hash;
754789Sahrens 	int err;
755789Sahrens 	zap_entry_handle_t zeh;
756789Sahrens 
757789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
758789Sahrens 	ASSERT(!zap->zap_ismicro);
759789Sahrens 	ASSERT(fzap_checksize(integer_size, num_integers) == 0);
760789Sahrens 
761789Sahrens 	hash = zap_hash(zap, name);
762789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
763789Sahrens retry:
764789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
765789Sahrens 	if (err == 0) {
766789Sahrens 		err = EEXIST;
767789Sahrens 		goto out;
768789Sahrens 	}
769789Sahrens 	ASSERT(err == ENOENT);
770789Sahrens 
771789Sahrens 	/* XXX If this leaf is chained, split it if we can. */
772789Sahrens 	err = zap_entry_create(l, name, hash, cd,
773789Sahrens 	    integer_size, num_integers, val, &zeh);
774789Sahrens 
775789Sahrens 	if (err == 0) {
776789Sahrens 		zap_increment_num_entries(zap, 1, tx);
777789Sahrens 	} else if (err == EAGAIN) {
778789Sahrens 		l = zap_expand_leaf(zap, l, hash, tx);
779789Sahrens 		goto retry;
780789Sahrens 	}
781789Sahrens 
782789Sahrens out:
783789Sahrens 	if (lp)
784789Sahrens 		*lp = l;
785789Sahrens 	else
786789Sahrens 		zap_put_leaf(l);
787789Sahrens 	return (err);
788789Sahrens }
789789Sahrens 
790789Sahrens int
791789Sahrens fzap_add(zap_t *zap, const char *name,
792789Sahrens     uint64_t integer_size, uint64_t num_integers,
793789Sahrens     const void *val, dmu_tx_t *tx)
794789Sahrens {
795789Sahrens 	int err;
796789Sahrens 	zap_leaf_t *l;
797789Sahrens 
798789Sahrens 	err = fzap_checksize(integer_size, num_integers);
799789Sahrens 	if (err != 0)
800789Sahrens 		return (err);
801789Sahrens 
802789Sahrens 	err = fzap_add_cd(zap, name, integer_size, num_integers,
803789Sahrens 	    val, ZAP_MAXCD, tx, &l);
804789Sahrens 
805789Sahrens 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
806789Sahrens 	return (err);
807789Sahrens }
808789Sahrens 
809789Sahrens int
810789Sahrens fzap_update(zap_t *zap, const char *name,
811789Sahrens     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
812789Sahrens {
813789Sahrens 	zap_leaf_t *l;
814789Sahrens 	uint64_t hash;
815789Sahrens 	int err, create;
816789Sahrens 	zap_entry_handle_t zeh;
817789Sahrens 
818789Sahrens 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
819789Sahrens 	err = fzap_checksize(integer_size, num_integers);
820789Sahrens 	if (err != 0)
821789Sahrens 		return (err);
822789Sahrens 
823789Sahrens 	hash = zap_hash(zap, name);
824789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
825789Sahrens retry:
826789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
827789Sahrens 	create = (err == ENOENT);
828789Sahrens 	ASSERT(err == 0 || err == ENOENT);
829789Sahrens 
830789Sahrens 	/* XXX If this leaf is chained, split it if we can. */
831789Sahrens 
832789Sahrens 	if (create) {
833789Sahrens 		err = zap_entry_create(l, name, hash, ZAP_MAXCD,
834789Sahrens 		    integer_size, num_integers, val, &zeh);
835789Sahrens 		if (err == 0)
836789Sahrens 			zap_increment_num_entries(zap, 1, tx);
837789Sahrens 	} else {
838789Sahrens 		err = zap_entry_update(&zeh, integer_size, num_integers, val);
839789Sahrens 	}
840789Sahrens 
841789Sahrens 	if (err == EAGAIN) {
842789Sahrens 		l = zap_expand_leaf(zap, l, hash, tx);
843789Sahrens 		goto retry;
844789Sahrens 	}
845789Sahrens 
846789Sahrens 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
847789Sahrens 	return (err);
848789Sahrens }
849789Sahrens 
850789Sahrens int
851789Sahrens fzap_length(zap_t *zap, const char *name,
852789Sahrens     uint64_t *integer_size, uint64_t *num_integers)
853789Sahrens {
854789Sahrens 	zap_leaf_t *l;
855789Sahrens 	int err;
856789Sahrens 	uint64_t hash;
857789Sahrens 	zap_entry_handle_t zeh;
858789Sahrens 
859789Sahrens 	hash = zap_hash(zap, name);
860789Sahrens 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
861789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
862789Sahrens 	if (err != 0)
863789Sahrens 		goto out;
864789Sahrens 
865789Sahrens 	if (integer_size)
866789Sahrens 		*integer_size = zeh.zeh_integer_size;
867789Sahrens 	if (num_integers)
868789Sahrens 		*num_integers = zeh.zeh_num_integers;
869789Sahrens out:
870789Sahrens 	zap_put_leaf(l);
871789Sahrens 	return (err);
872789Sahrens }
873789Sahrens 
874789Sahrens int
875789Sahrens fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
876789Sahrens {
877789Sahrens 	zap_leaf_t *l;
878789Sahrens 	uint64_t hash;
879789Sahrens 	int err;
880789Sahrens 	zap_entry_handle_t zeh;
881789Sahrens 
882789Sahrens 	hash = zap_hash(zap, name);
883789Sahrens 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
884789Sahrens 	err = zap_leaf_lookup(l, name, hash, &zeh);
885789Sahrens 	if (err == 0) {
886789Sahrens 		zap_entry_remove(&zeh);
887789Sahrens 		zap_increment_num_entries(zap, -1, tx);
888789Sahrens 	}
889789Sahrens 	zap_put_leaf(l);
890789Sahrens 	dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
891789Sahrens 	    zap->zap_objset, zap->zap_object, name, err);
892789Sahrens 	return (err);
893789Sahrens }
894789Sahrens 
895789Sahrens int
896789Sahrens zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
897789Sahrens {
898789Sahrens 	zap_cursor_t zc;
899789Sahrens 	zap_attribute_t *za;
900789Sahrens 	int err;
901789Sahrens 
902789Sahrens 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
903789Sahrens 	for (zap_cursor_init(&zc, os, zapobj);
904789Sahrens 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
905789Sahrens 	    zap_cursor_advance(&zc)) {
906789Sahrens 		if (za->za_first_integer == value) {
907789Sahrens 			(void) strcpy(name, za->za_name);
908789Sahrens 			break;
909789Sahrens 		}
910789Sahrens 	}
911885Sahrens 	zap_cursor_fini(&zc);
912789Sahrens 	kmem_free(za, sizeof (zap_attribute_t));
913789Sahrens 	return (err);
914789Sahrens }
915789Sahrens 
916789Sahrens 
917789Sahrens /*
918789Sahrens  * Routines for iterating over the attributes.
919789Sahrens  */
920789Sahrens 
921789Sahrens int
922789Sahrens fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
923789Sahrens {
924789Sahrens 	int err = ENOENT;
925789Sahrens 	zap_entry_handle_t zeh;
926789Sahrens 	zap_leaf_t *l;
927789Sahrens 
928789Sahrens 	/* retrieve the next entry at or after zc_hash/zc_cd */
929789Sahrens 	/* if no entry, return ENOENT */
930789Sahrens 
931885Sahrens 	if (zc->zc_leaf &&
932885Sahrens 	    (ZAP_HASH_IDX(zc->zc_hash, zc->zc_leaf->lh_prefix_len) !=
933885Sahrens 	    zc->zc_leaf->lh_prefix)) {
934885Sahrens 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
935885Sahrens 		zap_put_leaf(zc->zc_leaf);
936885Sahrens 		zc->zc_leaf = NULL;
937885Sahrens 	}
938885Sahrens 
939789Sahrens again:
940885Sahrens 	if (zc->zc_leaf == NULL) {
941885Sahrens 		zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
942885Sahrens 	} else {
943885Sahrens 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
944885Sahrens 	}
945885Sahrens 	l = zc->zc_leaf;
946885Sahrens 
947789Sahrens 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
948789Sahrens 
949789Sahrens 	if (err == ENOENT) {
950789Sahrens 		uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
951789Sahrens 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
952789Sahrens 		zc->zc_cd = 0;
953789Sahrens 		if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
954789Sahrens 			zc->zc_hash = -1ULL;
955789Sahrens 		} else {
956885Sahrens 			zap_put_leaf(zc->zc_leaf);
957885Sahrens 			zc->zc_leaf = NULL;
958789Sahrens 			goto again;
959789Sahrens 		}
960789Sahrens 	}
961789Sahrens 
962789Sahrens 	if (err == 0) {
963789Sahrens 		zc->zc_hash = zeh.zeh_hash;
964789Sahrens 		zc->zc_cd = zeh.zeh_cd;
965789Sahrens 		za->za_integer_length = zeh.zeh_integer_size;
966789Sahrens 		za->za_num_integers = zeh.zeh_num_integers;
967789Sahrens 		if (zeh.zeh_num_integers == 0) {
968789Sahrens 			za->za_first_integer = 0;
969789Sahrens 		} else {
970789Sahrens 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
971789Sahrens 			ASSERT(err == 0 || err == EOVERFLOW);
972789Sahrens 		}
973789Sahrens 		err = zap_entry_read_name(&zeh,
974789Sahrens 		    sizeof (za->za_name), za->za_name);
975789Sahrens 		ASSERT(err == 0);
976789Sahrens 	}
977885Sahrens 	rw_exit(&zc->zc_leaf->l_rwlock);
978789Sahrens 	return (err);
979789Sahrens }
980789Sahrens 
981789Sahrens 
982789Sahrens static void
983789Sahrens zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
984789Sahrens {
985789Sahrens 	int i;
986789Sahrens 	uint64_t lastblk = 0;
987789Sahrens 
988789Sahrens 	/*
989789Sahrens 	 * NB: if a leaf has more pointers than an entire ptrtbl block
990789Sahrens 	 * can hold, then it'll be accounted for more than once, since
991789Sahrens 	 * we won't have lastblk.
992789Sahrens 	 */
993789Sahrens 	for (i = 0; i < len; i++) {
994789Sahrens 		zap_leaf_t *l;
995789Sahrens 
996789Sahrens 		if (tbl[i] == lastblk)
997789Sahrens 			continue;
998789Sahrens 		lastblk = tbl[i];
999789Sahrens 
1000789Sahrens 		l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
1001789Sahrens 
1002789Sahrens 		zap_stats_leaf(zap, l, zs);
1003789Sahrens 		zap_put_leaf(l);
1004789Sahrens 	}
1005789Sahrens }
1006789Sahrens 
1007789Sahrens void
1008789Sahrens fzap_get_stats(zap_t *zap, zap_stats_t *zs)
1009789Sahrens {
1010*1491Sahrens 	int bs = FZAP_BLOCK_SHIFT(zap);
1011789Sahrens 	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
1012*1491Sahrens 	zs->zs_blocksize = 1ULL << bs;
1013789Sahrens 	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
1014789Sahrens 	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
1015789Sahrens 	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
1016789Sahrens 
1017789Sahrens 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
1018789Sahrens 		/* the ptrtbl is entirely in the header block. */
1019*1491Sahrens 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1020*1491Sahrens 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
1021789Sahrens 	} else {
1022789Sahrens 		int b;
1023789Sahrens 
1024789Sahrens 		dmu_prefetch(zap->zap_objset, zap->zap_object,
1025*1491Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
1026*1491Sahrens 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
1027789Sahrens 
1028789Sahrens 		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
1029789Sahrens 		    b++) {
1030789Sahrens 			dmu_buf_t *db;
1031789Sahrens 
1032789Sahrens 			db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1033*1491Sahrens 			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs);
1034789Sahrens 			dmu_buf_read(db);
1035*1491Sahrens 			zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs);
1036789Sahrens 			dmu_buf_rele(db);
1037789Sahrens 		}
1038789Sahrens 	}
1039789Sahrens }
1040