xref: /onnv-gate/usr/src/uts/common/fs/zfs/dmu_objset.c (revision 2082:76b439ec3ac1)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21789Sahrens /*
221544Seschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23789Sahrens  * Use is subject to license terms.
24789Sahrens  */
25789Sahrens 
26789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27789Sahrens 
28789Sahrens #include <sys/zfs_context.h>
29789Sahrens #include <sys/dmu_objset.h>
30789Sahrens #include <sys/dsl_dir.h>
31789Sahrens #include <sys/dsl_dataset.h>
32789Sahrens #include <sys/dsl_prop.h>
33789Sahrens #include <sys/dsl_pool.h>
34789Sahrens #include <sys/dnode.h>
35789Sahrens #include <sys/dbuf.h>
36789Sahrens #include <sys/dmu_tx.h>
37789Sahrens #include <sys/zio_checksum.h>
38789Sahrens #include <sys/zap.h>
39789Sahrens #include <sys/zil.h>
40789Sahrens #include <sys/dmu_impl.h>
41789Sahrens 
42789Sahrens 
43789Sahrens spa_t *
44789Sahrens dmu_objset_spa(objset_t *os)
45789Sahrens {
46789Sahrens 	return (os->os->os_spa);
47789Sahrens }
48789Sahrens 
49789Sahrens zilog_t *
50789Sahrens dmu_objset_zil(objset_t *os)
51789Sahrens {
52789Sahrens 	return (os->os->os_zil);
53789Sahrens }
54789Sahrens 
55789Sahrens dsl_pool_t *
56789Sahrens dmu_objset_pool(objset_t *os)
57789Sahrens {
58789Sahrens 	dsl_dataset_t *ds;
59789Sahrens 
60789Sahrens 	if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
61789Sahrens 		return (ds->ds_dir->dd_pool);
62789Sahrens 	else
63789Sahrens 		return (spa_get_dsl(os->os->os_spa));
64789Sahrens }
65789Sahrens 
66789Sahrens dsl_dataset_t *
67789Sahrens dmu_objset_ds(objset_t *os)
68789Sahrens {
69789Sahrens 	return (os->os->os_dsl_dataset);
70789Sahrens }
71789Sahrens 
72789Sahrens dmu_objset_type_t
73789Sahrens dmu_objset_type(objset_t *os)
74789Sahrens {
75789Sahrens 	return (os->os->os_phys->os_type);
76789Sahrens }
77789Sahrens 
78789Sahrens void
79789Sahrens dmu_objset_name(objset_t *os, char *buf)
80789Sahrens {
81789Sahrens 	dsl_dataset_name(os->os->os_dsl_dataset, buf);
82789Sahrens }
83789Sahrens 
84789Sahrens uint64_t
85789Sahrens dmu_objset_id(objset_t *os)
86789Sahrens {
87789Sahrens 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
88789Sahrens 
89789Sahrens 	return (ds ? ds->ds_object : 0);
90789Sahrens }
91789Sahrens 
92789Sahrens static void
93789Sahrens checksum_changed_cb(void *arg, uint64_t newval)
94789Sahrens {
95789Sahrens 	objset_impl_t *osi = arg;
96789Sahrens 
97789Sahrens 	/*
98789Sahrens 	 * Inheritance should have been done by now.
99789Sahrens 	 */
100789Sahrens 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
101789Sahrens 
102789Sahrens 	osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
103789Sahrens }
104789Sahrens 
105789Sahrens static void
106789Sahrens compression_changed_cb(void *arg, uint64_t newval)
107789Sahrens {
108789Sahrens 	objset_impl_t *osi = arg;
109789Sahrens 
110789Sahrens 	/*
111789Sahrens 	 * Inheritance and range checking should have been done by now.
112789Sahrens 	 */
113789Sahrens 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
114789Sahrens 
115789Sahrens 	osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
116789Sahrens }
117789Sahrens 
118789Sahrens void
119789Sahrens dmu_objset_byteswap(void *buf, size_t size)
120789Sahrens {
121789Sahrens 	objset_phys_t *osp = buf;
122789Sahrens 
123789Sahrens 	ASSERT(size == sizeof (objset_phys_t));
124789Sahrens 	dnode_byteswap(&osp->os_meta_dnode);
125789Sahrens 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
126789Sahrens 	osp->os_type = BSWAP_64(osp->os_type);
127789Sahrens }
128789Sahrens 
1291544Seschrock int
1301544Seschrock dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
1311544Seschrock     objset_impl_t **osip)
132789Sahrens {
133789Sahrens 	objset_impl_t *winner, *osi;
134789Sahrens 	int i, err, checksum;
135789Sahrens 
136789Sahrens 	osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
137789Sahrens 	osi->os.os = osi;
138789Sahrens 	osi->os_dsl_dataset = ds;
139789Sahrens 	osi->os_spa = spa;
140789Sahrens 	if (bp)
141789Sahrens 		osi->os_rootbp = *bp;
142789Sahrens 	osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
143789Sahrens 	if (!BP_IS_HOLE(&osi->os_rootbp)) {
1441544Seschrock 		zbookmark_t zb;
1451544Seschrock 		zb.zb_objset = ds ? ds->ds_object : 0;
1461544Seschrock 		zb.zb_object = 0;
1471544Seschrock 		zb.zb_level = -1;
1481544Seschrock 		zb.zb_blkid = 0;
1491544Seschrock 
150789Sahrens 		dprintf_bp(&osi->os_rootbp, "reading %s", "");
1511544Seschrock 		err = arc_read(NULL, spa, &osi->os_rootbp,
152789Sahrens 		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
153789Sahrens 		    arc_bcopy_func, osi->os_phys,
1541544Seschrock 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb);
1551544Seschrock 		if (err) {
1561544Seschrock 			zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
1571544Seschrock 			kmem_free(osi, sizeof (objset_impl_t));
1581544Seschrock 			return (err);
1591544Seschrock 		}
160789Sahrens 	} else {
161789Sahrens 		bzero(osi->os_phys, sizeof (objset_phys_t));
162789Sahrens 	}
163789Sahrens 
164789Sahrens 	/*
165789Sahrens 	 * Note: the changed_cb will be called once before the register
166789Sahrens 	 * func returns, thus changing the checksum/compression from the
167*2082Seschrock 	 * default (fletcher2/off).  Snapshots don't need to know, and
168*2082Seschrock 	 * registering would complicate clone promotion.
169789Sahrens 	 */
170*2082Seschrock 	if (ds && ds->ds_phys->ds_num_children == 0) {
171789Sahrens 		err = dsl_prop_register(ds, "checksum",
172789Sahrens 		    checksum_changed_cb, osi);
1731544Seschrock 		if (err == 0)
1741544Seschrock 			err = dsl_prop_register(ds, "compression",
1751544Seschrock 			    compression_changed_cb, osi);
1761544Seschrock 		if (err) {
1771544Seschrock 			zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
1781544Seschrock 			kmem_free(osi, sizeof (objset_impl_t));
1791544Seschrock 			return (err);
1801544Seschrock 		}
181*2082Seschrock 	} else if (ds == NULL) {
182789Sahrens 		/* It's the meta-objset. */
183789Sahrens 		osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
1841544Seschrock 		osi->os_compress = ZIO_COMPRESS_LZJB;
185789Sahrens 	}
186789Sahrens 
1871544Seschrock 	osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
1881544Seschrock 
189789Sahrens 	/*
190789Sahrens 	 * Metadata always gets compressed and checksummed.
191789Sahrens 	 * If the data checksum is multi-bit correctable, and it's not
192789Sahrens 	 * a ZBT-style checksum, then it's suitable for metadata as well.
193789Sahrens 	 * Otherwise, the metadata checksum defaults to fletcher4.
194789Sahrens 	 */
195789Sahrens 	checksum = osi->os_checksum;
196789Sahrens 
197789Sahrens 	if (zio_checksum_table[checksum].ci_correctable &&
198789Sahrens 	    !zio_checksum_table[checksum].ci_zbt)
199789Sahrens 		osi->os_md_checksum = checksum;
200789Sahrens 	else
201789Sahrens 		osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
2021544Seschrock 	osi->os_md_compress = ZIO_COMPRESS_LZJB;
203789Sahrens 
204789Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
205789Sahrens 		list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
206789Sahrens 		    offsetof(dnode_t, dn_dirty_link[i]));
207789Sahrens 		list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
208789Sahrens 		    offsetof(dnode_t, dn_dirty_link[i]));
209789Sahrens 	}
210789Sahrens 	list_create(&osi->os_dnodes, sizeof (dnode_t),
211789Sahrens 	    offsetof(dnode_t, dn_link));
212789Sahrens 	list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
213789Sahrens 	    offsetof(dmu_buf_impl_t, db_link));
214789Sahrens 
215789Sahrens 	osi->os_meta_dnode = dnode_special_open(osi,
216789Sahrens 	    &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
217789Sahrens 
218789Sahrens 	if (ds != NULL) {
219789Sahrens 		winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
220789Sahrens 		if (winner) {
221789Sahrens 			dmu_objset_evict(ds, osi);
222789Sahrens 			osi = winner;
223789Sahrens 		}
224789Sahrens 	}
225789Sahrens 
2261544Seschrock 	*osip = osi;
2271544Seschrock 	return (0);
228789Sahrens }
229789Sahrens 
230789Sahrens /* called from zpl */
231789Sahrens int
232789Sahrens dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
233789Sahrens     objset_t **osp)
234789Sahrens {
235789Sahrens 	dsl_dataset_t *ds;
236789Sahrens 	int err;
237789Sahrens 	objset_t *os;
238789Sahrens 	objset_impl_t *osi;
239789Sahrens 
240789Sahrens 	os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
241789Sahrens 	err = dsl_dataset_open(name, mode, os, &ds);
242789Sahrens 	if (err) {
243789Sahrens 		kmem_free(os, sizeof (objset_t));
244789Sahrens 		return (err);
245789Sahrens 	}
246789Sahrens 
247789Sahrens 	osi = dsl_dataset_get_user_ptr(ds);
248789Sahrens 	if (osi == NULL) {
249789Sahrens 		blkptr_t bp;
250789Sahrens 
251789Sahrens 		dsl_dataset_get_blkptr(ds, &bp);
2521544Seschrock 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
2531544Seschrock 		    ds, &bp, &osi);
2541544Seschrock 		if (err) {
2551544Seschrock 			dsl_dataset_close(ds, mode, os);
2561544Seschrock 			kmem_free(os, sizeof (objset_t));
2571544Seschrock 			return (err);
2581544Seschrock 		}
259789Sahrens 	}
260789Sahrens 
261789Sahrens 	os->os = osi;
262789Sahrens 	os->os_mode = mode;
263789Sahrens 
264789Sahrens 	if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
265789Sahrens 		dmu_objset_close(os);
266789Sahrens 		return (EINVAL);
267789Sahrens 	}
268789Sahrens 	*osp = os;
269789Sahrens 	return (0);
270789Sahrens }
271789Sahrens 
272789Sahrens void
273789Sahrens dmu_objset_close(objset_t *os)
274789Sahrens {
275789Sahrens 	dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
276789Sahrens 	kmem_free(os, sizeof (objset_t));
277789Sahrens }
278789Sahrens 
2791646Sperrin int
2801646Sperrin dmu_objset_evict_dbufs(objset_t *os, int try)
2811544Seschrock {
2821544Seschrock 	objset_impl_t *osi = os->os;
2831544Seschrock 	dnode_t *dn;
2841596Sahrens 
2851596Sahrens 	mutex_enter(&osi->os_lock);
2861596Sahrens 
2871596Sahrens 	/* process the mdn last, since the other dnodes have holds on it */
2881596Sahrens 	list_remove(&osi->os_dnodes, osi->os_meta_dnode);
2891596Sahrens 	list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
2901544Seschrock 
2911544Seschrock 	/*
2921596Sahrens 	 * Find the first dnode with holds.  We have to do this dance
2931596Sahrens 	 * because dnode_add_ref() only works if you already have a
2941596Sahrens 	 * hold.  If there are no holds then it has no dbufs so OK to
2951596Sahrens 	 * skip.
2961544Seschrock 	 */
2971596Sahrens 	for (dn = list_head(&osi->os_dnodes);
2981596Sahrens 	    dn && refcount_is_zero(&dn->dn_holds);
2991596Sahrens 	    dn = list_next(&osi->os_dnodes, dn))
3001596Sahrens 		continue;
3011596Sahrens 	if (dn)
3021596Sahrens 		dnode_add_ref(dn, FTAG);
3031596Sahrens 
3041596Sahrens 	while (dn) {
3051596Sahrens 		dnode_t *next_dn = dn;
3061596Sahrens 
3071596Sahrens 		do {
3081596Sahrens 			next_dn = list_next(&osi->os_dnodes, next_dn);
3091596Sahrens 		} while (next_dn && refcount_is_zero(&next_dn->dn_holds));
3101596Sahrens 		if (next_dn)
3111596Sahrens 			dnode_add_ref(next_dn, FTAG);
3121596Sahrens 
3131596Sahrens 		mutex_exit(&osi->os_lock);
3141646Sperrin 		if (dnode_evict_dbufs(dn, try)) {
3151646Sperrin 			dnode_rele(dn, FTAG);
3161646Sperrin 			if (next_dn)
3171646Sperrin 				dnode_rele(next_dn, FTAG);
3181646Sperrin 			return (1);
3191646Sperrin 		}
3201596Sahrens 		dnode_rele(dn, FTAG);
3211596Sahrens 		mutex_enter(&osi->os_lock);
3221596Sahrens 		dn = next_dn;
3231544Seschrock 	}
3241544Seschrock 	mutex_exit(&osi->os_lock);
3251646Sperrin 	return (0);
3261544Seschrock }
3271544Seschrock 
3281544Seschrock void
329789Sahrens dmu_objset_evict(dsl_dataset_t *ds, void *arg)
330789Sahrens {
331789Sahrens 	objset_impl_t *osi = arg;
3321544Seschrock 	objset_t os;
333*2082Seschrock 	int i;
334789Sahrens 
335789Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
336789Sahrens 		ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
337789Sahrens 		ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
338789Sahrens 	}
339789Sahrens 
340*2082Seschrock 	if (ds && ds->ds_phys->ds_num_children == 0) {
341*2082Seschrock 		VERIFY(0 == dsl_prop_unregister(ds, "checksum",
342*2082Seschrock 		    checksum_changed_cb, osi));
343*2082Seschrock 		VERIFY(0 == dsl_prop_unregister(ds, "compression",
344*2082Seschrock 		    compression_changed_cb, osi));
345789Sahrens 	}
346789Sahrens 
3471544Seschrock 	/*
3481544Seschrock 	 * We should need only a single pass over the dnode list, since
3491544Seschrock 	 * nothing can be added to the list at this point.
3501544Seschrock 	 */
3511544Seschrock 	os.os = osi;
3521646Sperrin 	(void) dmu_objset_evict_dbufs(&os, 0);
3531544Seschrock 
354789Sahrens 	ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
355789Sahrens 	ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
356789Sahrens 	ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
357789Sahrens 
358789Sahrens 	dnode_special_close(osi->os_meta_dnode);
359789Sahrens 	zil_free(osi->os_zil);
360789Sahrens 
361789Sahrens 	zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
362789Sahrens 	kmem_free(osi, sizeof (objset_impl_t));
363789Sahrens }
364789Sahrens 
365789Sahrens /* called from dsl for meta-objset */
366789Sahrens objset_impl_t *
367789Sahrens dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
368789Sahrens     dmu_tx_t *tx)
369789Sahrens {
370789Sahrens 	objset_impl_t *osi;
371789Sahrens 	dnode_t *mdn;
372789Sahrens 
373789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
3741544Seschrock 	VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi));
375789Sahrens 	mdn = osi->os_meta_dnode;
376789Sahrens 
377789Sahrens 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
378789Sahrens 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
379789Sahrens 
380789Sahrens 	/*
381789Sahrens 	 * We don't want to have to increase the meta-dnode's nlevels
382789Sahrens 	 * later, because then we could do it in quescing context while
383789Sahrens 	 * we are also accessing it in open context.
384789Sahrens 	 *
385789Sahrens 	 * This precaution is not necessary for the MOS (ds == NULL),
386789Sahrens 	 * because the MOS is only updated in syncing context.
387789Sahrens 	 * This is most fortunate: the MOS is the only objset that
388789Sahrens 	 * needs to be synced multiple times as spa_sync() iterates
389789Sahrens 	 * to convergence, so minimizing its dn_nlevels matters.
390789Sahrens 	 */
3911544Seschrock 	if (ds != NULL) {
3921544Seschrock 		int levels = 1;
3931544Seschrock 
3941544Seschrock 		/*
3951544Seschrock 		 * Determine the number of levels necessary for the meta-dnode
3961544Seschrock 		 * to contain DN_MAX_OBJECT dnodes.
3971544Seschrock 		 */
3981544Seschrock 		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
3991544Seschrock 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
4001544Seschrock 		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
4011544Seschrock 			levels++;
4021544Seschrock 
403789Sahrens 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
4041544Seschrock 		    mdn->dn_nlevels = levels;
4051544Seschrock 	}
406789Sahrens 
407789Sahrens 	ASSERT(type != DMU_OST_NONE);
408789Sahrens 	ASSERT(type != DMU_OST_ANY);
409789Sahrens 	ASSERT(type < DMU_OST_NUMTYPES);
410789Sahrens 	osi->os_phys->os_type = type;
411789Sahrens 
412789Sahrens 	dsl_dataset_dirty(ds, tx);
413789Sahrens 
414789Sahrens 	return (osi);
415789Sahrens }
416789Sahrens 
417789Sahrens struct oscarg {
418789Sahrens 	void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
419789Sahrens 	void *userarg;
420789Sahrens 	dsl_dataset_t *clone_parent;
421789Sahrens 	const char *fullname;
422789Sahrens 	const char *lastname;
423789Sahrens 	dmu_objset_type_t type;
424789Sahrens };
425789Sahrens 
426789Sahrens static int
427789Sahrens dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
428789Sahrens {
429789Sahrens 	struct oscarg *oa = arg;
430789Sahrens 	dsl_dataset_t *ds;
431789Sahrens 	int err;
432789Sahrens 	blkptr_t bp;
433789Sahrens 
434789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
435789Sahrens 
436789Sahrens 	err = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname,
437789Sahrens 	    oa->clone_parent, tx);
438789Sahrens 	dprintf_dd(dd, "fn=%s ln=%s err=%d\n",
439789Sahrens 	    oa->fullname, oa->lastname, err);
440789Sahrens 	if (err)
441789Sahrens 		return (err);
442789Sahrens 
4431544Seschrock 	VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
4441544Seschrock 	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
445789Sahrens 	dsl_dataset_get_blkptr(ds, &bp);
446789Sahrens 	if (BP_IS_HOLE(&bp)) {
447789Sahrens 		objset_impl_t *osi;
448789Sahrens 
449789Sahrens 		/* This is an empty dmu_objset; not a clone. */
450789Sahrens 		osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
451789Sahrens 		    ds, oa->type, tx);
452789Sahrens 
453789Sahrens 		if (oa->userfunc)
454789Sahrens 			oa->userfunc(&osi->os, oa->userarg, tx);
455789Sahrens 	}
456789Sahrens 	dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
457789Sahrens 
458789Sahrens 	return (0);
459789Sahrens }
460789Sahrens 
461789Sahrens int
462789Sahrens dmu_objset_create(const char *name, dmu_objset_type_t type,
463789Sahrens     objset_t *clone_parent,
464789Sahrens     void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
465789Sahrens {
466789Sahrens 	dsl_dir_t *pds;
467789Sahrens 	const char *tail;
468789Sahrens 	int err = 0;
469789Sahrens 
4701544Seschrock 	err = dsl_dir_open(name, FTAG, &pds, &tail);
4711544Seschrock 	if (err)
4721544Seschrock 		return (err);
473789Sahrens 	if (tail == NULL) {
474789Sahrens 		dsl_dir_close(pds, FTAG);
475789Sahrens 		return (EEXIST);
476789Sahrens 	}
477789Sahrens 
478789Sahrens 	dprintf("name=%s\n", name);
479789Sahrens 
480789Sahrens 	if (tail[0] == '@') {
481789Sahrens 		/*
482789Sahrens 		 * If we're creating a snapshot, make sure everything
483789Sahrens 		 * they might want is on disk.  XXX Sketchy to know
484789Sahrens 		 * about snapshots here, better to put in DSL.
485789Sahrens 		 */
486789Sahrens 		objset_t *os;
487789Sahrens 		size_t plen = strchr(name, '@') - name + 1;
488789Sahrens 		char *pbuf = kmem_alloc(plen, KM_SLEEP);
489789Sahrens 		bcopy(name, pbuf, plen - 1);
490789Sahrens 		pbuf[plen - 1] = '\0';
491789Sahrens 
492789Sahrens 		err = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD, &os);
493789Sahrens 		if (err == 0) {
494789Sahrens 			err = zil_suspend(dmu_objset_zil(os));
495789Sahrens 			if (err == 0) {
496789Sahrens 				err = dsl_dir_sync_task(pds,
497789Sahrens 				    dsl_dataset_snapshot_sync,
498789Sahrens 				    (void*)(tail+1), 16*1024);
499789Sahrens 				zil_resume(dmu_objset_zil(os));
500789Sahrens 			}
501789Sahrens 			dmu_objset_close(os);
502789Sahrens 		}
503789Sahrens 		kmem_free(pbuf, plen);
504789Sahrens 	} else {
505789Sahrens 		struct oscarg oa = { 0 };
506789Sahrens 		oa.userfunc = func;
507789Sahrens 		oa.userarg = arg;
508789Sahrens 		oa.fullname = name;
509789Sahrens 		oa.lastname = tail;
510789Sahrens 		oa.type = type;
511789Sahrens 		if (clone_parent != NULL) {
512789Sahrens 			/*
513789Sahrens 			 * You can't clone to a different type.
514789Sahrens 			 */
515789Sahrens 			if (clone_parent->os->os_phys->os_type != type) {
516789Sahrens 				dsl_dir_close(pds, FTAG);
517789Sahrens 				return (EINVAL);
518789Sahrens 			}
519789Sahrens 			oa.clone_parent = clone_parent->os->os_dsl_dataset;
520789Sahrens 		}
521789Sahrens 		err = dsl_dir_sync_task(pds, dmu_objset_create_sync, &oa,
522789Sahrens 		    256*1024);
523789Sahrens 	}
524789Sahrens 	dsl_dir_close(pds, FTAG);
525789Sahrens 	return (err);
526789Sahrens }
527789Sahrens 
528789Sahrens int
529789Sahrens dmu_objset_destroy(const char *name)
530789Sahrens {
531789Sahrens 	objset_t *os;
532789Sahrens 	int error;
533789Sahrens 
534789Sahrens 	/*
535789Sahrens 	 * If it looks like we'll be able to destroy it, and there's
536789Sahrens 	 * an unplayed replay log sitting around, destroy the log.
537789Sahrens 	 * It would be nicer to do this in dsl_dataset_destroy_sync(),
538789Sahrens 	 * but the replay log objset is modified in open context.
539789Sahrens 	 */
540789Sahrens 	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
541789Sahrens 	if (error == 0) {
5421807Sbonwick 		zil_destroy(dmu_objset_zil(os), B_FALSE);
543789Sahrens 		dmu_objset_close(os);
544789Sahrens 	}
545789Sahrens 
546789Sahrens 	/* XXX uncache everything? */
547789Sahrens 	return (dsl_dataset_destroy(name));
548789Sahrens }
549789Sahrens 
550789Sahrens int
551789Sahrens dmu_objset_rollback(const char *name)
552789Sahrens {
553789Sahrens 	int err;
554789Sahrens 	objset_t *os;
555789Sahrens 
556789Sahrens 	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
557789Sahrens 	if (err == 0) {
558789Sahrens 		err = zil_suspend(dmu_objset_zil(os));
559789Sahrens 		if (err == 0)
560789Sahrens 			zil_resume(dmu_objset_zil(os));
561789Sahrens 		dmu_objset_close(os);
562789Sahrens 		if (err == 0) {
563789Sahrens 			/* XXX uncache everything? */
564789Sahrens 			err = dsl_dataset_rollback(name);
565789Sahrens 		}
566789Sahrens 	}
567789Sahrens 	return (err);
568789Sahrens }
569789Sahrens 
570789Sahrens static void
571789Sahrens dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
572789Sahrens {
573789Sahrens 	dnode_t *dn = list_head(list);
574789Sahrens 	int level, err;
575789Sahrens 
576789Sahrens 	for (level = 0; dn = list_head(list); level++) {
577789Sahrens 		zio_t *zio;
578789Sahrens 		zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
579789Sahrens 
580789Sahrens 		ASSERT3U(level, <=, DN_MAX_LEVELS);
581789Sahrens 
582789Sahrens 		while (dn) {
583789Sahrens 			dnode_t *next = list_next(list, dn);
584789Sahrens 
585789Sahrens 			list_remove(list, dn);
586789Sahrens 			if (dnode_sync(dn, level, zio, tx) == 0) {
587789Sahrens 				/*
588789Sahrens 				 * This dnode requires syncing at higher
589789Sahrens 				 * levels; put it back onto the list.
590789Sahrens 				 */
591789Sahrens 				if (next)
592789Sahrens 					list_insert_before(list, next, dn);
593789Sahrens 				else
594789Sahrens 					list_insert_tail(list, dn);
595789Sahrens 			}
596789Sahrens 			dn = next;
597789Sahrens 		}
598789Sahrens 		err = zio_wait(zio);
599789Sahrens 		ASSERT(err == 0);
600789Sahrens 	}
601789Sahrens }
602789Sahrens 
603789Sahrens /* ARGSUSED */
604789Sahrens static void
605789Sahrens killer(zio_t *zio, arc_buf_t *abuf, void *arg)
606789Sahrens {
607789Sahrens 	objset_impl_t *os = arg;
608789Sahrens 	objset_phys_t *osphys = zio->io_data;
609789Sahrens 	dnode_phys_t *dnp = &osphys->os_meta_dnode;
610789Sahrens 	int i;
611789Sahrens 
612789Sahrens 	ASSERT3U(zio->io_error, ==, 0);
613789Sahrens 
614789Sahrens 	/*
615789Sahrens 	 * Update rootbp fill count.
616789Sahrens 	 */
617789Sahrens 	os->os_rootbp.blk_fill = 1;	/* count the meta-dnode */
618789Sahrens 	for (i = 0; i < dnp->dn_nblkptr; i++)
619789Sahrens 		os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
620789Sahrens 
621789Sahrens 	BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
622789Sahrens 	BP_SET_LEVEL(zio->io_bp, 0);
623789Sahrens 
624789Sahrens 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
625789Sahrens 	    BP_IDENTITY(&zio->io_bp_orig))) {
626789Sahrens 		dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
627789Sahrens 		    os->os_synctx);
628789Sahrens 		dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
629789Sahrens 		    os->os_synctx);
630789Sahrens 	}
631789Sahrens }
632789Sahrens 
633789Sahrens 
634789Sahrens /* called from dsl */
635789Sahrens void
636789Sahrens dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
637789Sahrens {
638789Sahrens 	extern taskq_t *dbuf_tq;
639789Sahrens 	int txgoff;
640789Sahrens 	list_t *dirty_list;
641789Sahrens 	int err;
6421544Seschrock 	zbookmark_t zb;
643789Sahrens 	arc_buf_t *abuf =
644789Sahrens 	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
645789Sahrens 
646789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
647789Sahrens 	ASSERT(os->os_synctx == NULL);
648789Sahrens 	/* XXX the write_done callback should really give us the tx... */
649789Sahrens 	os->os_synctx = tx;
650789Sahrens 
651789Sahrens 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
652789Sahrens 
653789Sahrens 	txgoff = tx->tx_txg & TXG_MASK;
654789Sahrens 
655789Sahrens 	dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
656789Sahrens 	dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
657789Sahrens 
658789Sahrens 	/*
659789Sahrens 	 * Free intent log blocks up to this tx.
660789Sahrens 	 */
661789Sahrens 	zil_sync(os->os_zil, tx);
662789Sahrens 
663789Sahrens 	/*
664789Sahrens 	 * Sync meta-dnode
665789Sahrens 	 */
666789Sahrens 	dirty_list = &os->os_dirty_dnodes[txgoff];
667789Sahrens 	ASSERT(list_head(dirty_list) == NULL);
668789Sahrens 	list_insert_tail(dirty_list, os->os_meta_dnode);
669789Sahrens 	dmu_objset_sync_dnodes(os, dirty_list, tx);
670789Sahrens 
671789Sahrens 	/*
672789Sahrens 	 * Sync the root block.
673789Sahrens 	 */
674789Sahrens 	bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
6751544Seschrock 	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
6761544Seschrock 	zb.zb_object = 0;
6771544Seschrock 	zb.zb_level = -1;
6781544Seschrock 	zb.zb_blkid = 0;
679789Sahrens 	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
6801775Sbillm 	    os->os_md_compress,
6811775Sbillm 	    dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
6821775Sbillm 	    tx->tx_txg, &os->os_rootbp, abuf, killer, os,
6831544Seschrock 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
684789Sahrens 	ASSERT(err == 0);
6851544Seschrock 	VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
686789Sahrens 
687789Sahrens 	dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
688789Sahrens 
689789Sahrens 	ASSERT3P(os->os_synctx, ==, tx);
690789Sahrens 	taskq_wait(dbuf_tq);
691789Sahrens 	os->os_synctx = NULL;
692789Sahrens }
693789Sahrens 
694789Sahrens void
695789Sahrens dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds)
696789Sahrens {
697789Sahrens 	if (os->os->os_dsl_dataset != NULL) {
698789Sahrens 		dsl_dataset_stats(os->os->os_dsl_dataset, dds);
699789Sahrens 	} else {
700789Sahrens 		ASSERT(os->os->os_phys->os_type == DMU_OST_META);
701789Sahrens 		bzero(dds, sizeof (*dds));
702789Sahrens 	}
703789Sahrens 	dds->dds_type = os->os->os_phys->os_type;
704789Sahrens }
705789Sahrens 
706789Sahrens int
707789Sahrens dmu_objset_is_snapshot(objset_t *os)
708789Sahrens {
709789Sahrens 	if (os->os->os_dsl_dataset != NULL)
710789Sahrens 		return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
711789Sahrens 	else
712789Sahrens 		return (B_FALSE);
713789Sahrens }
714789Sahrens 
715789Sahrens int
716789Sahrens dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
717885Sahrens     uint64_t *idp, uint64_t *offp)
718789Sahrens {
719789Sahrens 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
720789Sahrens 	zap_cursor_t cursor;
721789Sahrens 	zap_attribute_t attr;
722789Sahrens 
723789Sahrens 	if (ds->ds_phys->ds_snapnames_zapobj == 0)
724789Sahrens 		return (ENOENT);
725789Sahrens 
726789Sahrens 	zap_cursor_init_serialized(&cursor,
727789Sahrens 	    ds->ds_dir->dd_pool->dp_meta_objset,
728789Sahrens 	    ds->ds_phys->ds_snapnames_zapobj, *offp);
729789Sahrens 
730885Sahrens 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
731885Sahrens 		zap_cursor_fini(&cursor);
732885Sahrens 		return (ENOENT);
733885Sahrens 	}
734885Sahrens 
735885Sahrens 	if (strlen(attr.za_name) + 1 > namelen) {
736885Sahrens 		zap_cursor_fini(&cursor);
737885Sahrens 		return (ENAMETOOLONG);
738885Sahrens 	}
739885Sahrens 
740885Sahrens 	(void) strcpy(name, attr.za_name);
741885Sahrens 	if (idp)
742885Sahrens 		*idp = attr.za_first_integer;
743885Sahrens 	zap_cursor_advance(&cursor);
744885Sahrens 	*offp = zap_cursor_serialize(&cursor);
745885Sahrens 	zap_cursor_fini(&cursor);
746885Sahrens 
747885Sahrens 	return (0);
748885Sahrens }
749885Sahrens 
750885Sahrens int
751885Sahrens dmu_dir_list_next(objset_t *os, int namelen, char *name,
752885Sahrens     uint64_t *idp, uint64_t *offp)
753885Sahrens {
754885Sahrens 	dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
755885Sahrens 	zap_cursor_t cursor;
756885Sahrens 	zap_attribute_t attr;
757885Sahrens 
758885Sahrens 	if (dd->dd_phys->dd_child_dir_zapobj == 0)
759789Sahrens 		return (ENOENT);
760789Sahrens 
761885Sahrens 	/* there is no next dir on a snapshot! */
762885Sahrens 	if (os->os->os_dsl_dataset->ds_object !=
763885Sahrens 	    dd->dd_phys->dd_head_dataset_obj)
764885Sahrens 		return (ENOENT);
765885Sahrens 
766885Sahrens 	zap_cursor_init_serialized(&cursor,
767885Sahrens 	    dd->dd_pool->dp_meta_objset,
768885Sahrens 	    dd->dd_phys->dd_child_dir_zapobj, *offp);
769885Sahrens 
770885Sahrens 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
771885Sahrens 		zap_cursor_fini(&cursor);
772885Sahrens 		return (ENOENT);
773885Sahrens 	}
774885Sahrens 
775885Sahrens 	if (strlen(attr.za_name) + 1 > namelen) {
776885Sahrens 		zap_cursor_fini(&cursor);
777789Sahrens 		return (ENAMETOOLONG);
778885Sahrens 	}
779789Sahrens 
780789Sahrens 	(void) strcpy(name, attr.za_name);
781885Sahrens 	if (idp)
782885Sahrens 		*idp = attr.za_first_integer;
783789Sahrens 	zap_cursor_advance(&cursor);
784789Sahrens 	*offp = zap_cursor_serialize(&cursor);
785885Sahrens 	zap_cursor_fini(&cursor);
786789Sahrens 
787789Sahrens 	return (0);
788789Sahrens }
789789Sahrens 
790789Sahrens /*
791789Sahrens  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
792789Sahrens  */
793789Sahrens void
794789Sahrens dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
795789Sahrens {
796789Sahrens 	dsl_dir_t *dd;
797789Sahrens 	objset_t *os;
798789Sahrens 	uint64_t snapobj;
799789Sahrens 	zap_cursor_t zc;
800789Sahrens 	zap_attribute_t attr;
801789Sahrens 	char *child;
8021544Seschrock 	int do_self, err;
803789Sahrens 
8041544Seschrock 	err = dsl_dir_open(name, FTAG, &dd, NULL);
8051544Seschrock 	if (err)
806789Sahrens 		return;
807789Sahrens 
808789Sahrens 	do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
809789Sahrens 
810789Sahrens 	/*
811789Sahrens 	 * Iterate over all children.
812789Sahrens 	 */
813789Sahrens 	if (dd->dd_phys->dd_child_dir_zapobj != 0) {
814789Sahrens 		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
815789Sahrens 		    dd->dd_phys->dd_child_dir_zapobj);
816789Sahrens 		    zap_cursor_retrieve(&zc, &attr) == 0;
817789Sahrens 		    (void) zap_cursor_advance(&zc)) {
818789Sahrens 			ASSERT(attr.za_integer_length == sizeof (uint64_t));
819789Sahrens 			ASSERT(attr.za_num_integers == 1);
820789Sahrens 
821789Sahrens 			/*
822789Sahrens 			 * No separating '/' because parent's name ends in /.
823789Sahrens 			 */
824789Sahrens 			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
825789Sahrens 			/* XXX could probably just use name here */
826789Sahrens 			dsl_dir_name(dd, child);
827789Sahrens 			(void) strcat(child, "/");
828789Sahrens 			(void) strcat(child, attr.za_name);
829789Sahrens 			dmu_objset_find(child, func, arg, flags);
830789Sahrens 			kmem_free(child, MAXPATHLEN);
831789Sahrens 		}
832885Sahrens 		zap_cursor_fini(&zc);
833789Sahrens 	}
834789Sahrens 
835789Sahrens 	/*
836789Sahrens 	 * Iterate over all snapshots.
837789Sahrens 	 */
838789Sahrens 	if ((flags & DS_FIND_SNAPSHOTS) &&
839789Sahrens 	    dmu_objset_open(name, DMU_OST_ANY,
840789Sahrens 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
841789Sahrens 
842789Sahrens 		snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
843789Sahrens 		dmu_objset_close(os);
844789Sahrens 
845789Sahrens 		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
846789Sahrens 		    zap_cursor_retrieve(&zc, &attr) == 0;
847789Sahrens 		    (void) zap_cursor_advance(&zc)) {
848789Sahrens 			ASSERT(attr.za_integer_length == sizeof (uint64_t));
849789Sahrens 			ASSERT(attr.za_num_integers == 1);
850789Sahrens 
851789Sahrens 			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
852789Sahrens 			/* XXX could probably just use name here */
853789Sahrens 			dsl_dir_name(dd, child);
854789Sahrens 			(void) strcat(child, "@");
855789Sahrens 			(void) strcat(child, attr.za_name);
856789Sahrens 			func(child, arg);
857789Sahrens 			kmem_free(child, MAXPATHLEN);
858789Sahrens 		}
859885Sahrens 		zap_cursor_fini(&zc);
860789Sahrens 	}
861789Sahrens 
862789Sahrens 	dsl_dir_close(dd, FTAG);
863789Sahrens 
864789Sahrens 	/*
865789Sahrens 	 * Apply to self if appropriate.
866789Sahrens 	 */
867789Sahrens 	if (do_self)
868789Sahrens 		func(name, arg);
869789Sahrens }
870