xref: /onnv-gate/usr/src/uts/common/fs/zfs/dsl_dir.c (revision 789:b348f31ed315)
1*789Sahrens /*
2*789Sahrens  * CDDL HEADER START
3*789Sahrens  *
4*789Sahrens  * The contents of this file are subject to the terms of the
5*789Sahrens  * Common Development and Distribution License, Version 1.0 only
6*789Sahrens  * (the "License").  You may not use this file except in compliance
7*789Sahrens  * with the License.
8*789Sahrens  *
9*789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*789Sahrens  * or http://www.opensolaris.org/os/licensing.
11*789Sahrens  * See the License for the specific language governing permissions
12*789Sahrens  * and limitations under the License.
13*789Sahrens  *
14*789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*789Sahrens  *
20*789Sahrens  * CDDL HEADER END
21*789Sahrens  */
22*789Sahrens /*
23*789Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*789Sahrens  * Use is subject to license terms.
25*789Sahrens  */
26*789Sahrens 
27*789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*789Sahrens 
29*789Sahrens #include <sys/dmu.h>
30*789Sahrens #include <sys/dmu_tx.h>
31*789Sahrens #include <sys/dsl_dataset.h>
32*789Sahrens #include <sys/dsl_dir.h>
33*789Sahrens #include <sys/dsl_prop.h>
34*789Sahrens #include <sys/spa.h>
35*789Sahrens #include <sys/zap.h>
36*789Sahrens #include <sys/zio.h>
37*789Sahrens #include <sys/arc.h>
38*789Sahrens #include "zfs_namecheck.h"
39*789Sahrens 
40*789Sahrens static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd);
41*789Sahrens static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
42*789Sahrens static int dsl_dir_set_reservation_sync(dsl_dir_t *dd,
43*789Sahrens     void *arg, dmu_tx_t *tx);
44*789Sahrens static uint64_t dsl_dir_space_available(dsl_dir_t *dd,
45*789Sahrens     dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
46*789Sahrens 
47*789Sahrens 
48*789Sahrens /* ARGSUSED */
49*789Sahrens static void
50*789Sahrens dsl_dir_evict(dmu_buf_t *db, void *arg)
51*789Sahrens {
52*789Sahrens 	dsl_dir_t *dd = arg;
53*789Sahrens 	dsl_pool_t *dp = dd->dd_pool;
54*789Sahrens 	int t;
55*789Sahrens 
56*789Sahrens 	for (t = 0; t < TXG_SIZE; t++) {
57*789Sahrens 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
58*789Sahrens 		ASSERT(dd->dd_tempreserved[t] == 0);
59*789Sahrens 		ASSERT(dd->dd_space_towrite[t] == 0);
60*789Sahrens 	}
61*789Sahrens 
62*789Sahrens 	ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
63*789Sahrens 
64*789Sahrens 	ASSERT(dd->dd_sync_txg == 0);
65*789Sahrens 
66*789Sahrens 	if (dd->dd_parent)
67*789Sahrens 		dsl_dir_close(dd->dd_parent, dd);
68*789Sahrens 
69*789Sahrens 	spa_close(dd->dd_pool->dp_spa, dd);
70*789Sahrens 
71*789Sahrens 	/*
72*789Sahrens 	 * The props callback list should be empty since they hold the
73*789Sahrens 	 * dir open.
74*789Sahrens 	 */
75*789Sahrens 	list_destroy(&dd->dd_prop_cbs);
76*789Sahrens 	kmem_free(dd, sizeof (dsl_dir_t));
77*789Sahrens }
78*789Sahrens 
79*789Sahrens dsl_dir_t *
80*789Sahrens dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
81*789Sahrens     const char *tail, void *tag)
82*789Sahrens {
83*789Sahrens 	dmu_buf_t *dbuf;
84*789Sahrens 	dsl_dir_t *dd;
85*789Sahrens 
86*789Sahrens 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
87*789Sahrens 	    dsl_pool_sync_context(dp));
88*789Sahrens 
89*789Sahrens 	dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
90*789Sahrens 	dmu_buf_read(dbuf);
91*789Sahrens 	dd = dmu_buf_get_user(dbuf);
92*789Sahrens #ifdef ZFS_DEBUG
93*789Sahrens 	{
94*789Sahrens 		dmu_object_info_t doi;
95*789Sahrens 		dmu_object_info_from_db(dbuf, &doi);
96*789Sahrens 		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET);
97*789Sahrens 	}
98*789Sahrens #endif
99*789Sahrens 	/* XXX assert bonus buffer size is correct */
100*789Sahrens 	if (dd == NULL) {
101*789Sahrens 		dsl_dir_t *winner;
102*789Sahrens 		int err;
103*789Sahrens 
104*789Sahrens 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
105*789Sahrens 		dd->dd_object = ddobj;
106*789Sahrens 		dd->dd_dbuf = dbuf;
107*789Sahrens 		dd->dd_pool = dp;
108*789Sahrens 		dd->dd_phys = dbuf->db_data;
109*789Sahrens 		dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
110*789Sahrens 
111*789Sahrens 		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
112*789Sahrens 		    offsetof(dsl_prop_cb_record_t, cbr_node));
113*789Sahrens 
114*789Sahrens 		if (dd->dd_phys->dd_parent_obj) {
115*789Sahrens 			dd->dd_parent = dsl_dir_open_obj(dp,
116*789Sahrens 			    dd->dd_phys->dd_parent_obj, NULL, dd);
117*789Sahrens 			if (tail) {
118*789Sahrens #ifdef ZFS_DEBUG
119*789Sahrens 				uint64_t foundobj;
120*789Sahrens 
121*789Sahrens 				err = zap_lookup(dp->dp_meta_objset,
122*789Sahrens 				    dd->dd_parent->dd_phys->
123*789Sahrens 				    dd_child_dir_zapobj,
124*789Sahrens 				    tail, sizeof (foundobj), 1, &foundobj);
125*789Sahrens 				ASSERT3U(err, ==, 0);
126*789Sahrens 				ASSERT3U(foundobj, ==, ddobj);
127*789Sahrens #endif
128*789Sahrens 				(void) strcpy(dd->dd_myname, tail);
129*789Sahrens 			} else {
130*789Sahrens 				err = zap_value_search(dp->dp_meta_objset,
131*789Sahrens 				    dd->dd_parent->dd_phys->
132*789Sahrens 				    dd_child_dir_zapobj,
133*789Sahrens 				    ddobj, dd->dd_myname);
134*789Sahrens 				/*
135*789Sahrens 				 * The caller should be protecting this ddobj
136*789Sahrens 				 * from being deleted concurrently
137*789Sahrens 				 */
138*789Sahrens 				ASSERT(err == 0);
139*789Sahrens 			}
140*789Sahrens 		} else {
141*789Sahrens 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
142*789Sahrens 		}
143*789Sahrens 
144*789Sahrens 		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
145*789Sahrens 		    dsl_dir_evict);
146*789Sahrens 		if (winner) {
147*789Sahrens 			if (dd->dd_parent)
148*789Sahrens 				dsl_dir_close(dd->dd_parent, dd);
149*789Sahrens 			kmem_free(dd, sizeof (dsl_dir_t));
150*789Sahrens 			dd = winner;
151*789Sahrens 		} else {
152*789Sahrens 			spa_open_ref(dp->dp_spa, dd);
153*789Sahrens 		}
154*789Sahrens 	}
155*789Sahrens 
156*789Sahrens 	/*
157*789Sahrens 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
158*789Sahrens 	 * holds on the spa.  We need the open-to-close holds because
159*789Sahrens 	 * otherwise the spa_refcnt wouldn't change when we open a
160*789Sahrens 	 * dir which the spa also has open, so we could incorrectly
161*789Sahrens 	 * think it was OK to unload/export/destroy the pool.  We need
162*789Sahrens 	 * the instantiate-to-evict hold because the dsl_dir_t has a
163*789Sahrens 	 * pointer to the dd_pool, which has a pointer to the spa_t.
164*789Sahrens 	 */
165*789Sahrens 	spa_open_ref(dp->dp_spa, tag);
166*789Sahrens 	ASSERT3P(dd->dd_pool, ==, dp);
167*789Sahrens 	ASSERT3U(dd->dd_object, ==, ddobj);
168*789Sahrens 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
169*789Sahrens 	return (dd);
170*789Sahrens }
171*789Sahrens 
172*789Sahrens void
173*789Sahrens dsl_dir_close(dsl_dir_t *dd, void *tag)
174*789Sahrens {
175*789Sahrens 	dprintf_dd(dd, "%s\n", "");
176*789Sahrens 	spa_close(dd->dd_pool->dp_spa, tag);
177*789Sahrens 	dmu_buf_rele_tag(dd->dd_dbuf, tag);
178*789Sahrens }
179*789Sahrens 
180*789Sahrens /* buf must be long enough (MAXNAMELEN should do) */
181*789Sahrens void
182*789Sahrens dsl_dir_name(dsl_dir_t *dd, char *buf)
183*789Sahrens {
184*789Sahrens 	if (dd->dd_parent) {
185*789Sahrens 		dsl_dir_name(dd->dd_parent, buf);
186*789Sahrens 		(void) strcat(buf, "/");
187*789Sahrens 	} else {
188*789Sahrens 		buf[0] = '\0';
189*789Sahrens 	}
190*789Sahrens 	if (!MUTEX_HELD(&dd->dd_lock)) {
191*789Sahrens 		/*
192*789Sahrens 		 * recursive mutex so that we can use
193*789Sahrens 		 * dprintf_dd() with dd_lock held
194*789Sahrens 		 */
195*789Sahrens 		mutex_enter(&dd->dd_lock);
196*789Sahrens 		(void) strcat(buf, dd->dd_myname);
197*789Sahrens 		mutex_exit(&dd->dd_lock);
198*789Sahrens 	} else {
199*789Sahrens 		(void) strcat(buf, dd->dd_myname);
200*789Sahrens 	}
201*789Sahrens }
202*789Sahrens 
203*789Sahrens int
204*789Sahrens dsl_dir_is_private(dsl_dir_t *dd)
205*789Sahrens {
206*789Sahrens 	int rv = FALSE;
207*789Sahrens 
208*789Sahrens 	if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
209*789Sahrens 		rv = TRUE;
210*789Sahrens 	if (dataset_name_hidden(dd->dd_myname))
211*789Sahrens 		rv = TRUE;
212*789Sahrens 	return (rv);
213*789Sahrens }
214*789Sahrens 
215*789Sahrens 
216*789Sahrens static int
217*789Sahrens getcomponent(const char *path, char *component, const char **nextp)
218*789Sahrens {
219*789Sahrens 	char *p;
220*789Sahrens 	if (path == NULL)
221*789Sahrens 		return (NULL);
222*789Sahrens 	/* This would be a good place to reserve some namespace... */
223*789Sahrens 	p = strpbrk(path, "/@");
224*789Sahrens 	if (p && (p[1] == '/' || p[1] == '@')) {
225*789Sahrens 		/* two separators in a row */
226*789Sahrens 		return (EINVAL);
227*789Sahrens 	}
228*789Sahrens 	if (p == NULL || p == path) {
229*789Sahrens 		/*
230*789Sahrens 		 * if the first thing is an @ or /, it had better be an
231*789Sahrens 		 * @ and it had better not have any more ats or slashes,
232*789Sahrens 		 * and it had better have something after the @.
233*789Sahrens 		 */
234*789Sahrens 		if (p != NULL &&
235*789Sahrens 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
236*789Sahrens 			return (EINVAL);
237*789Sahrens 		if (strlen(path) >= MAXNAMELEN)
238*789Sahrens 			return (ENAMETOOLONG);
239*789Sahrens 		(void) strcpy(component, path);
240*789Sahrens 		p = NULL;
241*789Sahrens 	} else if (p[0] == '/') {
242*789Sahrens 		if (p-path >= MAXNAMELEN)
243*789Sahrens 			return (ENAMETOOLONG);
244*789Sahrens 		(void) strncpy(component, path, p - path);
245*789Sahrens 		component[p-path] = '\0';
246*789Sahrens 		p++;
247*789Sahrens 	} else if (p[0] == '@') {
248*789Sahrens 		/*
249*789Sahrens 		 * if the next separator is an @, there better not be
250*789Sahrens 		 * any more slashes.
251*789Sahrens 		 */
252*789Sahrens 		if (strchr(path, '/'))
253*789Sahrens 			return (EINVAL);
254*789Sahrens 		if (p-path >= MAXNAMELEN)
255*789Sahrens 			return (ENAMETOOLONG);
256*789Sahrens 		(void) strncpy(component, path, p - path);
257*789Sahrens 		component[p-path] = '\0';
258*789Sahrens 	} else {
259*789Sahrens 		ASSERT(!"invalid p");
260*789Sahrens 	}
261*789Sahrens 	*nextp = p;
262*789Sahrens 	return (0);
263*789Sahrens }
264*789Sahrens 
265*789Sahrens /*
266*789Sahrens  * same as dsl_open_dir, ignore the first component of name and use the
267*789Sahrens  * spa instead
268*789Sahrens  */
269*789Sahrens dsl_dir_t *
270*789Sahrens dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
271*789Sahrens {
272*789Sahrens 	char buf[MAXNAMELEN];
273*789Sahrens 	const char *next, *nextnext = NULL;
274*789Sahrens 	int err;
275*789Sahrens 	dsl_dir_t *dd;
276*789Sahrens 	dsl_pool_t *dp;
277*789Sahrens 	uint64_t ddobj;
278*789Sahrens 	int openedspa = FALSE;
279*789Sahrens 
280*789Sahrens 	dprintf("%s\n", name);
281*789Sahrens 
282*789Sahrens 	if (name == NULL)
283*789Sahrens 		return (NULL);
284*789Sahrens 	err = getcomponent(name, buf, &next);
285*789Sahrens 	if (err)
286*789Sahrens 		return (NULL);
287*789Sahrens 	if (spa == NULL) {
288*789Sahrens 		err = spa_open(buf, &spa, FTAG);
289*789Sahrens 		if (err) {
290*789Sahrens 			dprintf("spa_open(%s) failed\n", buf);
291*789Sahrens 			return (NULL);
292*789Sahrens 		}
293*789Sahrens 		openedspa = TRUE;
294*789Sahrens 
295*789Sahrens 		/* XXX this assertion belongs in spa_open */
296*789Sahrens 		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
297*789Sahrens 	}
298*789Sahrens 
299*789Sahrens 	dp = spa_get_dsl(spa);
300*789Sahrens 
301*789Sahrens 	rw_enter(&dp->dp_config_rwlock, RW_READER);
302*789Sahrens 	dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
303*789Sahrens 	while (next != NULL) {
304*789Sahrens 		dsl_dir_t *child_ds;
305*789Sahrens 		err = getcomponent(next, buf, &nextnext);
306*789Sahrens 		if (err) {
307*789Sahrens 			dsl_dir_close(dd, tag);
308*789Sahrens 			if (openedspa)
309*789Sahrens 				spa_close(spa, FTAG);
310*789Sahrens 			return (NULL);
311*789Sahrens 		}
312*789Sahrens 		ASSERT(next[0] != '\0');
313*789Sahrens 		if (next[0] == '@')
314*789Sahrens 			break;
315*789Sahrens 		if (dd->dd_phys->dd_child_dir_zapobj == 0)
316*789Sahrens 			break;
317*789Sahrens 		dprintf("looking up %s in obj%lld\n",
318*789Sahrens 		    buf, dd->dd_phys->dd_child_dir_zapobj);
319*789Sahrens 
320*789Sahrens 		err = zap_lookup(dp->dp_meta_objset,
321*789Sahrens 		    dd->dd_phys->dd_child_dir_zapobj,
322*789Sahrens 		    buf, sizeof (ddobj), 1, &ddobj);
323*789Sahrens 		if (err == ENOENT) {
324*789Sahrens 			break;
325*789Sahrens 		}
326*789Sahrens 		ASSERT(err == 0);
327*789Sahrens 
328*789Sahrens 		child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
329*789Sahrens 		dsl_dir_close(dd, tag);
330*789Sahrens 		dd = child_ds;
331*789Sahrens 		next = nextnext;
332*789Sahrens 	}
333*789Sahrens 	rw_exit(&dp->dp_config_rwlock);
334*789Sahrens 
335*789Sahrens 	/*
336*789Sahrens 	 * It's an error if there's more than one component left, or
337*789Sahrens 	 * tailp==NULL and there's any component left.
338*789Sahrens 	 */
339*789Sahrens 	if (next != NULL &&
340*789Sahrens 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
341*789Sahrens 		/* bad path name */
342*789Sahrens 		dsl_dir_close(dd, tag);
343*789Sahrens 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
344*789Sahrens 		next = NULL;
345*789Sahrens 		dd = NULL;
346*789Sahrens 	}
347*789Sahrens 	if (tailp)
348*789Sahrens 		*tailp = next;
349*789Sahrens 	if (openedspa)
350*789Sahrens 		spa_close(spa, FTAG);
351*789Sahrens 	return (dd);
352*789Sahrens }
353*789Sahrens 
354*789Sahrens /*
355*789Sahrens  * Return the dsl_dir_t, and possibly the last component which couldn't
356*789Sahrens  * be found in *tail.  Return NULL if the path is bogus, or if
357*789Sahrens  * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
358*789Sahrens  * means that the last component is a snapshot.
359*789Sahrens  */
360*789Sahrens dsl_dir_t *
361*789Sahrens dsl_dir_open(const char *name, void *tag, const char **tailp)
362*789Sahrens {
363*789Sahrens 	return (dsl_dir_open_spa(NULL, name, tag, tailp));
364*789Sahrens }
365*789Sahrens 
366*789Sahrens int
367*789Sahrens dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
368*789Sahrens {
369*789Sahrens 	objset_t *mos = pds->dd_pool->dp_meta_objset;
370*789Sahrens 	uint64_t ddobj;
371*789Sahrens 	dsl_dir_phys_t *dsphys;
372*789Sahrens 	dmu_buf_t *dbuf;
373*789Sahrens 	int err;
374*789Sahrens 
375*789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
376*789Sahrens 
377*789Sahrens 	if (pds->dd_phys->dd_child_dir_zapobj == 0) {
378*789Sahrens 		dmu_buf_will_dirty(pds->dd_dbuf, tx);
379*789Sahrens 		pds->dd_phys->dd_child_dir_zapobj = zap_create(mos,
380*789Sahrens 		    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
381*789Sahrens 	}
382*789Sahrens 
383*789Sahrens 	rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER);
384*789Sahrens 	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj,
385*789Sahrens 	    name, sizeof (uint64_t), 1, &ddobj);
386*789Sahrens 	if (err != ENOENT) {
387*789Sahrens 		rw_exit(&pds->dd_pool->dp_config_rwlock);
388*789Sahrens 		return (err ? err : EEXIST);
389*789Sahrens 	}
390*789Sahrens 
391*789Sahrens 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
392*789Sahrens 	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
393*789Sahrens 	err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
394*789Sahrens 	    name, sizeof (uint64_t), 1, &ddobj, tx);
395*789Sahrens 	ASSERT3U(err, ==, 0);
396*789Sahrens 	dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
397*789Sahrens 	    name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
398*789Sahrens 
399*789Sahrens 	dbuf = dmu_bonus_hold(mos, ddobj);
400*789Sahrens 	dmu_buf_will_dirty(dbuf, tx);
401*789Sahrens 	dsphys = dbuf->db_data;
402*789Sahrens 
403*789Sahrens 	dsphys->dd_creation_time = gethrestime_sec();
404*789Sahrens 	dsphys->dd_parent_obj = pds->dd_object;
405*789Sahrens 	dsphys->dd_props_zapobj = zap_create(mos,
406*789Sahrens 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
407*789Sahrens 	dsphys->dd_child_dir_zapobj = zap_create(mos,
408*789Sahrens 	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
409*789Sahrens 	dmu_buf_rele(dbuf);
410*789Sahrens 
411*789Sahrens 	rw_exit(&pds->dd_pool->dp_config_rwlock);
412*789Sahrens 
413*789Sahrens 	return (0);
414*789Sahrens }
415*789Sahrens 
416*789Sahrens int
417*789Sahrens dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
418*789Sahrens {
419*789Sahrens 	const char *name = arg;
420*789Sahrens 	dsl_dir_t *dd = NULL;
421*789Sahrens 	dsl_pool_t *dp = pds->dd_pool;
422*789Sahrens 	objset_t *mos = dp->dp_meta_objset;
423*789Sahrens 	uint64_t val, obj, child_zapobj, props_zapobj;
424*789Sahrens 	int t, err;
425*789Sahrens 
426*789Sahrens 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
427*789Sahrens 
428*789Sahrens 	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name,
429*789Sahrens 	    8, 1, &obj);
430*789Sahrens 	if (err)
431*789Sahrens 		goto out;
432*789Sahrens 
433*789Sahrens 	dd = dsl_dir_open_obj(dp, obj, name, FTAG);
434*789Sahrens 	ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
435*789Sahrens 
436*789Sahrens 	if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
437*789Sahrens 		err = EBUSY;
438*789Sahrens 		goto out;
439*789Sahrens 	}
440*789Sahrens 
441*789Sahrens 	for (t = 0; t < TXG_SIZE; t++) {
442*789Sahrens 		/*
443*789Sahrens 		 * if they were dirty, they'd also be open.
444*789Sahrens 		 * dp_config_rwlock ensures that it stays that way.
445*789Sahrens 		 */
446*789Sahrens 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
447*789Sahrens 	}
448*789Sahrens 
449*789Sahrens 	child_zapobj = dd->dd_phys->dd_child_dir_zapobj;
450*789Sahrens 	props_zapobj = dd->dd_phys->dd_props_zapobj;
451*789Sahrens 
452*789Sahrens 	if (child_zapobj != 0) {
453*789Sahrens 		uint64_t count;
454*789Sahrens 		err = EEXIST;
455*789Sahrens 		(void) zap_count(mos, child_zapobj, &count);
456*789Sahrens 		if (count != 0)
457*789Sahrens 			goto out;
458*789Sahrens 	}
459*789Sahrens 
460*789Sahrens 	if (dd->dd_phys->dd_head_dataset_obj != 0) {
461*789Sahrens 		err = dsl_dataset_destroy_sync(dd, NULL, tx);
462*789Sahrens 		if (err)
463*789Sahrens 			goto out;
464*789Sahrens 	}
465*789Sahrens 	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
466*789Sahrens 
467*789Sahrens 	/* The point of no (unsuccessful) return */
468*789Sahrens 
469*789Sahrens 	/* Make sure parent's used gets updated */
470*789Sahrens 	val = 0;
471*789Sahrens 	err = dsl_dir_set_reservation_sync(dd, &val, tx);
472*789Sahrens 	ASSERT(err == 0);
473*789Sahrens 	ASSERT3U(dd->dd_used_bytes, ==, 0);
474*789Sahrens 	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
475*789Sahrens 	dsl_dir_close(dd, FTAG);
476*789Sahrens 	dd = NULL;
477*789Sahrens 
478*789Sahrens 	err = dmu_object_free(mos, obj, tx);
479*789Sahrens 	ASSERT(err == 0);
480*789Sahrens 
481*789Sahrens 	if (child_zapobj)
482*789Sahrens 		err = zap_destroy(mos, child_zapobj, tx);
483*789Sahrens 	ASSERT(err == 0);
484*789Sahrens 
485*789Sahrens 	if (props_zapobj)
486*789Sahrens 		err = zap_destroy(mos, props_zapobj, tx);
487*789Sahrens 	ASSERT(err == 0);
488*789Sahrens 
489*789Sahrens 	err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx);
490*789Sahrens 	ASSERT(err == 0);
491*789Sahrens 
492*789Sahrens out:
493*789Sahrens 	rw_exit(&dp->dp_config_rwlock);
494*789Sahrens 	if (dd)
495*789Sahrens 		dsl_dir_close(dd, FTAG);
496*789Sahrens 
497*789Sahrens 	return (err);
498*789Sahrens }
499*789Sahrens 
500*789Sahrens void
501*789Sahrens dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
502*789Sahrens {
503*789Sahrens 	dsl_dir_phys_t *dsp;
504*789Sahrens 	dmu_buf_t *dbuf;
505*789Sahrens 	int error;
506*789Sahrens 
507*789Sahrens 	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
508*789Sahrens 	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
509*789Sahrens 
510*789Sahrens 	error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
511*789Sahrens 	    sizeof (uint64_t), 1, ddobjp, tx);
512*789Sahrens 	ASSERT3U(error, ==, 0);
513*789Sahrens 
514*789Sahrens 	dbuf = dmu_bonus_hold(mos, *ddobjp);
515*789Sahrens 	dmu_buf_will_dirty(dbuf, tx);
516*789Sahrens 	dsp = dbuf->db_data;
517*789Sahrens 
518*789Sahrens 	dsp->dd_creation_time = gethrestime_sec();
519*789Sahrens 	dsp->dd_props_zapobj = zap_create(mos,
520*789Sahrens 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
521*789Sahrens 	dsp->dd_child_dir_zapobj = zap_create(mos,
522*789Sahrens 	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
523*789Sahrens 
524*789Sahrens 	dmu_buf_rele(dbuf);
525*789Sahrens }
526*789Sahrens 
527*789Sahrens void
528*789Sahrens dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
529*789Sahrens {
530*789Sahrens 	bzero(dds, sizeof (dmu_objset_stats_t));
531*789Sahrens 
532*789Sahrens 	dds->dds_dir_obj = dd->dd_object;
533*789Sahrens 	dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
534*789Sahrens 
535*789Sahrens 	mutex_enter(&dd->dd_lock);
536*789Sahrens 	dds->dds_space_used = dd->dd_used_bytes;
537*789Sahrens 	dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes;
538*789Sahrens 	dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes;
539*789Sahrens 	dds->dds_quota = dd->dd_phys->dd_quota;
540*789Sahrens 	dds->dds_reserved = dd->dd_phys->dd_reserved;
541*789Sahrens 	mutex_exit(&dd->dd_lock);
542*789Sahrens 
543*789Sahrens 	dds->dds_creation_time = dd->dd_phys->dd_creation_time;
544*789Sahrens 
545*789Sahrens 	dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
546*789Sahrens 
547*789Sahrens 	if (dd->dd_phys->dd_clone_parent_obj) {
548*789Sahrens 		dsl_dataset_t *ds;
549*789Sahrens 
550*789Sahrens 		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
551*789Sahrens 		ds = dsl_dataset_open_obj(dd->dd_pool,
552*789Sahrens 		    dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
553*789Sahrens 		dsl_dataset_name(ds, dds->dds_clone_of);
554*789Sahrens 		dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
555*789Sahrens 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
556*789Sahrens 		rw_exit(&dd->dd_pool->dp_config_rwlock);
557*789Sahrens 	}
558*789Sahrens 
559*789Sahrens 	VERIFY(dsl_prop_get_ds_integer(dd, "checksum",
560*789Sahrens 	    &dds->dds_checksum, dds->dds_checksum_setpoint) == 0);
561*789Sahrens 
562*789Sahrens 	VERIFY(dsl_prop_get_ds_integer(dd, "compression",
563*789Sahrens 	    &dds->dds_compression, dds->dds_compression_setpoint) == 0);
564*789Sahrens 
565*789Sahrens 	VERIFY(dsl_prop_get_ds_integer(dd, "zoned",
566*789Sahrens 	    &dds->dds_zoned, dds->dds_zoned_setpoint) == 0);
567*789Sahrens 
568*789Sahrens 	spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
569*789Sahrens 	    sizeof (dds->dds_altroot));
570*789Sahrens }
571*789Sahrens 
572*789Sahrens int
573*789Sahrens dsl_dir_sync_task(dsl_dir_t *dd,
574*789Sahrens     int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space)
575*789Sahrens {
576*789Sahrens 	dmu_tx_t *tx;
577*789Sahrens 	dsl_pool_t *dp = dd->dd_pool;
578*789Sahrens 	int err = 0;
579*789Sahrens 	uint64_t txg;
580*789Sahrens 
581*789Sahrens 	dprintf_dd(dd, "func=%p space=%llu\n", func, space);
582*789Sahrens 
583*789Sahrens again:
584*789Sahrens 	tx = dmu_tx_create_ds(dd);
585*789Sahrens 	dmu_tx_hold_space(tx, space);
586*789Sahrens 	err = dmu_tx_assign(tx, TXG_WAIT);
587*789Sahrens 	if (err == ENOSPC || err == EDQUOT) {
588*789Sahrens 		dsl_dir_t *rds;
589*789Sahrens 		/*
590*789Sahrens 		 * They can get their space from either this dd, or the
591*789Sahrens 		 * root dd.
592*789Sahrens 		 */
593*789Sahrens 		for (rds = dd; rds->dd_parent; rds = rds->dd_parent)
594*789Sahrens 			continue;
595*789Sahrens 		dmu_tx_abort(tx);
596*789Sahrens 		tx = dmu_tx_create_ds(rds);
597*789Sahrens 		dmu_tx_hold_space(tx, space);
598*789Sahrens 		err = dmu_tx_assign(tx, TXG_WAIT);
599*789Sahrens 	}
600*789Sahrens 	if (err) {
601*789Sahrens 		dmu_tx_abort(tx);
602*789Sahrens 		return (err);
603*789Sahrens 	}
604*789Sahrens 
605*789Sahrens 	txg = dmu_tx_get_txg(tx);
606*789Sahrens 	mutex_enter(&dd->dd_lock);
607*789Sahrens 	if (dd->dd_sync_txg != 0) {
608*789Sahrens 		mutex_exit(&dd->dd_lock);
609*789Sahrens 		dmu_tx_commit(tx);
610*789Sahrens 		txg_wait_synced(dp, 0);
611*789Sahrens 		goto again;
612*789Sahrens 	}
613*789Sahrens 
614*789Sahrens 	/* We're good to go */
615*789Sahrens 
616*789Sahrens 	dd->dd_sync_txg = txg;
617*789Sahrens 	dd->dd_sync_func = func;
618*789Sahrens 	dd->dd_sync_arg = arg;
619*789Sahrens 
620*789Sahrens 	mutex_exit(&dd->dd_lock);
621*789Sahrens 
622*789Sahrens 	dsl_dir_dirty(dd, tx);
623*789Sahrens 	dmu_tx_commit(tx);
624*789Sahrens 
625*789Sahrens 	txg_wait_synced(dp, txg);
626*789Sahrens 
627*789Sahrens 	mutex_enter(&dd->dd_lock);
628*789Sahrens 	ASSERT(dd->dd_sync_txg == txg);
629*789Sahrens 	ASSERT(dd->dd_sync_func == NULL);
630*789Sahrens 	err = dd->dd_sync_err;
631*789Sahrens 	dd->dd_sync_txg = 0;
632*789Sahrens 	mutex_exit(&dd->dd_lock);
633*789Sahrens 
634*789Sahrens 	return (err);
635*789Sahrens }
636*789Sahrens 
637*789Sahrens void
638*789Sahrens dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
639*789Sahrens {
640*789Sahrens 	dsl_pool_t *dp = dd->dd_pool;
641*789Sahrens 
642*789Sahrens 	ASSERT(dd->dd_phys);
643*789Sahrens 
644*789Sahrens 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
645*789Sahrens 		/* up the hold count until we can be written out */
646*789Sahrens 		dmu_buf_add_ref(dd->dd_dbuf, dd);
647*789Sahrens 	}
648*789Sahrens }
649*789Sahrens 
650*789Sahrens static int64_t
651*789Sahrens parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
652*789Sahrens {
653*789Sahrens 	uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
654*789Sahrens 	uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
655*789Sahrens 	return (new_accounted - old_accounted);
656*789Sahrens }
657*789Sahrens 
658*789Sahrens void
659*789Sahrens dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
660*789Sahrens {
661*789Sahrens 	if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) {
662*789Sahrens 		dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx);
663*789Sahrens 		dd->dd_sync_func = NULL;
664*789Sahrens 	}
665*789Sahrens 
666*789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
667*789Sahrens 
668*789Sahrens 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
669*789Sahrens 
670*789Sahrens 	mutex_enter(&dd->dd_lock);
671*789Sahrens 	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
672*789Sahrens 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
673*789Sahrens 	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
674*789Sahrens 	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
675*789Sahrens 	dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
676*789Sahrens 	mutex_exit(&dd->dd_lock);
677*789Sahrens 
678*789Sahrens 	/* release the hold from dsl_dir_dirty */
679*789Sahrens 	dmu_buf_remove_ref(dd->dd_dbuf, dd);
680*789Sahrens }
681*789Sahrens 
682*789Sahrens static uint64_t
683*789Sahrens dsl_dir_estimated_space(dsl_dir_t *dd)
684*789Sahrens {
685*789Sahrens 	int64_t space;
686*789Sahrens 	int i;
687*789Sahrens 
688*789Sahrens 	ASSERT(MUTEX_HELD(&dd->dd_lock));
689*789Sahrens 
690*789Sahrens 	space = dd->dd_used_bytes;
691*789Sahrens 	ASSERT(space >= 0);
692*789Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
693*789Sahrens 		space += dd->dd_space_towrite[i&TXG_MASK];
694*789Sahrens 		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
695*789Sahrens 	}
696*789Sahrens 	return (space);
697*789Sahrens }
698*789Sahrens 
699*789Sahrens /*
700*789Sahrens  * How much space would dd have available if ancestor had delta applied
701*789Sahrens  * to it?  If ondiskonly is set, we're only interested in what's
702*789Sahrens  * on-disk, not estimated pending changes.
703*789Sahrens  */
704*789Sahrens static uint64_t
705*789Sahrens dsl_dir_space_available(dsl_dir_t *dd,
706*789Sahrens     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
707*789Sahrens {
708*789Sahrens 	uint64_t parentspace, myspace, quota, used;
709*789Sahrens 
710*789Sahrens 	/*
711*789Sahrens 	 * If there are no restrictions otherwise, assume we have
712*789Sahrens 	 * unlimited space available.
713*789Sahrens 	 */
714*789Sahrens 	quota = UINT64_MAX;
715*789Sahrens 	parentspace = UINT64_MAX;
716*789Sahrens 
717*789Sahrens 	if (dd->dd_parent != NULL) {
718*789Sahrens 		parentspace = dsl_dir_space_available(dd->dd_parent,
719*789Sahrens 		    ancestor, delta, ondiskonly);
720*789Sahrens 	}
721*789Sahrens 
722*789Sahrens 	mutex_enter(&dd->dd_lock);
723*789Sahrens 	if (dd->dd_phys->dd_quota != 0)
724*789Sahrens 		quota = dd->dd_phys->dd_quota;
725*789Sahrens 	if (ondiskonly) {
726*789Sahrens 		used = dd->dd_used_bytes;
727*789Sahrens 	} else {
728*789Sahrens 		used = dsl_dir_estimated_space(dd);
729*789Sahrens 	}
730*789Sahrens 	if (dd == ancestor)
731*789Sahrens 		used += delta;
732*789Sahrens 
733*789Sahrens 	if (dd->dd_parent == NULL) {
734*789Sahrens 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE);
735*789Sahrens 		quota = MIN(quota, poolsize);
736*789Sahrens 	}
737*789Sahrens 
738*789Sahrens 	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
739*789Sahrens 		/*
740*789Sahrens 		 * We have some space reserved, in addition to what our
741*789Sahrens 		 * parent gave us.
742*789Sahrens 		 */
743*789Sahrens 		parentspace += dd->dd_phys->dd_reserved - used;
744*789Sahrens 	}
745*789Sahrens 
746*789Sahrens 	if (used > quota) {
747*789Sahrens 		/* over quota */
748*789Sahrens 		myspace = 0;
749*789Sahrens #ifdef ZFS_DEBUG
750*789Sahrens 		{
751*789Sahrens 			/*
752*789Sahrens 			 * While it's OK to be a little over quota, if
753*789Sahrens 			 * we think we are using more space than there
754*789Sahrens 			 * is in the pool (which is already 6% more than
755*789Sahrens 			 * dsl_pool_adjustedsize()), something is very
756*789Sahrens 			 * wrong.
757*789Sahrens 			 */
758*789Sahrens 			uint64_t space = spa_get_space(dd->dd_pool->dp_spa);
759*789Sahrens 			ASSERT3U(used, <=, space);
760*789Sahrens 		}
761*789Sahrens #endif
762*789Sahrens 	} else {
763*789Sahrens 		/*
764*789Sahrens 		 * the lesser of parent's space and the space
765*789Sahrens 		 * left in our quota
766*789Sahrens 		 */
767*789Sahrens 		myspace = MIN(parentspace, quota - used);
768*789Sahrens 	}
769*789Sahrens 
770*789Sahrens 	mutex_exit(&dd->dd_lock);
771*789Sahrens 
772*789Sahrens 	return (myspace);
773*789Sahrens }
774*789Sahrens 
775*789Sahrens struct tempreserve {
776*789Sahrens 	list_node_t tr_node;
777*789Sahrens 	dsl_dir_t *tr_ds;
778*789Sahrens 	uint64_t tr_size;
779*789Sahrens };
780*789Sahrens 
781*789Sahrens /*
782*789Sahrens  * Reserve space in this dsl_dir, to be used in this tx's txg.
783*789Sahrens  * After the space has been dirtied (and thus
784*789Sahrens  * dsl_dir_willuse_space() has been called), the reservation should
785*789Sahrens  * be canceled, using dsl_dir_tempreserve_clear().
786*789Sahrens  */
787*789Sahrens static int
788*789Sahrens dsl_dir_tempreserve_impl(dsl_dir_t *dd,
789*789Sahrens     uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
790*789Sahrens {
791*789Sahrens 	uint64_t txg = tx->tx_txg;
792*789Sahrens 	uint64_t est_used, quota, parent_rsrv;
793*789Sahrens 	int edquot = EDQUOT;
794*789Sahrens 	int txgidx = txg & TXG_MASK;
795*789Sahrens 	int i;
796*789Sahrens 	struct tempreserve *tr;
797*789Sahrens 
798*789Sahrens 	ASSERT3U(txg, !=, 0);
799*789Sahrens 
800*789Sahrens 	mutex_enter(&dd->dd_lock);
801*789Sahrens 	/*
802*789Sahrens 	 * Check against the dsl_dir's quota.  We don't add in the delta
803*789Sahrens 	 * when checking for over-quota because they get one free hit.
804*789Sahrens 	 */
805*789Sahrens 	est_used = dsl_dir_estimated_space(dd);
806*789Sahrens 	for (i = 0; i < TXG_SIZE; i++)
807*789Sahrens 		est_used += dd->dd_tempreserved[i];
808*789Sahrens 
809*789Sahrens 	quota = UINT64_MAX;
810*789Sahrens 
811*789Sahrens 	if (dd->dd_phys->dd_quota)
812*789Sahrens 		quota = dd->dd_phys->dd_quota;
813*789Sahrens 
814*789Sahrens 	/*
815*789Sahrens 	 * If this transaction will result in a net free of space, we want
816*789Sahrens 	 * to let it through, but we have to be careful: the space that it
817*789Sahrens 	 * frees won't become available until *after* this txg syncs.
818*789Sahrens 	 * Therefore, to ensure that it's possible to remove files from
819*789Sahrens 	 * a full pool without inducing transient overcommits, we throttle
820*789Sahrens 	 * netfree transactions against a quota that is slightly larger,
821*789Sahrens 	 * but still within the pool's allocation slop.  In cases where
822*789Sahrens 	 * we're very close to full, this will allow a steady trickle of
823*789Sahrens 	 * removes to get through.
824*789Sahrens 	 */
825*789Sahrens 	if (dd->dd_parent == NULL) {
826*789Sahrens 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
827*789Sahrens 		if (poolsize < quota) {
828*789Sahrens 			quota = poolsize;
829*789Sahrens 			edquot = ENOSPC;
830*789Sahrens 		}
831*789Sahrens 	} else if (netfree) {
832*789Sahrens 		quota = UINT64_MAX;
833*789Sahrens 	}
834*789Sahrens 
835*789Sahrens 	/*
836*789Sahrens 	 * If they are requesting more space, and our current estimate
837*789Sahrens 	 * is over quota.  They get to try again unless the actual
838*789Sahrens 	 * on-disk is over quota.
839*789Sahrens 	 */
840*789Sahrens 	if (asize > 0 && est_used > quota) {
841*789Sahrens 		if (dd->dd_used_bytes < quota)
842*789Sahrens 			edquot = ERESTART;
843*789Sahrens 		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
844*789Sahrens 		    "quota=%lluK tr=%lluK err=%d\n",
845*789Sahrens 		    dd->dd_used_bytes>>10, est_used>>10,
846*789Sahrens 		    quota>>10, asize>>10, edquot);
847*789Sahrens 		mutex_exit(&dd->dd_lock);
848*789Sahrens 		return (edquot);
849*789Sahrens 	}
850*789Sahrens 
851*789Sahrens 	/* We need to up our estimated delta before dropping dd_lock */
852*789Sahrens 	dd->dd_tempreserved[txgidx] += asize;
853*789Sahrens 
854*789Sahrens 	parent_rsrv = parent_delta(dd, est_used, asize);
855*789Sahrens 	mutex_exit(&dd->dd_lock);
856*789Sahrens 
857*789Sahrens 	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
858*789Sahrens 	tr->tr_ds = dd;
859*789Sahrens 	tr->tr_size = asize;
860*789Sahrens 	list_insert_tail(tr_list, tr);
861*789Sahrens 
862*789Sahrens 	/* see if it's OK with our parent */
863*789Sahrens 	if (dd->dd_parent && parent_rsrv) {
864*789Sahrens 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
865*789Sahrens 		    parent_rsrv, netfree, tr_list, tx));
866*789Sahrens 	} else {
867*789Sahrens 		return (0);
868*789Sahrens 	}
869*789Sahrens }
870*789Sahrens 
871*789Sahrens /*
872*789Sahrens  * Reserve space in this dsl_dir, to be used in this tx's txg.
873*789Sahrens  * After the space has been dirtied (and thus
874*789Sahrens  * dsl_dir_willuse_space() has been called), the reservation should
875*789Sahrens  * be canceled, using dsl_dir_tempreserve_clear().
876*789Sahrens  */
877*789Sahrens int
878*789Sahrens dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
879*789Sahrens     uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
880*789Sahrens {
881*789Sahrens 	int err = 0;
882*789Sahrens 	list_t *tr_list;
883*789Sahrens 
884*789Sahrens 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
885*789Sahrens 	list_create(tr_list, sizeof (struct tempreserve),
886*789Sahrens 	    offsetof(struct tempreserve, tr_node));
887*789Sahrens 
888*789Sahrens 	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
889*789Sahrens 	    tr_list, tx);
890*789Sahrens 
891*789Sahrens 	if (err == 0) {
892*789Sahrens 		struct tempreserve *tr;
893*789Sahrens 
894*789Sahrens 		err = arc_tempreserve_space(lsize);
895*789Sahrens 		if (err == 0) {
896*789Sahrens 			tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
897*789Sahrens 			tr->tr_ds = NULL;
898*789Sahrens 			tr->tr_size = lsize;
899*789Sahrens 			list_insert_tail(tr_list, tr);
900*789Sahrens 		}
901*789Sahrens 	}
902*789Sahrens 
903*789Sahrens 	if (err)
904*789Sahrens 		dsl_dir_tempreserve_clear(tr_list, tx);
905*789Sahrens 	else
906*789Sahrens 		*tr_cookiep = tr_list;
907*789Sahrens 	return (err);
908*789Sahrens }
909*789Sahrens 
910*789Sahrens /*
911*789Sahrens  * Clear a temporary reservation that we previously made with
912*789Sahrens  * dsl_dir_tempreserve_space().
913*789Sahrens  */
914*789Sahrens void
915*789Sahrens dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
916*789Sahrens {
917*789Sahrens 	int txgidx = tx->tx_txg & TXG_MASK;
918*789Sahrens 	list_t *tr_list = tr_cookie;
919*789Sahrens 	struct tempreserve *tr;
920*789Sahrens 
921*789Sahrens 	ASSERT3U(tx->tx_txg, !=, 0);
922*789Sahrens 
923*789Sahrens 	while (tr = list_head(tr_list)) {
924*789Sahrens 		if (tr->tr_ds == NULL) {
925*789Sahrens 			arc_tempreserve_clear(tr->tr_size);
926*789Sahrens 		} else {
927*789Sahrens 			mutex_enter(&tr->tr_ds->dd_lock);
928*789Sahrens 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
929*789Sahrens 			    tr->tr_size);
930*789Sahrens 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
931*789Sahrens 			mutex_exit(&tr->tr_ds->dd_lock);
932*789Sahrens 		}
933*789Sahrens 		list_remove(tr_list, tr);
934*789Sahrens 		kmem_free(tr, sizeof (struct tempreserve));
935*789Sahrens 	}
936*789Sahrens 
937*789Sahrens 	kmem_free(tr_list, sizeof (list_t));
938*789Sahrens }
939*789Sahrens 
940*789Sahrens /*
941*789Sahrens  * Call in open context when we think we're going to write/free space,
942*789Sahrens  * eg. when dirtying data.  Be conservative (ie. OK to write less than
943*789Sahrens  * this or free more than this, but don't write more or free less).
944*789Sahrens  */
945*789Sahrens void
946*789Sahrens dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
947*789Sahrens {
948*789Sahrens 	int64_t parent_space;
949*789Sahrens 	uint64_t est_used;
950*789Sahrens 
951*789Sahrens 	mutex_enter(&dd->dd_lock);
952*789Sahrens 	if (space > 0)
953*789Sahrens 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
954*789Sahrens 
955*789Sahrens 	est_used = dsl_dir_estimated_space(dd);
956*789Sahrens 	parent_space = parent_delta(dd, est_used, space);
957*789Sahrens 	mutex_exit(&dd->dd_lock);
958*789Sahrens 
959*789Sahrens 	/* Make sure that we clean up dd_space_to* */
960*789Sahrens 	dsl_dir_dirty(dd, tx);
961*789Sahrens 
962*789Sahrens 	/* XXX this is potentially expensive and unnecessary... */
963*789Sahrens 	if (parent_space && dd->dd_parent)
964*789Sahrens 		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
965*789Sahrens }
966*789Sahrens 
967*789Sahrens /* call from syncing context when we actually write/free space for this dd */
968*789Sahrens void
969*789Sahrens dsl_dir_diduse_space(dsl_dir_t *dd,
970*789Sahrens     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
971*789Sahrens {
972*789Sahrens 	int64_t accounted_delta;
973*789Sahrens 
974*789Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
975*789Sahrens 
976*789Sahrens 	dsl_dir_dirty(dd, tx);
977*789Sahrens 
978*789Sahrens 	mutex_enter(&dd->dd_lock);
979*789Sahrens 	accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
980*789Sahrens 	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
981*789Sahrens 	ASSERT(compressed >= 0 ||
982*789Sahrens 	    dd->dd_phys->dd_compressed_bytes >= -compressed);
983*789Sahrens 	ASSERT(uncompressed >= 0 ||
984*789Sahrens 	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
985*789Sahrens 	dd->dd_used_bytes += used;
986*789Sahrens 	if (used > 0)
987*789Sahrens 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
988*789Sahrens 	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
989*789Sahrens 	dd->dd_phys->dd_compressed_bytes += compressed;
990*789Sahrens 	mutex_exit(&dd->dd_lock);
991*789Sahrens 
992*789Sahrens 	if (dd->dd_parent != NULL) {
993*789Sahrens 		dsl_dir_diduse_space(dd->dd_parent,
994*789Sahrens 		    accounted_delta, compressed, uncompressed, tx);
995*789Sahrens 	}
996*789Sahrens }
997*789Sahrens 
998*789Sahrens static int
999*789Sahrens dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1000*789Sahrens {
1001*789Sahrens 	uint64_t *quotap = arg;
1002*789Sahrens 	uint64_t new_quota = *quotap;
1003*789Sahrens 	int err = 0;
1004*789Sahrens 
1005*789Sahrens 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1006*789Sahrens 
1007*789Sahrens 	mutex_enter(&dd->dd_lock);
1008*789Sahrens 	if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved ||
1009*789Sahrens 	    new_quota < dsl_dir_estimated_space(dd))) {
1010*789Sahrens 		err = ENOSPC;
1011*789Sahrens 	} else {
1012*789Sahrens 		dd->dd_phys->dd_quota = new_quota;
1013*789Sahrens 	}
1014*789Sahrens 	mutex_exit(&dd->dd_lock);
1015*789Sahrens 	return (err);
1016*789Sahrens }
1017*789Sahrens 
1018*789Sahrens int
1019*789Sahrens dsl_dir_set_quota(const char *ddname, uint64_t quota)
1020*789Sahrens {
1021*789Sahrens 	dsl_dir_t *dd;
1022*789Sahrens 	int err;
1023*789Sahrens 
1024*789Sahrens 	dd = dsl_dir_open(ddname, FTAG, NULL);
1025*789Sahrens 	if (dd == NULL)
1026*789Sahrens 		return (ENOENT);
1027*789Sahrens 	/*
1028*789Sahrens 	 * If someone removes a file, then tries to set the quota, we
1029*789Sahrens 	 * want to make sure the file freeing takes effect.
1030*789Sahrens 	 */
1031*789Sahrens 	txg_wait_open(dd->dd_pool, 0);
1032*789Sahrens 
1033*789Sahrens 	err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, &quota, 0);
1034*789Sahrens 	dsl_dir_close(dd, FTAG);
1035*789Sahrens 	return (err);
1036*789Sahrens }
1037*789Sahrens 
1038*789Sahrens static int
1039*789Sahrens dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1040*789Sahrens {
1041*789Sahrens 	uint64_t *reservationp = arg;
1042*789Sahrens 	uint64_t new_reservation = *reservationp;
1043*789Sahrens 	uint64_t used, avail;
1044*789Sahrens 	int64_t delta;
1045*789Sahrens 
1046*789Sahrens 	if (new_reservation > INT64_MAX)
1047*789Sahrens 		return (EOVERFLOW);
1048*789Sahrens 
1049*789Sahrens 	mutex_enter(&dd->dd_lock);
1050*789Sahrens 	used = dd->dd_used_bytes;
1051*789Sahrens 	delta = MAX(used, new_reservation) -
1052*789Sahrens 	    MAX(used, dd->dd_phys->dd_reserved);
1053*789Sahrens 	mutex_exit(&dd->dd_lock);
1054*789Sahrens 
1055*789Sahrens 	if (dd->dd_parent) {
1056*789Sahrens 		avail = dsl_dir_space_available(dd->dd_parent,
1057*789Sahrens 		    NULL, 0, FALSE);
1058*789Sahrens 	} else {
1059*789Sahrens 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1060*789Sahrens 	}
1061*789Sahrens 
1062*789Sahrens 	if (delta > 0 && delta > avail)
1063*789Sahrens 		return (ENOSPC);
1064*789Sahrens 	if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
1065*789Sahrens 	    new_reservation > dd->dd_phys->dd_quota)
1066*789Sahrens 		return (ENOSPC);
1067*789Sahrens 
1068*789Sahrens 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1069*789Sahrens 	dd->dd_phys->dd_reserved = new_reservation;
1070*789Sahrens 
1071*789Sahrens 	if (dd->dd_parent != NULL) {
1072*789Sahrens 		/* Roll up this additional usage into our ancestors */
1073*789Sahrens 		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
1074*789Sahrens 	}
1075*789Sahrens 	return (0);
1076*789Sahrens }
1077*789Sahrens 
1078*789Sahrens int
1079*789Sahrens dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
1080*789Sahrens {
1081*789Sahrens 	dsl_dir_t *dd;
1082*789Sahrens 	int err;
1083*789Sahrens 
1084*789Sahrens 	dd = dsl_dir_open(ddname, FTAG, NULL);
1085*789Sahrens 	if (dd == NULL)
1086*789Sahrens 		return (ENOENT);
1087*789Sahrens 	err = dsl_dir_sync_task(dd,
1088*789Sahrens 	    dsl_dir_set_reservation_sync, &reservation, 0);
1089*789Sahrens 	dsl_dir_close(dd, FTAG);
1090*789Sahrens 	return (err);
1091*789Sahrens }
1092*789Sahrens 
1093*789Sahrens static dsl_dir_t *
1094*789Sahrens closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1095*789Sahrens {
1096*789Sahrens 	for (; ds1; ds1 = ds1->dd_parent) {
1097*789Sahrens 		dsl_dir_t *dd;
1098*789Sahrens 		for (dd = ds2; dd; dd = dd->dd_parent) {
1099*789Sahrens 			if (ds1 == dd)
1100*789Sahrens 				return (dd);
1101*789Sahrens 		}
1102*789Sahrens 	}
1103*789Sahrens 	return (NULL);
1104*789Sahrens }
1105*789Sahrens 
1106*789Sahrens /*
1107*789Sahrens  * If delta is applied to dd, how much of that delta would be applied to
1108*789Sahrens  * ancestor?  Syncing context only.
1109*789Sahrens  */
1110*789Sahrens static int64_t
1111*789Sahrens would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1112*789Sahrens {
1113*789Sahrens 	if (dd == ancestor)
1114*789Sahrens 		return (delta);
1115*789Sahrens 
1116*789Sahrens 	mutex_enter(&dd->dd_lock);
1117*789Sahrens 	delta = parent_delta(dd, dd->dd_used_bytes, delta);
1118*789Sahrens 	mutex_exit(&dd->dd_lock);
1119*789Sahrens 	return (would_change(dd->dd_parent, delta, ancestor));
1120*789Sahrens }
1121*789Sahrens 
1122*789Sahrens int
1123*789Sahrens dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1124*789Sahrens {
1125*789Sahrens 	const char *newname = arg;
1126*789Sahrens 	dsl_pool_t *dp = dd->dd_pool;
1127*789Sahrens 	objset_t *mos = dp->dp_meta_objset;
1128*789Sahrens 	dsl_dir_t *newpds;
1129*789Sahrens 	const char *tail;
1130*789Sahrens 	int err, len;
1131*789Sahrens 
1132*789Sahrens 	/* can't rename to different pool */
1133*789Sahrens 	len = strlen(dp->dp_root_dir->dd_myname);
1134*789Sahrens 	if (strncmp(dp->dp_root_dir->dd_myname, newname, len != 0) ||
1135*789Sahrens 	    newname[len] != '/') {
1136*789Sahrens 		return (ENXIO);
1137*789Sahrens 	}
1138*789Sahrens 
1139*789Sahrens 	newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
1140*789Sahrens 
1141*789Sahrens 	/* new parent should exist */
1142*789Sahrens 	if (newpds == NULL)
1143*789Sahrens 		return (ENOENT);
1144*789Sahrens 
1145*789Sahrens 	/* new name should not already exist */
1146*789Sahrens 	if (tail == NULL) {
1147*789Sahrens 		dsl_dir_close(newpds, FTAG);
1148*789Sahrens 		return (EEXIST);
1149*789Sahrens 	}
1150*789Sahrens 
1151*789Sahrens 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
1152*789Sahrens 
1153*789Sahrens 	/* There should be 2 references: the open and the dirty */
1154*789Sahrens 	if (dmu_buf_refcount(dd->dd_dbuf) > 2) {
1155*789Sahrens 		rw_exit(&dp->dp_config_rwlock);
1156*789Sahrens 		dsl_dir_close(newpds, FTAG);
1157*789Sahrens 		return (EBUSY);
1158*789Sahrens 	}
1159*789Sahrens 
1160*789Sahrens 	if (newpds != dd->dd_parent) {
1161*789Sahrens 		dsl_dir_t *ancestor;
1162*789Sahrens 		int64_t adelta;
1163*789Sahrens 		uint64_t myspace, avail;
1164*789Sahrens 
1165*789Sahrens 		ancestor = closest_common_ancestor(dd, newpds);
1166*789Sahrens 
1167*789Sahrens 		/* no rename into our descendent */
1168*789Sahrens 		if (ancestor == dd) {
1169*789Sahrens 			dsl_dir_close(newpds, FTAG);
1170*789Sahrens 			rw_exit(&dp->dp_config_rwlock);
1171*789Sahrens 			return (EINVAL);
1172*789Sahrens 		}
1173*789Sahrens 
1174*789Sahrens 		myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
1175*789Sahrens 		adelta = would_change(dd->dd_parent, -myspace, ancestor);
1176*789Sahrens 		avail = dsl_dir_space_available(newpds,
1177*789Sahrens 		    ancestor, adelta, FALSE);
1178*789Sahrens 		if (avail < myspace) {
1179*789Sahrens 			dsl_dir_close(newpds, FTAG);
1180*789Sahrens 			rw_exit(&dp->dp_config_rwlock);
1181*789Sahrens 			return (ENOSPC);
1182*789Sahrens 		}
1183*789Sahrens 
1184*789Sahrens 		/* The point of no (unsuccessful) return */
1185*789Sahrens 
1186*789Sahrens 		dsl_dir_diduse_space(dd->dd_parent, -myspace,
1187*789Sahrens 		    -dd->dd_phys->dd_compressed_bytes,
1188*789Sahrens 		    -dd->dd_phys->dd_uncompressed_bytes, tx);
1189*789Sahrens 		dsl_dir_diduse_space(newpds, myspace,
1190*789Sahrens 		    dd->dd_phys->dd_compressed_bytes,
1191*789Sahrens 		    dd->dd_phys->dd_uncompressed_bytes, tx);
1192*789Sahrens 	}
1193*789Sahrens 
1194*789Sahrens 	/* The point of no (unsuccessful) return */
1195*789Sahrens 
1196*789Sahrens 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1197*789Sahrens 
1198*789Sahrens 	/* remove from old parent zapobj */
1199*789Sahrens 	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1200*789Sahrens 	    dd->dd_myname, tx);
1201*789Sahrens 	ASSERT3U(err, ==, 0);
1202*789Sahrens 
1203*789Sahrens 	(void) strcpy(dd->dd_myname, tail);
1204*789Sahrens 	dsl_dir_close(dd->dd_parent, dd);
1205*789Sahrens 	dd->dd_phys->dd_parent_obj = newpds->dd_object;
1206*789Sahrens 	dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
1207*789Sahrens 	    newpds->dd_object, NULL, dd);
1208*789Sahrens 
1209*789Sahrens 	/* add to new parent zapobj */
1210*789Sahrens 	err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
1211*789Sahrens 	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
1212*789Sahrens 	ASSERT3U(err, ==, 0);
1213*789Sahrens 
1214*789Sahrens 	dsl_dir_close(newpds, FTAG);
1215*789Sahrens 	rw_exit(&dp->dp_config_rwlock);
1216*789Sahrens 	return (0);
1217*789Sahrens }
1218