xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c (revision ba2539a9805a0544ff82c0003cc02fe1eee5603d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/time.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/resource.h>
32 #include <sys/vfs.h>
33 #include <sys/vnode.h>
34 #include <sys/file.h>
35 #include <sys/kmem.h>
36 #include <sys/uio.h>
37 #include <sys/cmn_err.h>
38 #include <sys/errno.h>
39 #include <sys/stat.h>
40 #include <sys/unistd.h>
41 #include <sys/sunddi.h>
42 #include <sys/random.h>
43 #include <sys/policy.h>
44 #ifdef __FreeBSD__
45 #include <sys/kcondvar.h>
46 #include <sys/callb.h>
47 #include <sys/smp.h>
48 #endif
49 #include <sys/zfs_dir.h>
50 #include <sys/zfs_acl.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/zap.h>
53 #include <sys/dmu.h>
54 #include <sys/atomic.h>
55 #include <sys/zfs_ctldir.h>
56 #include <sys/zfs_fuid.h>
57 #include <sys/sa.h>
58 #include <sys/zfs_sa.h>
59 #include <sys/dnlc.h>
60 #include <sys/extdirent.h>
61 
62 /*
63  * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups
64  * of names after deciding which is the appropriate lookup interface.
65  */
66 static int
zfs_match_find(zfsvfs_t * zfsvfs,znode_t * dzp,const char * name,boolean_t exact,uint64_t * zoid)67 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
68     boolean_t exact, uint64_t *zoid)
69 {
70 	int error;
71 
72 	if (zfsvfs->z_norm) {
73 		matchtype_t mt = exact? MT_EXACT : MT_FIRST;
74 
75 		/*
76 		 * In the non-mixed case we only expect there would ever
77 		 * be one match, but we need to use the normalizing lookup.
78 		 */
79 		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
80 		    zoid, mt, NULL, 0, NULL);
81 	} else {
82 		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
83 	}
84 	*zoid = ZFS_DIRENT_OBJ(*zoid);
85 
86 	return (error);
87 }
88 
89 /*
90  * Look up a directory entry under a locked vnode.
91  * dvp being locked gives us a guarantee that there are no concurrent
92  * modification of the directory and, thus, if a node can be found in
93  * the directory, then it must not be unlinked.
94  *
95  * Input arguments:
96  *	dzp	- znode for directory
97  *	name	- name of entry to lock
98  *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
99  *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
100  *		  ZXATTR: we want dzp's xattr directory
101  *
102  * Output arguments:
103  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
104  *
105  * Return value: 0 on success or errno on failure.
106  *
107  * NOTE: Always checks for, and rejects, '.' and '..'.
108  */
109 int
zfs_dirent_lookup(znode_t * dzp,const char * name,znode_t ** zpp,int flag)110 zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
111 {
112 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
113 	boolean_t	exact;
114 	uint64_t	zoid;
115 	vnode_t		*vp = NULL;
116 	int		error = 0;
117 
118 	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
119 
120 	*zpp = NULL;
121 
122 	/*
123 	 * Verify that we are not trying to lock '.', '..', or '.zfs'
124 	 */
125 	if (name[0] == '.' &&
126 	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
127 	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
128 		return (SET_ERROR(EEXIST));
129 
130 	/*
131 	 * Case sensitivity and normalization preferences are set when
132 	 * the file system is created.  These are stored in the
133 	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
134 	 * affect how we perform zap lookups.
135 	 *
136 	 * Decide if exact matches should be requested when performing
137 	 * a zap lookup on file systems supporting case-insensitive
138 	 * access.
139 	 *
140 	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
141 	 * because in that case MT_EXACT and MT_FIRST should produce exactly
142 	 * the same result.
143 	 */
144 	exact = zfsvfs->z_case == ZFS_CASE_MIXED;
145 
146 	if (dzp->z_unlinked && !(flag & ZXATTR))
147 		return (ENOENT);
148 	if (flag & ZXATTR) {
149 		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
150 		    sizeof (zoid));
151 		if (error == 0)
152 			error = (zoid == 0 ? ENOENT : 0);
153 	} else {
154 		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
155 	}
156 	if (error) {
157 		if (error != ENOENT || (flag & ZEXISTS)) {
158 			return (error);
159 		}
160 	} else {
161 		if (flag & ZNEW) {
162 			return (SET_ERROR(EEXIST));
163 		}
164 		error = zfs_zget(zfsvfs, zoid, zpp);
165 		if (error)
166 			return (error);
167 		ASSERT(!(*zpp)->z_unlinked);
168 	}
169 
170 	return (0);
171 }
172 
173 static int
zfs_dd_lookup(znode_t * dzp,znode_t ** zpp)174 zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
175 {
176 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
177 	znode_t *zp;
178 	uint64_t parent;
179 	int error;
180 
181 	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
182 	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
183 
184 	if (dzp->z_unlinked)
185 		return (ENOENT);
186 
187 	if ((error = sa_lookup(dzp->z_sa_hdl,
188 	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
189 		return (error);
190 
191 	error = zfs_zget(zfsvfs, parent, &zp);
192 	if (error == 0)
193 		*zpp = zp;
194 	return (error);
195 }
196 
197 int
zfs_dirlook(znode_t * dzp,const char * name,znode_t ** zpp)198 zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
199 {
200 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
201 	znode_t *zp;
202 	int error = 0;
203 
204 	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
205 	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
206 
207 	if (dzp->z_unlinked)
208 		return (SET_ERROR(ENOENT));
209 
210 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
211 		*zpp = dzp;
212 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
213 		error = zfs_dd_lookup(dzp, zpp);
214 	} else {
215 		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
216 		if (error == 0) {
217 			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
218 			*zpp = zp;
219 		}
220 	}
221 	return (error);
222 }
223 
224 /*
225  * unlinked Set (formerly known as the "delete queue") Error Handling
226  *
227  * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
228  * don't specify the name of the entry that we will be manipulating.  We
229  * also fib and say that we won't be adding any new entries to the
230  * unlinked set, even though we might (this is to lower the minimum file
231  * size that can be deleted in a full filesystem).  So on the small
232  * chance that the nlink list is using a fat zap (ie. has more than
233  * 2000 entries), we *may* not pre-read a block that's needed.
234  * Therefore it is remotely possible for some of the assertions
235  * regarding the unlinked set below to fail due to i/o error.  On a
236  * nondebug system, this will result in the space being leaked.
237  */
238 void
zfs_unlinked_add(znode_t * zp,dmu_tx_t * tx)239 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
240 {
241 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
242 
243 	ASSERT(zp->z_unlinked);
244 	ASSERT(zp->z_links == 0);
245 
246 	VERIFY3U(0, ==,
247 	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
248 }
249 
250 /*
251  * Clean up any znodes that had no links when we either crashed or
252  * (force) umounted the file system.
253  */
254 void
zfs_unlinked_drain(zfsvfs_t * zfsvfs)255 zfs_unlinked_drain(zfsvfs_t *zfsvfs)
256 {
257 	zap_cursor_t	zc;
258 	zap_attribute_t zap;
259 	dmu_object_info_t doi;
260 	znode_t		*zp;
261 	int		error;
262 
263 	/*
264 	 * Interate over the contents of the unlinked set.
265 	 */
266 	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
267 	    zap_cursor_retrieve(&zc, &zap) == 0;
268 	    zap_cursor_advance(&zc)) {
269 
270 		/*
271 		 * See what kind of object we have in list
272 		 */
273 
274 		error = dmu_object_info(zfsvfs->z_os,
275 		    zap.za_first_integer, &doi);
276 		if (error != 0)
277 			continue;
278 
279 		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
280 		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
281 		/*
282 		 * We need to re-mark these list entries for deletion,
283 		 * so we pull them back into core and set zp->z_unlinked.
284 		 */
285 		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
286 
287 		/*
288 		 * We may pick up znodes that are already marked for deletion.
289 		 * This could happen during the purge of an extended attribute
290 		 * directory.  All we need to do is skip over them, since they
291 		 * are already in the system marked z_unlinked.
292 		 */
293 		if (error != 0)
294 			continue;
295 
296 		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
297 		zp->z_unlinked = B_TRUE;
298 		vput(ZTOV(zp));
299 	}
300 	zap_cursor_fini(&zc);
301 }
302 
303 /*
304  * Delete the entire contents of a directory.  Return a count
305  * of the number of entries that could not be deleted. If we encounter
306  * an error, return a count of at least one so that the directory stays
307  * in the unlinked set.
308  *
309  * NOTE: this function assumes that the directory is inactive,
310  *	so there is no need to lock its entries before deletion.
311  *	Also, it assumes the directory contents is *only* regular
312  *	files.
313  */
314 static int
zfs_purgedir(znode_t * dzp)315 zfs_purgedir(znode_t *dzp)
316 {
317 	zap_cursor_t	zc;
318 	zap_attribute_t	zap;
319 	znode_t		*xzp;
320 	dmu_tx_t	*tx;
321 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
322 	int skipped = 0;
323 	int error;
324 
325 	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
326 	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
327 	    zap_cursor_advance(&zc)) {
328 		error = zfs_zget(zfsvfs,
329 		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
330 		if (error) {
331 			skipped += 1;
332 			continue;
333 		}
334 
335 		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
336 		ASSERT((ZTOV(xzp)->v_type == VREG) ||
337 		    (ZTOV(xzp)->v_type == VLNK));
338 
339 		tx = dmu_tx_create(zfsvfs->z_os);
340 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
341 		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
342 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
343 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
344 		/* Is this really needed ? */
345 		zfs_sa_upgrade_txholds(tx, xzp);
346 		dmu_tx_mark_netfree(tx);
347 		error = dmu_tx_assign(tx, TXG_WAIT);
348 		if (error) {
349 			dmu_tx_abort(tx);
350 			vput(ZTOV(xzp));
351 			skipped += 1;
352 			continue;
353 		}
354 
355 		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
356 		if (error)
357 			skipped += 1;
358 		dmu_tx_commit(tx);
359 
360 		vput(ZTOV(xzp));
361 	}
362 	zap_cursor_fini(&zc);
363 	if (error != ENOENT)
364 		skipped += 1;
365 	return (skipped);
366 }
367 
368 void
zfs_rmnode(znode_t * zp)369 zfs_rmnode(znode_t *zp)
370 {
371 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
372 	objset_t	*os = zfsvfs->z_os;
373 	znode_t		*xzp = NULL;
374 	dmu_tx_t	*tx;
375 	uint64_t	acl_obj;
376 	uint64_t	xattr_obj;
377 	int		error;
378 
379 	ASSERT(zp->z_links == 0);
380 #ifndef __NetBSD__
381 	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
382 #endif
383 
384 	/*
385 	 * If this is an attribute directory, purge its contents.
386 	 */
387 	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
388 	    (zp->z_pflags & ZFS_XATTR)) {
389 		if (zfs_purgedir(zp) != 0) {
390 			/*
391 			 * Not enough space to delete some xattrs.
392 			 * Leave it in the unlinked set.
393 			 */
394 			zfs_znode_dmu_fini(zp);
395 			zfs_znode_free(zp);
396 			return;
397 		}
398 	} else {
399 		/*
400 		 * Free up all the data in the file.  We don't do this for
401 		 * XATTR directories because we need truncate and remove to be
402 		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
403 		 * we crash here we'll end up with an inconsistent truncated
404 		 * zap object in the delete queue.  Note a truncated file is
405 		 * harmless since it only contains user data.
406 		 */
407 		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
408 		if (error) {
409 			/*
410 			 * Not enough space.  Leave the file in the unlinked
411 			 * set.
412 			 */
413 			zfs_znode_dmu_fini(zp);
414 			zfs_znode_free(zp);
415 			return;
416 		}
417 	}
418 
419 	/*
420 	 * If the file has extended attributes, we're going to unlink
421 	 * the xattr dir.
422 	 */
423 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
424 	    &xattr_obj, sizeof (xattr_obj));
425 	if (error == 0 && xattr_obj) {
426 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
427 		ASSERT3S(error, ==, 0);
428 		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
429 	}
430 
431 	acl_obj = zfs_external_acl(zp);
432 
433 	/*
434 	 * Set up the final transaction.
435 	 */
436 	tx = dmu_tx_create(os);
437 	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
438 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
439 	if (xzp) {
440 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
441 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
442 	}
443 	if (acl_obj)
444 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
445 
446 	zfs_sa_upgrade_txholds(tx, zp);
447 	error = dmu_tx_assign(tx, TXG_WAIT);
448 	if (error) {
449 		/*
450 		 * Not enough space to delete the file.  Leave it in the
451 		 * unlinked set, leaking it until the fs is remounted (at
452 		 * which point we'll call zfs_unlinked_drain() to process it).
453 		 */
454 		dmu_tx_abort(tx);
455 		zfs_znode_dmu_fini(zp);
456 		zfs_znode_free(zp);
457 		goto out;
458 	}
459 
460 	if (xzp) {
461 		ASSERT(error == 0);
462 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
463 		xzp->z_links = 0;	/* no more links to it */
464 		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
465 		    &xzp->z_links, sizeof (xzp->z_links), tx));
466 		zfs_unlinked_add(xzp, tx);
467 	}
468 
469 	/* Remove this znode from the unlinked set */
470 	VERIFY3U(0, ==,
471 	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
472 
473 	zfs_znode_delete(zp, tx);
474 
475 	dmu_tx_commit(tx);
476 out:
477 	if (xzp)
478 		vput(ZTOV(xzp));
479 }
480 
481 static uint64_t
zfs_dirent(znode_t * zp,uint64_t mode)482 zfs_dirent(znode_t *zp, uint64_t mode)
483 {
484 	uint64_t de = zp->z_id;
485 
486 	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
487 		de |= IFTODT(mode) << 60;
488 	return (de);
489 }
490 
491 /*
492  * Link zp into dzp.  Can only fail if zp has been unlinked.
493  */
494 int
zfs_link_create(znode_t * dzp,const char * name,znode_t * zp,dmu_tx_t * tx,int flag)495 zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
496     int flag)
497 {
498 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
499 	vnode_t *vp = ZTOV(zp);
500 	uint64_t value;
501 	int zp_is_dir = (vp->v_type == VDIR);
502 	sa_bulk_attr_t bulk[5];
503 	uint64_t mtime[2], ctime[2];
504 	int count = 0;
505 	int error;
506 
507 	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
508 	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
509 #if 0
510 	if (zp_is_dir) {
511 		error = 0;
512 		if (dzp->z_links >= LINK_MAX)
513 			error = SET_ERROR(EMLINK);
514 		return (error);
515 	}
516 #endif
517 	if (!(flag & ZRENAMING)) {
518 		if (zp->z_unlinked) {	/* no new links to unlinked zp */
519 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
520 			return (SET_ERROR(ENOENT));
521 		}
522 #if 0
523 		if (zp->z_links >= LINK_MAX) {
524 			return (SET_ERROR(EMLINK));
525 		}
526 #endif
527 		zp->z_links++;
528 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
529 		    &zp->z_links, sizeof (zp->z_links));
530 
531 	} else {
532 		ASSERT(zp->z_unlinked == 0);
533 	}
534 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
535 	    &dzp->z_id, sizeof (dzp->z_id));
536 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
537 	    &zp->z_pflags, sizeof (zp->z_pflags));
538 
539 	if (!(flag & ZNEW)) {
540 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
541 		    ctime, sizeof (ctime));
542 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
543 		    ctime, B_TRUE);
544 	}
545 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
546 	ASSERT0(error);
547 
548 	dzp->z_size++;
549 	dzp->z_links += zp_is_dir;
550 	count = 0;
551 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
552 	    &dzp->z_size, sizeof (dzp->z_size));
553 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
554 	    &dzp->z_links, sizeof (dzp->z_links));
555 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
556 	    mtime, sizeof (mtime));
557 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
558 	    ctime, sizeof (ctime));
559 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
560 	    &dzp->z_pflags, sizeof (dzp->z_pflags));
561 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
562 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
563 	ASSERT0(error);
564 
565 	value = zfs_dirent(zp, zp->z_mode);
566 	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
567 	    8, 1, &value, tx);
568 	VERIFY0(error);
569 
570 	return (0);
571 }
572 
573 static int
zfs_dropname(znode_t * dzp,const char * name,znode_t * zp,dmu_tx_t * tx,int flag)574 zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
575     int flag)
576 {
577 	int error;
578 
579 	if (zp->z_zfsvfs->z_norm) {
580 		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
581 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
582 			    dzp->z_id, name, MT_EXACT, tx);
583 		else
584 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
585 			    dzp->z_id, name, MT_FIRST, tx);
586 	} else {
587 		error = zap_remove(zp->z_zfsvfs->z_os,
588 		    dzp->z_id, name, tx);
589 	}
590 
591 	return (error);
592 }
593 
594 /*
595  * Unlink zp from dzp, and mark zp for deletion if this was the last link.
596  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
597  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
598  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
599  * and it's the caller's job to do it.
600  */
601 int
zfs_link_destroy(znode_t * dzp,const char * name,znode_t * zp,dmu_tx_t * tx,int flag,boolean_t * unlinkedp)602 zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
603     int flag, boolean_t *unlinkedp)
604 {
605 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
606 	vnode_t *vp = ZTOV(zp);
607 	int zp_is_dir = (vp->v_type == VDIR);
608 	boolean_t unlinked = B_FALSE;
609 	sa_bulk_attr_t bulk[5];
610 	uint64_t mtime[2], ctime[2];
611 	int count = 0;
612 	int error;
613 
614 	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
615 	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
616 
617 	if (!(flag & ZRENAMING)) {
618 
619 		if (zp_is_dir && !zfs_dirempty(zp)) {
620 #ifdef illumos
621 			return (SET_ERROR(EEXIST));
622 #else
623 			return (SET_ERROR(ENOTEMPTY));
624 #endif
625 		}
626 
627 		/*
628 		 * If we get here, we are going to try to remove the object.
629 		 * First try removing the name from the directory; if that
630 		 * fails, return the error.
631 		 */
632 		error = zfs_dropname(dzp, name, zp, tx, flag);
633 		if (error != 0) {
634 			return (error);
635 		}
636 
637 		if (zp->z_links <= zp_is_dir) {
638 			zfs_panic_recover("zfs: link count on vnode %p is %u, "
639 			    "should be at least %u", zp->z_vnode,
640 			    (int)zp->z_links,
641 			    zp_is_dir + 1);
642 			zp->z_links = zp_is_dir + 1;
643 		}
644 		if (--zp->z_links == zp_is_dir) {
645 			zp->z_unlinked = B_TRUE;
646 			zp->z_links = 0;
647 			unlinked = B_TRUE;
648 		} else {
649 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
650 			    NULL, &ctime, sizeof (ctime));
651 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
652 			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
653 			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
654 			    B_TRUE);
655 		}
656 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
657 		    NULL, &zp->z_links, sizeof (zp->z_links));
658 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
659 		count = 0;
660 		ASSERT0(error);
661 	} else {
662 		ASSERT(zp->z_unlinked == 0);
663 		error = zfs_dropname(dzp, name, zp, tx, flag);
664 		if (error != 0)
665 			return (error);
666 	}
667 
668 	dzp->z_size--;		/* one dirent removed */
669 	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
670 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
671 	    NULL, &dzp->z_links, sizeof (dzp->z_links));
672 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
673 	    NULL, &dzp->z_size, sizeof (dzp->z_size));
674 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
675 	    NULL, ctime, sizeof (ctime));
676 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
677 	    NULL, mtime, sizeof (mtime));
678 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
679 	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
680 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
681 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
682 	ASSERT0(error);
683 
684 	if (unlinkedp != NULL)
685 		*unlinkedp = unlinked;
686 	else if (unlinked)
687 		zfs_unlinked_add(zp, tx);
688 
689 	return (0);
690 }
691 
692 /*
693  * Indicate whether the directory is empty.
694  */
695 boolean_t
zfs_dirempty(znode_t * dzp)696 zfs_dirempty(znode_t *dzp)
697 {
698 	return (dzp->z_size == 2);
699 }
700 
701 int
zfs_make_xattrdir(znode_t * zp,vattr_t * vap,vnode_t ** xvpp,cred_t * cr)702 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
703 {
704 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
705 	znode_t *xzp;
706 	dmu_tx_t *tx;
707 	int error;
708 	zfs_acl_ids_t acl_ids;
709 	boolean_t fuid_dirtied;
710 	uint64_t parent;
711 
712 	*xvpp = NULL;
713 
714 	/*
715 	 * In FreeBSD, access checking for creating an EA is being done
716 	 * in zfs_setextattr(),
717 	 */
718 #ifndef __FreeBSD_kernel__
719 	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
720 		return (error);
721 #endif
722 
723 	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
724 	    &acl_ids)) != 0)
725 		return (error);
726 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
727 		zfs_acl_ids_free(&acl_ids);
728 		return (SET_ERROR(EDQUOT));
729 	}
730 
731 	getnewvnode_reserve(1);
732 
733 	tx = dmu_tx_create(zfsvfs->z_os);
734 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
735 	    ZFS_SA_BASE_ATTR_SIZE);
736 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
737 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
738 	fuid_dirtied = zfsvfs->z_fuid_dirty;
739 	if (fuid_dirtied)
740 		zfs_fuid_txhold(zfsvfs, tx);
741 	error = dmu_tx_assign(tx, TXG_WAIT);
742 	if (error) {
743 		zfs_acl_ids_free(&acl_ids);
744 		dmu_tx_abort(tx);
745 		return (error);
746 	}
747 	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
748 
749 	if (fuid_dirtied)
750 		zfs_fuid_sync(zfsvfs, tx);
751 
752 #ifdef DEBUG
753 	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
754 	    &parent, sizeof (parent));
755 	ASSERT(error == 0 && parent == zp->z_id);
756 #endif
757 
758 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
759 	    sizeof (xzp->z_id), tx));
760 
761 	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
762 	    xzp, "", NULL, acl_ids.z_fuidp, vap);
763 
764 	zfs_acl_ids_free(&acl_ids);
765 	dmu_tx_commit(tx);
766 
767 	getnewvnode_drop_reserve();
768 
769 	*xvpp = ZTOV(xzp);
770 
771 	return (0);
772 }
773 
774 /*
775  * Return a znode for the extended attribute directory for zp.
776  * ** If the directory does not already exist, it is created **
777  *
778  *	IN:	zp	- znode to obtain attribute directory from
779  *		cr	- credentials of caller
780  *		flags	- flags from the VOP_LOOKUP call
781  *
782  *	OUT:	xzpp	- pointer to extended attribute znode
783  *
784  *	RETURN:	0 on success
785  *		error number on failure
786  */
787 int
zfs_get_xattrdir(znode_t * zp,vnode_t ** xvpp,cred_t * cr,int flags)788 zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
789 {
790 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
791 	znode_t		*xzp;
792 	vattr_t		va;
793 	int		error;
794 top:
795 	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
796 	if (error)
797 		return (error);
798 
799 	if (xzp != NULL) {
800 		*xvpp = ZTOV(xzp);
801 		return (0);
802 	}
803 
804 
805 	if (!(flags & CREATE_XATTR_DIR)) {
806 #ifdef illumos
807 		return (SET_ERROR(ENOENT));
808 #else
809 		return (SET_ERROR(ENOATTR));
810 #endif
811 	}
812 
813 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
814 		return (SET_ERROR(EROFS));
815 	}
816 
817 	/*
818 	 * The ability to 'create' files in an attribute
819 	 * directory comes from the write_xattr permission on the base file.
820 	 *
821 	 * The ability to 'search' an attribute directory requires
822 	 * read_xattr permission on the base file.
823 	 *
824 	 * Once in a directory the ability to read/write attributes
825 	 * is controlled by the permissions on the attribute file.
826 	 */
827 	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
828 	va.va_type = VDIR;
829 	va.va_mode = S_IFDIR | S_ISVTX | 0777;
830 	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
831 
832 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
833 
834 	if (error == ERESTART) {
835 		/* NB: we already did dmu_tx_wait() if necessary */
836 		goto top;
837 	}
838 	if (error == 0)
839 		VOP_UNLOCK(*xvpp, 0);
840 
841 	return (error);
842 }
843 
844 /*
845  * Decide whether it is okay to remove within a sticky directory.
846  *
847  * In sticky directories, write access is not sufficient;
848  * you can remove entries from a directory only if:
849  *
850  *	you own the directory,
851  *	you own the entry,
852  *	the entry is a plain file and you have write access,
853  *	or you are privileged (checked in secpolicy...).
854  *
855  * The function returns 0 if remove access is granted.
856  */
857 int
zfs_sticky_remove_access(znode_t * zdp,znode_t * zp,cred_t * cr)858 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
859 {
860 	uid_t  		uid;
861 	uid_t		downer;
862 	uid_t		fowner;
863 	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
864 
865 	if (zdp->z_zfsvfs->z_replay)
866 		return (0);
867 
868 	if ((zdp->z_mode & S_ISVTX) == 0)
869 		return (0);
870 
871 	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
872 	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
873 
874 	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
875 	    (ZTOV(zp)->v_type == VREG &&
876 	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
877 		return (0);
878 	else
879 		return (secpolicy_vnode_remove(ZTOV(zp), cr));
880 }
881