xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2014 Integros [integros.com]
25  */
26 
27 /* Portions Copyright 2007 Jeremy Teo */
28 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
29 
30 #ifdef _KERNEL
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/mntent.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/kmem.h>
44 #include <sys/errno.h>
45 #include <sys/unistd.h>
46 #include <sys/atomic.h>
47 #include <sys/zfs_dir.h>
48 #include <sys/zfs_acl.h>
49 #include <sys/zfs_ioctl.h>
50 #include <sys/zfs_rlock.h>
51 #include <sys/zfs_fuid.h>
52 #include <sys/dnode.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/kidmap.h>
55 
56 #ifdef __NetBSD__
57 #include <miscfs/specfs/specdev.h>
58 
59 extern int (**zfs_vnodeop_p)(void *);
60 extern int (**zfs_fifoop_p)(void *);
61 extern int (**zfs_specop_p)(void *);
62 
/*
 * Arguments that zfs_mknode() publishes through zfs_loadvnode_key (with
 * tsd_set()) around its vcache_get() call when it creates a non-root znode.
 */
struct zfs_loadvnode_args {
	dmu_buf_t		*db;		/* buffer held for the new object */
	int			blksz;		/* block size; zfs_mknode() passes 0 */
	dmu_object_type_t	obj_type;	/* DMU_OT_SA or DMU_OT_ZNODE */
	void			*sa_hdl;	/* SA handle already obtained for db */
};
69 
70 uint_t zfs_loadvnode_key;
71 
72 #endif
73 #endif /* _KERNEL */
74 
75 #include <sys/dmu.h>
76 #include <sys/dmu_objset.h>
77 #include <sys/refcount.h>
78 #include <sys/stat.h>
79 #include <sys/zap.h>
80 #include <sys/zfs_znode.h>
81 #include <sys/sa.h>
82 #include <sys/zfs_sa.h>
83 #include <sys/zfs_stat.h>
84 #include <sys/refcount.h>
85 
86 #include "zfs_prop.h"
87 #include "zfs_comutil.h"
88 
89 /* Used by fstat(1). */
90 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
91     SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
92 
93 /*
94  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
95  * turned on when DEBUG is also defined.
96  */
97 #ifdef	DEBUG
98 #define	ZNODE_STATS
99 #endif	/* DEBUG */
100 
101 #ifdef	ZNODE_STATS
102 #define	ZNODE_STAT_ADD(stat)			((stat)++)
103 #else
104 #define	ZNODE_STAT_ADD(stat)			/* nothing */
105 #endif	/* ZNODE_STATS */
106 
/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
112 #ifdef _KERNEL
113 /*
114  * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
115  * be freed before it can be safely accessed.
116  */
117 krwlock_t zfsvfs_lock;
118 
119 static kmem_cache_t *znode_cache = NULL;
120 
121 /*ARGSUSED*/
122 static void
123 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
124 {
125 	/*
126 	 * We should never drop all dbuf refs without first clearing
127 	 * the eviction callback.
128 	 */
129 	panic("evicting znode %p\n", user_ptr);
130 }
131 
132 extern struct vop_vector zfs_vnodeops;
133 extern struct vop_vector zfs_fifoops;
134 extern struct vop_vector zfs_shareops;
135 
136 static int
137 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
138 {
139 	znode_t *zp = buf;
140 
141 #ifdef __NetBSD__
142 	zp = arg;
143 #endif
144 	POINTER_INVALIDATE(&zp->z_zfsvfs);
145 
146 	list_link_init(&zp->z_link_node);
147 
148 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
149 
150 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
151 	avl_create(&zp->z_range_avl, zfs_range_compare,
152 	    sizeof (rl_t), offsetof(rl_t, r_node));
153 
154 	zp->z_acl_cached = NULL;
155 	zp->z_vnode = NULL;
156 	zp->z_moved = 0;
157 	return (0);
158 }
159 
160 /*ARGSUSED*/
161 static void
162 zfs_znode_cache_destructor(void *buf, void *arg)
163 {
164 	znode_t *zp = buf;
165 
166 #ifdef __NetBSD__
167 	zp = arg;
168 #endif
169 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
170 	ASSERT(ZTOV(zp) == NULL);
171 #ifndef __NetBSD__
172 	vn_free(ZTOV(zp));
173 #endif
174 	ASSERT(!list_link_active(&zp->z_link_node));
175 	mutex_destroy(&zp->z_acl_lock);
176 	avl_destroy(&zp->z_range_avl);
177 	mutex_destroy(&zp->z_range_lock);
178 
179 	ASSERT(zp->z_acl_cached == NULL);
180 }
181 
#ifdef	ZNODE_STATS
/*
 * Counters bumped with ZNODE_STAT_ADD() on each path where zfs_znode_move()
 * declines to move a znode.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs pointer was not valid */
	uint64_t zms_zfsvfs_recheck1;	/* z_zfsvfs changed before zfsvfs_lock */
	uint64_t zms_zfsvfs_unmounted;	/* file system was already unmounted */
	uint64_t zms_zfsvfs_recheck2;	/* z_zfsvfs changed before z_znodes_lock */
	uint64_t zms_obj_held;		/* object hold could not be taken */
	uint64_t zms_vnode_locked;	/* vnode v_lock could not be taken */
	uint64_t zms_not_only_dnlc;	/* vnode referenced by more than the DNLC */
} znode_move_stats;
#endif	/* ZNODE_STATS */
193 
194 #ifdef illumos
195 static void
196 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
197 {
198 	vnode_t *vp;
199 
200 	/* Copy fields. */
201 	nzp->z_zfsvfs = ozp->z_zfsvfs;
202 
203 	/* Swap vnodes. */
204 	vp = nzp->z_vnode;
205 	nzp->z_vnode = ozp->z_vnode;
206 	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
207 	ZTOV(ozp)->v_data = ozp;
208 	ZTOV(nzp)->v_data = nzp;
209 
210 	nzp->z_id = ozp->z_id;
211 	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
212 	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
213 	nzp->z_unlinked = ozp->z_unlinked;
214 	nzp->z_atime_dirty = ozp->z_atime_dirty;
215 	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
216 	nzp->z_blksz = ozp->z_blksz;
217 	nzp->z_seq = ozp->z_seq;
218 	nzp->z_mapcnt = ozp->z_mapcnt;
219 	nzp->z_gen = ozp->z_gen;
220 	nzp->z_sync_cnt = ozp->z_sync_cnt;
221 	nzp->z_is_sa = ozp->z_is_sa;
222 	nzp->z_sa_hdl = ozp->z_sa_hdl;
223 	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
224 	nzp->z_links = ozp->z_links;
225 	nzp->z_size = ozp->z_size;
226 	nzp->z_pflags = ozp->z_pflags;
227 	nzp->z_uid = ozp->z_uid;
228 	nzp->z_gid = ozp->z_gid;
229 	nzp->z_mode = ozp->z_mode;
230 
231 	/*
232 	 * Since this is just an idle znode and kmem is already dealing with
233 	 * memory pressure, release any cached ACL.
234 	 */
235 	if (ozp->z_acl_cached) {
236 		zfs_acl_free(ozp->z_acl_cached);
237 		ozp->z_acl_cached = NULL;
238 	}
239 
240 	sa_set_userp(nzp->z_sa_hdl, nzp);
241 
242 	/*
243 	 * Invalidate the original znode by clearing fields that provide a
244 	 * pointer back to the znode. Set the low bit of the vfs pointer to
245 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
246 	 * subsequent callback.
247 	 */
248 	ozp->z_sa_hdl = NULL;
249 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
250 
251 	/*
252 	 * Mark the znode.
253 	 */
254 	nzp->z_moved = 1;
255 	ozp->z_moved = (uint8_t)-1;
256 }
257 
/*
 * kmem move callback for znode_cache objects: try to relocate the znode at
 * 'buf' into 'newbuf'.  'size' and 'arg' are not used.
 *
 * Returns:
 *	KMEM_CBRC_DONT_KNOW	- z_zfsvfs was invalid, changed while the locks
 *				  were being acquired, or the file system is
 *				  unmounted
 *	KMEM_CBRC_LATER		- the object hold or the vnode lock could not
 *				  be taken, or the vnode is referenced by more
 *				  than the DNLC
 *	KMEM_CBRC_YES		- the znode was moved into 'newbuf'
 *
 * Every exit path releases exactly the locks taken up to that point.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	/* Only try-lock the vnode; on contention ask kmem to retry later. */
	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Swap the new znode into the old one's slot on the znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
365 #endif /* illumos */
366 
367 void
368 zfs_znode_init(void)
369 {
370 	/*
371 	 * Initialize zcache
372 	 */
373 	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
374 	ASSERT(znode_cache == NULL);
375 	znode_cache = kmem_cache_create("zfs_znode_cache",
376 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
377 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
378 	kmem_cache_set_move(znode_cache, zfs_znode_move);
379 }
380 
381 void
382 zfs_znode_fini(void)
383 {
384 #ifdef illumos
385 	/*
386 	 * Cleanup vfs & vnode ops
387 	 */
388 	zfs_remove_op_tables();
389 #endif
390 
391 	/*
392 	 * Cleanup zcache
393 	 */
394 	if (znode_cache)
395 		kmem_cache_destroy(znode_cache);
396 	znode_cache = NULL;
397 	rw_destroy(&zfsvfs_lock);
398 }
399 
400 #ifdef illumos
401 struct vnodeops *zfs_dvnodeops;
402 struct vnodeops *zfs_fvnodeops;
403 struct vnodeops *zfs_symvnodeops;
404 struct vnodeops *zfs_xdvnodeops;
405 struct vnodeops *zfs_evnodeops;
406 struct vnodeops *zfs_sharevnodeops;
407 
408 void
409 zfs_remove_op_tables()
410 {
411 	/*
412 	 * Remove vfs ops
413 	 */
414 	ASSERT(zfsfstype);
415 	(void) vfs_freevfsops_by_type(zfsfstype);
416 	zfsfstype = 0;
417 
418 	/*
419 	 * Remove vnode ops
420 	 */
421 	if (zfs_dvnodeops)
422 		vn_freevnodeops(zfs_dvnodeops);
423 	if (zfs_fvnodeops)
424 		vn_freevnodeops(zfs_fvnodeops);
425 	if (zfs_symvnodeops)
426 		vn_freevnodeops(zfs_symvnodeops);
427 	if (zfs_xdvnodeops)
428 		vn_freevnodeops(zfs_xdvnodeops);
429 	if (zfs_evnodeops)
430 		vn_freevnodeops(zfs_evnodeops);
431 	if (zfs_sharevnodeops)
432 		vn_freevnodeops(zfs_sharevnodeops);
433 
434 	zfs_dvnodeops = NULL;
435 	zfs_fvnodeops = NULL;
436 	zfs_symvnodeops = NULL;
437 	zfs_xdvnodeops = NULL;
438 	zfs_evnodeops = NULL;
439 	zfs_sharevnodeops = NULL;
440 }
441 
442 extern const fs_operation_def_t zfs_dvnodeops_template[];
443 extern const fs_operation_def_t zfs_fvnodeops_template[];
444 extern const fs_operation_def_t zfs_xdvnodeops_template[];
445 extern const fs_operation_def_t zfs_symvnodeops_template[];
446 extern const fs_operation_def_t zfs_evnodeops_template[];
447 extern const fs_operation_def_t zfs_sharevnodeops_template[];
448 
449 int
450 zfs_create_op_tables()
451 {
452 	int error;
453 
454 	/*
455 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
456 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
457 	 * In this case we just return as the ops vectors are already set up.
458 	 */
459 	if (zfs_dvnodeops)
460 		return (0);
461 
462 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
463 	    &zfs_dvnodeops);
464 	if (error)
465 		return (error);
466 
467 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
468 	    &zfs_fvnodeops);
469 	if (error)
470 		return (error);
471 
472 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
473 	    &zfs_symvnodeops);
474 	if (error)
475 		return (error);
476 
477 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
478 	    &zfs_xdvnodeops);
479 	if (error)
480 		return (error);
481 
482 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
483 	    &zfs_evnodeops);
484 	if (error)
485 		return (error);
486 
487 	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
488 	    &zfs_sharevnodeops);
489 
490 	return (error);
491 }
492 #endif	/* illumos */
493 
494 int
495 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
496 {
497 	zfs_acl_ids_t acl_ids;
498 	vattr_t vattr;
499 	znode_t *sharezp;
500 	znode_t *zp;
501 	int error;
502 
503 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
504 	vattr.va_type = VDIR;
505 	vattr.va_mode = S_IFDIR|0555;
506 	vattr.va_uid = crgetuid(kcred);
507 	vattr.va_gid = crgetgid(kcred);
508 
509 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
510 	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
511 	sharezp->z_moved = 0;
512 	sharezp->z_unlinked = 0;
513 	sharezp->z_atime_dirty = 0;
514 	sharezp->z_zfsvfs = zfsvfs;
515 	sharezp->z_is_sa = zfsvfs->z_use_sa;
516 
517 	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
518 	    kcred, NULL, &acl_ids));
519 	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
520 	ASSERT3P(zp, ==, sharezp);
521 	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
522 	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
523 	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
524 	zfsvfs->z_shares_dir = sharezp->z_id;
525 
526 	zfs_acl_ids_free(&acl_ids);
527 	sa_handle_destroy(sharezp->z_sa_hdl);
528 	kmem_cache_free(znode_cache, sharezp);
529 
530 	return (error);
531 }
532 
/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want.  The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 *
 * The result holds major(dev) in the upper NBITSMINOR64 bits and minor(dev)
 * in the lower NBITSMINOR64 bits.  Both halves are masked to 32 bits before
 * they are combined: where major()/minor() yield a signed type, a value with
 * its top bit set would otherwise be sign-extended by the widening and the
 * minor would overwrite the major half.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	uint64_t maj = (uint64_t)major(dev) & MAXMAJ64;
	uint64_t min = (uint64_t)minor(dev) & MAXMIN64;

	return ((maj << NBITSMINOR64) | min);
}
560 /*
561  * Special cmpldev for ZFS private use.
562  * Can't use standard cmpldev since it takes
563  * a long dev_t and compresses it to dev32_t in
564  * LP64.  We need to do a compaction of a long dev_t
565  * to a dev32_t in ILP32.
566  */
567 dev_t
568 zfs_cmpldev(uint64_t dev)
569 {
570 	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
571 }
572 
573 static void
574 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
575     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
576 {
577 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
578 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
579 
580 	ASSERT(zp->z_sa_hdl == NULL);
581 	ASSERT(zp->z_acl_cached == NULL);
582 	if (sa_hdl == NULL) {
583 		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
584 		    SA_HDL_SHARED, &zp->z_sa_hdl));
585 	} else {
586 		zp->z_sa_hdl = sa_hdl;
587 		sa_set_userp(sa_hdl, zp);
588 	}
589 
590 	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
591 
592 	/*
593 	 * Slap on VROOT if we are the root znode unless we are the root
594 	 * node of a snapshot mounted under .zfs.
595 	 */
596 	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
597 		ZTOV(zp)->v_flag |= VROOT;
598 
599 	vn_exists(ZTOV(zp));
600 }
601 
602 void
603 zfs_znode_dmu_fini(znode_t *zp)
604 {
605 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
606 	    zp->z_unlinked ||
607 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
608 
609 	sa_handle_destroy(zp->z_sa_hdl);
610 	zp->z_sa_hdl = NULL;
611 }
612 
613 #ifdef __FreeBSD__
614 static void
615 zfs_vnode_forget(vnode_t *vp)
616 {
617 
618 	/* copied from insmntque_stddtr */
619 	vp->v_data = NULL;
620 	vp->v_op = &dead_vnodeops;
621 	vgone(vp);
622 	vput(vp);
623 }
624 
/*
 * Construct a new znode/vnode pair and initialize it.
 *
 * This does not do a call to dmu_set_user(); that is
 * up to the caller to do, in case you don't want to
 * return the znode.
 *
 *	IN:	zfsvfs	 - file system the znode belongs to
 *		db	 - buffer of the object; db->db_object becomes z_id
 *		blksz	 - value stored in z_blksz
 *		obj_type - object type, forwarded to zfs_znode_sa_init()
 *		hdl	 - existing SA handle to adopt, or NULL to have one
 *			   obtained from db
 *
 *	RETURN:	the new znode with its vnode locked exclusively and the znode
 *		linked on zfsvfs->z_all_znodes, or NULL if getnewvnode()
 *		fails, the attribute lookup fails, or the generation read
 *		back is 0.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	vnode_t *vp;
	uint64_t mode;
	uint64_t parent;
	sa_bulk_attr_t bulk[9];
	int count = 0;
	int error;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	KASSERT(curthread->td_vp_reserv > 0,
	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
	if (error != 0) {
		kmem_cache_free(znode_cache, zp);
		return (NULL);
	}
	/* Cross-link the znode and its vnode. */
	zp->z_vnode = vp;
	vp->v_data = zp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	zp->z_moved = 0;

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_sa_hdl = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/* Read the cached attributes in one bulk lookup (9 entries). */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &zp->z_atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &zp->z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, 8);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
		/*
		 * Only destroy an SA handle that was created here; a handle
		 * passed in by the caller is left alone.
		 */
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zfs_vnode_forget(vp);
		zp->z_vnode = NULL;
		kmem_cache_free(znode_cache, zp);
		return (NULL);
	}

	zp->z_mode = mode;

	vp->v_type = IFTOVT((mode_t)mode);

	/* Per-type setup: prefetch default and vnode operations vector. */
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
#ifdef illumos
	case VBLK:
	case VCHR:
		{
			uint64_t rdev;
			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
			    &rdev, sizeof (rdev)) == 0);

			vp->v_rdev = zfs_cmpldev(rdev);
		}
		break;
#endif
	case VFIFO:
#ifdef illumos
	case VSOCK:
	case VDOOR:
#endif
		vp->v_op = &zfs_fifoops;
		break;
	case VREG:
		/* Regular files directly under the shares directory. */
		if (parent == zfsvfs->z_shares_dir) {
			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
			vp->v_op = &zfs_shareops;
		}
		break;
#ifdef illumos
	case VLNK:
		vn_setops(vp, zfs_symvnodeops);
		break;
	default:
		vn_setops(vp, zfs_evnodeops);
		break;
#endif
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * Acquire vnode lock before making it available to the world.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VN_LOCK_AREC(vp);
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

#ifdef illumos
	VFS_HOLD(zfsvfs->z_vfs);
#endif
	return (zp);
}
767 #endif /* __FreeBSD__ */
768 
/*
 * File-scope statics (and therefore zero-initialized) that zfs_mknode()
 * passes as the values of SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL when
 * it lays out a DMU_OT_ZNODE object.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- initial ACL, mode and owner fuid/fgid for the
 *			  new object
 *
 *	OUT:	zpp	- allocated znode
 *
 * When zfsvfs->z_replay is set, the object number, timestamp and generation
 * come from vap (va_nodeid, va_ctime, va_nblocks) and the object is claimed
 * rather than allocated.  Nothing is returned to the caller on failure;
 * failures trip the VERIFY/ASSERT checks below instead.
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	dzp_pflags = 0;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	dmu_buf_t	*db;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;
	int		bonuslen;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	sa_attrs[ZPL_END];
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		vfs_timestamp(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/* The bonus buffer layout depends on whether SA is in use. */
	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, tx));
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, tx));
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, tx);
		}
	}

	/* Held until the matching ZFS_OBJ_HOLD_EXIT() at the end. */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	} else {
		dzp_pflags = dzp->z_pflags;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (vap->va_type == VDIR) {
		size = 2;		/* contents ("." and "..") */
		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	} else {
		size = links = 0;
	}

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		rdev = zfs_expldev(vap->va_rdev);
	}

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	/* atime/mtime come from vap only when the caller asked for them. */
	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format.  Don't change this ordering
	 */

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	}
	/* rdev is always present in the old layout, else only for devices. */
	if (obj_type == DMU_OT_ZNODE ||
	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);

	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
#ifdef __NetBSD__
		vnode_t *vp;
		struct zfs_loadvnode_args args = { db, 0, obj_type, sa_hdl };

		/*
		 * Publish the arguments under zfs_loadvnode_key for the
		 * duration of the vcache_get() call only.
		 */
		tsd_set(zfs_loadvnode_key, &args);
		err = vcache_get(zfsvfs->z_vfs, &obj, sizeof(obj), &vp);
		tsd_set(zfs_loadvnode_key, NULL);

		/*
		 * NOTE(review): err is checked only by this assertion; vp is
		 * used unconditionally below.
		 */
		ASSERT3U(err, ==, 0);
		*zpp = VTOZ(vp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#else
		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
#endif
		ASSERT(*zpp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	/* Cache the final flags and mode in the in-core znode. */
	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = mode;

	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);

	/*
	 * These are the cases in which no DACL attributes were added to the
	 * template above, so the ACL is set separately here.
	 */
	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
#ifndef __NetBSD__
	if (!(flag & IS_ROOT_NODE)) {
		vnode_t *vp;

		/* VV_FORCEINSMQ is set only around the insmntque() call. */
		vp = ZTOV(*zpp);
		vp->v_vflag |= VV_FORCEINSMQ;
		err = insmntque(vp, zfsvfs->z_vfs);
		vp->v_vflag &= ~VV_FORCEINSMQ;
		KASSERT(err == 0, ("insmntque() failed: error %d", err));
	}
#endif
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
1057 
1058 /*
1059  * Update in-core attributes.  It is assumed the caller will be doing an
1060  * sa_bulk_update to push the changes out.
1061  */
1062 void
1063 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1064 {
1065 	xoptattr_t *xoap;
1066 
1067 	xoap = xva_getxoptattr(xvap);
1068 	ASSERT(xoap);
1069 
1070 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1071 		uint64_t times[2];
1072 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1073 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
1074 		    &times, sizeof (times), tx);
1075 		XVA_SET_RTN(xvap, XAT_CREATETIME);
1076 	}
1077 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1078 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1079 		    zp->z_pflags, tx);
1080 		XVA_SET_RTN(xvap, XAT_READONLY);
1081 	}
1082 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1083 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1084 		    zp->z_pflags, tx);
1085 		XVA_SET_RTN(xvap, XAT_HIDDEN);
1086 	}
1087 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1088 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1089 		    zp->z_pflags, tx);
1090 		XVA_SET_RTN(xvap, XAT_SYSTEM);
1091 	}
1092 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1093 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1094 		    zp->z_pflags, tx);
1095 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1096 	}
1097 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1098 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1099 		    zp->z_pflags, tx);
1100 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1101 	}
1102 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1103 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1104 		    zp->z_pflags, tx);
1105 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1106 	}
1107 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1108 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1109 		    zp->z_pflags, tx);
1110 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1111 	}
1112 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1113 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1114 		    zp->z_pflags, tx);
1115 		XVA_SET_RTN(xvap, XAT_NODUMP);
1116 	}
1117 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1118 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1119 		    zp->z_pflags, tx);
1120 		XVA_SET_RTN(xvap, XAT_OPAQUE);
1121 	}
1122 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1123 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1124 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1125 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1126 	}
1127 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1128 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1129 		    zp->z_pflags, tx);
1130 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1131 	}
1132 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1133 		zfs_sa_set_scanstamp(zp, xvap, tx);
1134 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1135 	}
1136 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1137 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1138 		    zp->z_pflags, tx);
1139 		XVA_SET_RTN(xvap, XAT_REPARSE);
1140 	}
1141 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1142 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1143 		    zp->z_pflags, tx);
1144 		XVA_SET_RTN(xvap, XAT_OFFLINE);
1145 	}
1146 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1147 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1148 		    zp->z_pflags, tx);
1149 		XVA_SET_RTN(xvap, XAT_SPARSE);
1150 	}
1151 }
1152 
1153 #ifdef __NetBSD__
1154 
1155 static inline int
1156 zfs_do_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp,
1157     int (*get)(struct mount *, const void *, size_t, struct vnode **))
1158 {
1159 	struct vnode *vp;
1160 	int err;
1161 
1162 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1163 
1164 	err = (*get)(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
1165 	if (err)
1166 		*zpp = NULL;
1167 	else
1168 		*zpp = VTOZ(vp);
1169 
1170 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1171 
1172 	return (err);
1173 }
1174 
1175 int
1176 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1177 {
1178 
1179 	return zfs_do_zget(zfsvfs, obj_num, zpp, vcache_get);
1180 }
1181 
1182 int
1183 zfs_zget_cleaner(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1184 {
1185 	dmu_buf_t *db;
1186 	sa_handle_t *hdl;
1187 	dmu_object_info_t doi;
1188 	znode_t *zp;
1189 	int err;
1190 
1191 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1192 	if (err) {
1193 		return (SET_ERROR(err));
1194 	}
1195 
1196 	dmu_object_info_from_db(db, &doi);
1197 	if (doi.doi_bonus_type != DMU_OT_SA &&
1198 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1199 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1200 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1201 		sa_buf_rele(db, NULL);
1202 		return (SET_ERROR(EINVAL));
1203 	}
1204 	hdl = dmu_buf_get_user(db);
1205 	KASSERT(hdl != NULL);
1206 	zp = sa_get_userdata(hdl);
1207 	*zpp = zp;
1208 	return (0);
1209 }
1210 
1211 /*
1212  * Callback from vcache to set up the znode.
1213  * This is largely copied from zfs_znode_alloc().
1214  */
1215 
1216 int
1217 zfs_loadvnode(struct mount *mp, struct vnode *vp,
1218     const void *key, size_t key_len, const void **new_key)
1219 {
1220 	znode_t *zp;
1221 	uint64_t mode;
1222 	uint64_t parent;
1223 	sa_bulk_attr_t bulk[9];
1224 	int count = 0;
1225 	int err;
1226 
1227 	uint64_t obj_num, rdev;
1228 	zfsvfs_t *zfsvfs;
1229 	dmu_object_info_t doi;
1230 
1231 	/* args to zfs_zvnode_alloc() */
1232 	struct zfs_loadvnode_args *args;
1233 	dmu_buf_t *db;
1234 	int blksz;
1235 	dmu_object_type_t obj_type;
1236 	sa_handle_t *hdl;
1237 
1238 	KASSERT(key_len == sizeof(obj_num));
1239 	memcpy(&obj_num, key, key_len);
1240 
1241 	zfsvfs = mp->mnt_data;
1242 
1243 	args = tsd_get(zfs_loadvnode_key);
1244 	if (args) {
1245 		db = args->db;
1246 		blksz = args->blksz;
1247 		obj_type = args->obj_type;
1248 		hdl = args->sa_hdl;
1249 		goto skip_lookup;
1250 	}
1251 
1252 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1253 	if (err) {
1254 		return (SET_ERROR(err));
1255 	}
1256 
1257 	dmu_object_info_from_db(db, &doi);
1258 	if (doi.doi_bonus_type != DMU_OT_SA &&
1259 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1260 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1261 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1262 		sa_buf_rele(db, NULL);
1263 		return (SET_ERROR(EINVAL));
1264 	}
1265 	blksz = doi.doi_data_block_size;
1266 	obj_type = doi.doi_bonus_type;
1267 	hdl = dmu_buf_get_user(db);
1268 
1269 	if (hdl != NULL) {
1270 		zp = sa_get_userdata(hdl);
1271 
1272 		/*
1273 		 * Since "SA" does immediate eviction we
1274 		 * should never find a sa handle that doesn't
1275 		 * know about the znode.
1276 		 */
1277 		ASSERT3P(zp, !=, NULL);
1278 		ASSERT3U(zp->z_id, ==, obj_num);
1279 
1280 		sa_buf_rele(db, NULL);
1281 		VFS_HOLD(zfsvfs->z_vfs);
1282 		*new_key = &zp->z_id;
1283 		return (0);
1284 	}
1285 
1286 skip_lookup:
1287 	vp->v_op = zfs_vnodeop_p;
1288 	vp->v_tag = VT_ZFS;
1289 
1290 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1291 	zp->z_vnode = vp;
1292 	vp->v_data = zp;
1293 
1294 	extern const struct genfs_ops zfs_genfsops;
1295 	genfs_node_init(vp, &zfs_genfsops);
1296 
1297 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
1298 	zp->z_moved = 0;
1299 
1300 	/*
1301 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
1302 	 * the zfs_znode_move() callback.
1303 	 */
1304 	zp->z_sa_hdl = NULL;
1305 	zp->z_unlinked = 0;
1306 	zp->z_atime_dirty = 0;
1307 	zp->z_mapcnt = 0;
1308 	zp->z_id = db->db_object;
1309 	zp->z_blksz = blksz;
1310 	zp->z_seq = 0x7A4653;
1311 	zp->z_sync_cnt = 0;
1312 
1313 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
1314 
1315 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
1316 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
1317 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1318 	    &zp->z_size, 8);
1319 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1320 	    &zp->z_links, 8);
1321 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1322 	    &zp->z_pflags, 8);
1323 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
1324 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1325 	    &zp->z_atime, 16);
1326 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1327 	    &zp->z_uid, 8);
1328 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1329 	    &zp->z_gid, 8);
1330 
1331 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
1332 		if (hdl == NULL)
1333 			sa_handle_destroy(zp->z_sa_hdl);
1334 		zp->z_vnode = NULL;
1335 		kmem_cache_free(znode_cache, zp);
1336 		sa_buf_rele(db, NULL);
1337 		return (SET_ERROR(ENOENT));
1338 	}
1339 
1340 	zp->z_mode = mode;
1341 
1342 	vp->v_type = IFTOVT((mode_t)zp->z_mode);
1343 
1344 	switch (vp->v_type) {
1345 	case VDIR:
1346 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
1347 		break;
1348 	case VBLK:
1349 	case VCHR:
1350 	/* XXX NetBSD	vp->v_op = zfs_specop_p; */
1351 		(void) sa_lookup(hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
1352 		    sizeof (rdev));
1353 		spec_node_init(vp, zfs_cmpldev(rdev));
1354 		break;
1355 	case VFIFO:
1356 		/* XXX NetBSD vp->v_op = zfs_fifoop_p; */
1357 		break;
1358 	}
1359 
1360 	uvm_vnp_setsize(vp, zp->z_size);
1361 	dprintf("zfs_loadvnode znode %p -- vnode %p\n", zp, vp);
1362 	dprintf("zfs_loadvnode z_id %ld\n", zp->z_id);
1363 
1364 	mutex_enter(&zfsvfs->z_znodes_lock);
1365 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
1366 	membar_producer();
1367 
1368 	/*
1369 	 * Everything else must be valid before assigning z_zfsvfs makes the
1370 	 * znode eligible for zfs_znode_move().
1371 	 */
1372 	zp->z_zfsvfs = zfsvfs;
1373 	mutex_exit(&zfsvfs->z_znodes_lock);
1374 
1375 	VFS_HOLD(zfsvfs->z_vfs);
1376 	*new_key = &zp->z_id;
1377 	return (0);
1378 }
1379 
1380 #else /* __NetBSD__ */
1381 
1382 int
1383 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1384 {
1385 	dmu_object_info_t doi;
1386 	dmu_buf_t	*db;
1387 	znode_t		*zp;
1388 	vnode_t		*vp;
1389 	sa_handle_t	*hdl;
1390 	struct thread	*td;
1391 	int locked;
1392 	int err;
1393 
1394 	td = curthread;
1395 	getnewvnode_reserve(1);
1396 again:
1397 	*zpp = NULL;
1398 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1399 
1400 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1401 	if (err) {
1402 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1403 		getnewvnode_drop_reserve();
1404 		return (err);
1405 	}
1406 
1407 	dmu_object_info_from_db(db, &doi);
1408 	if (doi.doi_bonus_type != DMU_OT_SA &&
1409 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1410 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1411 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1412 		sa_buf_rele(db, NULL);
1413 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1414 #ifdef __FreeBSD__
1415 		getnewvnode_drop_reserve();
1416 #endif
1417 		return (SET_ERROR(EINVAL));
1418 	}
1419 
1420 	hdl = dmu_buf_get_user(db);
1421 	if (hdl != NULL) {
1422 		zp  = sa_get_userdata(hdl);
1423 
1424 		/*
1425 		 * Since "SA" does immediate eviction we
1426 		 * should never find a sa handle that doesn't
1427 		 * know about the znode.
1428 		 */
1429 		ASSERT3P(zp, !=, NULL);
1430 		ASSERT3U(zp->z_id, ==, obj_num);
1431 		*zpp = zp;
1432 		vp = ZTOV(zp);
1433 
1434 		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
1435 		VN_HOLD(vp);
1436 
1437 		sa_buf_rele(db, NULL);
1438 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1439 
1440 		locked = VOP_ISLOCKED(vp);
1441 		VI_LOCK(vp);
1442 		if ((vp->v_iflag & VI_DOOMED) != 0 &&
1443 		    locked != LK_EXCLUSIVE) {
1444 			/*
1445 			 * The vnode is doomed and this thread doesn't
1446 			 * hold the exclusive lock on it, so the vnode
1447 			 * must be being reclaimed by another thread.
1448 			 * Otherwise the doomed vnode is being reclaimed
1449 			 * by this thread and zfs_zget is called from
1450 			 * ZIL internals.
1451 			 */
1452 			VI_UNLOCK(vp);
1453 
1454 			/*
1455 			 * XXX vrele() locks the vnode when the last reference
1456 			 * is dropped.  Although in this case the vnode is
1457 			 * doomed / dead and so no inactivation is required,
1458 			 * the vnode lock is still acquired.  That could result
1459 			 * in a LOR with z_teardown_lock if another thread holds
1460 			 * the vnode's lock and tries to take z_teardown_lock.
1461 			 * But that is only possible if the other thread peforms
1462 			 * a ZFS vnode operation on the vnode.  That either
1463 			 * should not happen if the vnode is dead or the thread
1464 			 * should also have a refrence to the vnode and thus
1465 			 * our reference is not last.
1466 			 */
1467 			VN_RELE(vp);
1468 			goto again;
1469 		}
1470 		VI_UNLOCK(vp);
1471 		getnewvnode_drop_reserve();
1472 		return (0);
1473 	}
1474 
1475 	/*
1476 	 * Not found create new znode/vnode
1477 	 * but only if file exists.
1478 	 *
1479 	 * There is a small window where zfs_vget() could
1480 	 * find this object while a file create is still in
1481 	 * progress.  This is checked for in zfs_znode_alloc()
1482 	 *
1483 	 * if zfs_znode_alloc() fails it will drop the hold on the
1484 	 * bonus buffer.
1485 	 */
1486 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1487 	    doi.doi_bonus_type, NULL);
1488 	if (zp == NULL) {
1489 		err = SET_ERROR(ENOENT);
1490 	} else {
1491 		*zpp = zp;
1492 	}
1493 	if (err == 0) {
1494 		vnode_t *vp = ZTOV(zp);
1495 
1496 		err = insmntque(vp, zfsvfs->z_vfs);
1497 		if (err == 0) {
1498 			vp->v_hash = obj_num;
1499 			VOP_UNLOCK(vp, 0);
1500 		} else {
1501 			zp->z_vnode = NULL;
1502 			zfs_znode_dmu_fini(zp);
1503 			zfs_znode_free(zp);
1504 			*zpp = NULL;
1505 		}
1506 	}
1507 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1508 	getnewvnode_drop_reserve();
1509 	return (err);
1510 }
1511 
1512 #endif /* __NetBSD__ */
1513 
1514 int
1515 zfs_rezget(znode_t *zp)
1516 {
1517 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1518 	dmu_object_info_t doi;
1519 	dmu_buf_t *db;
1520 	vnode_t *vp;
1521 	uint64_t obj_num = zp->z_id;
1522 	uint64_t mode, size;
1523 	sa_bulk_attr_t bulk[8];
1524 	int err;
1525 	int count = 0;
1526 	uint64_t gen;
1527 
1528 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1529 
1530 	mutex_enter(&zp->z_acl_lock);
1531 	if (zp->z_acl_cached) {
1532 		zfs_acl_free(zp->z_acl_cached);
1533 		zp->z_acl_cached = NULL;
1534 	}
1535 
1536 	mutex_exit(&zp->z_acl_lock);
1537 	ASSERT(zp->z_sa_hdl == NULL);
1538 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1539 	if (err) {
1540 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1541 		return (err);
1542 	}
1543 
1544 	dmu_object_info_from_db(db, &doi);
1545 	if (doi.doi_bonus_type != DMU_OT_SA &&
1546 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1547 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1548 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1549 		sa_buf_rele(db, NULL);
1550 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1551 		return (SET_ERROR(EINVAL));
1552 	}
1553 
1554 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1555 	size = zp->z_size;
1556 
1557 	/* reload cached values */
1558 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1559 	    &gen, sizeof (gen));
1560 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1561 	    &zp->z_size, sizeof (zp->z_size));
1562 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1563 	    &zp->z_links, sizeof (zp->z_links));
1564 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1565 	    &zp->z_pflags, sizeof (zp->z_pflags));
1566 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1567 	    &zp->z_atime, sizeof (zp->z_atime));
1568 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1569 	    &zp->z_uid, sizeof (zp->z_uid));
1570 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1571 	    &zp->z_gid, sizeof (zp->z_gid));
1572 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1573 	    &mode, sizeof (mode));
1574 
1575 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1576 		zfs_znode_dmu_fini(zp);
1577 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1578 		return (SET_ERROR(EIO));
1579 	}
1580 
1581 	zp->z_mode = mode;
1582 
1583 	if (gen != zp->z_gen) {
1584 		zfs_znode_dmu_fini(zp);
1585 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1586 		return (SET_ERROR(EIO));
1587 	}
1588 
1589 	/*
1590 	 * It is highly improbable but still quite possible that two
1591 	 * objects in different datasets are created with the same
1592 	 * object numbers and in transaction groups with the same
1593 	 * numbers.  znodes corresponding to those objects would
1594 	 * have the same z_id and z_gen, but their other attributes
1595 	 * may be different.
1596 	 * zfs recv -F may replace one of such objects with the other.
1597 	 * As a result file properties recorded in the replaced
1598 	 * object's vnode may no longer match the received object's
1599 	 * properties.  At present the only cached property is the
1600 	 * files type recorded in v_type.
1601 	 * So, handle this case by leaving the old vnode and znode
1602 	 * disassociated from the actual object.  A new vnode and a
1603 	 * znode will be created if the object is accessed
1604 	 * (e.g. via a look-up).  The old vnode and znode will be
1605 	 * recycled when the last vnode reference is dropped.
1606 	 */
1607 	vp = ZTOV(zp);
1608 	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
1609 		zfs_znode_dmu_fini(zp);
1610 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1611 		return (EIO);
1612 	}
1613 
1614 	zp->z_unlinked = (zp->z_links == 0);
1615 	zp->z_blksz = doi.doi_data_block_size;
1616 	vn_pages_remove(vp, 0, 0);
1617 	if (zp->z_size != size)
1618 		vnode_pager_setsize(vp, zp->z_size);
1619 
1620 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1621 
1622 	return (0);
1623 }
1624 
1625 void
1626 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1627 {
1628 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1629 	objset_t *os = zfsvfs->z_os;
1630 	uint64_t obj = zp->z_id;
1631 	uint64_t acl_obj = zfs_external_acl(zp);
1632 
1633 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1634 	if (acl_obj) {
1635 		VERIFY(!zp->z_is_sa);
1636 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1637 	}
1638 	VERIFY(0 == dmu_object_free(os, obj, tx));
1639 	zfs_znode_dmu_fini(zp);
1640 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1641 	zfs_znode_free(zp);
1642 }
1643 
1644 void
1645 zfs_zinactive(znode_t *zp)
1646 {
1647 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1648 	uint64_t z_id = zp->z_id;
1649 
1650 	ASSERT(zp->z_sa_hdl);
1651 
1652 	/*
1653 	 * Don't allow a zfs_zget() while were trying to release this znode
1654 	 */
1655 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1656 
1657 	/*
1658 	 * If this was the last reference to a file with no links,
1659 	 * remove the file from the file system.
1660 	 */
1661 	if (zp->z_unlinked) {
1662 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1663 		zfs_rmnode(zp);
1664 		return;
1665 	}
1666 
1667 	zfs_znode_dmu_fini(zp);
1668 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1669 	zfs_znode_free(zp);
1670 }
1671 
1672 void
1673 zfs_znode_free(znode_t *zp)
1674 {
1675 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1676 
1677 #ifdef __NetBSD__
1678 	struct vnode *vp = ZTOV(zp);
1679 
1680 	genfs_node_destroy(vp);
1681 
1682 	/*
1683 	 * Interlock with zfs_sync().
1684 	 */
1685 	mutex_enter(vp->v_interlock);
1686 	vp->v_data = NULL;
1687 	mutex_exit(vp->v_interlock);
1688 #endif
1689 
1690 	ASSERT(zp->z_sa_hdl == NULL);
1691 	zp->z_vnode = NULL;
1692 	mutex_enter(&zfsvfs->z_znodes_lock);
1693 	POINTER_INVALIDATE(&zp->z_zfsvfs);
1694 	list_remove(&zfsvfs->z_all_znodes, zp);
1695 	mutex_exit(&zfsvfs->z_znodes_lock);
1696 
1697 	if (zp->z_acl_cached) {
1698 		zfs_acl_free(zp->z_acl_cached);
1699 		zp->z_acl_cached = NULL;
1700 	}
1701 
1702 	kmem_cache_free(znode_cache, zp);
1703 
1704 #ifdef illumos
1705 	VFS_RELE(zfsvfs->z_vfs);
1706 #endif
1707 }
1708 
1709 void
1710 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1711     uint64_t ctime[2], boolean_t have_tx)
1712 {
1713 	timestruc_t	now;
1714 
1715 	vfs_timestamp(&now);
1716 
1717 	if (have_tx) {	/* will sa_bulk_update happen really soon? */
1718 		zp->z_atime_dirty = 0;
1719 		zp->z_seq++;
1720 	} else {
1721 		zp->z_atime_dirty = 1;
1722 	}
1723 
1724 	if (flag & AT_ATIME) {
1725 		ZFS_TIME_ENCODE(&now, zp->z_atime);
1726 	}
1727 
1728 	if (flag & AT_MTIME) {
1729 		ZFS_TIME_ENCODE(&now, mtime);
1730 		if (zp->z_zfsvfs->z_use_fuids) {
1731 			zp->z_pflags |= (ZFS_ARCHIVE |
1732 			    ZFS_AV_MODIFIED);
1733 		}
1734 	}
1735 
1736 	if (flag & AT_CTIME) {
1737 		ZFS_TIME_ENCODE(&now, ctime);
1738 		if (zp->z_zfsvfs->z_use_fuids)
1739 			zp->z_pflags |= ZFS_ARCHIVE;
1740 	}
1741 }
1742 
1743 /*
1744  * Grow the block size for a file.
1745  *
1746  *	IN:	zp	- znode of file to free data in.
1747  *		size	- requested block size
1748  *		tx	- open transaction.
1749  *
1750  * NOTE: this function assumes that the znode is write locked.
1751  */
1752 void
1753 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1754 {
1755 	int		error;
1756 	u_longlong_t	dummy;
1757 
1758 	if (size <= zp->z_blksz)
1759 		return;
1760 	/*
1761 	 * If the file size is already greater than the current blocksize,
1762 	 * we will not grow.  If there is more than one block in a file,
1763 	 * the blocksize cannot change.
1764 	 */
1765 	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1766 		return;
1767 
1768 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1769 	    size, 0, tx);
1770 
1771 	if (error == ENOTSUP)
1772 		return;
1773 	ASSERT0(error);
1774 
1775 	/* What blocksize did we actually get? */
1776 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1777 }
1778 
1779 #ifdef illumos
1780 /*
1781  * This is a dummy interface used when pvn_vplist_dirty() should *not*
1782  * be calling back into the fs for a putpage().  E.g.: when truncating
1783  * a file, the pages being "thrown away* don't need to be written out.
1784  */
1785 /* ARGSUSED */
1786 static int
1787 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1788     int flags, cred_t *cr)
1789 {
1790 	ASSERT(0);
1791 	return (0);
1792 }
1793 #endif
1794 
1795 /*
1796  * Increase the file length
1797  *
1798  *	IN:	zp	- znode of file to free data in.
1799  *		end	- new end-of-file
1800  *
1801  *	RETURN:	0 on success, error code on failure
1802  */
1803 static int
1804 zfs_extend(znode_t *zp, uint64_t end)
1805 {
1806 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1807 	dmu_tx_t *tx;
1808 	rl_t *rl;
1809 	uint64_t newblksz;
1810 	int error;
1811 
1812 	/*
1813 	 * We will change zp_size, lock the whole file.
1814 	 */
1815 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1816 
1817 	/*
1818 	 * Nothing to do if file already at desired length.
1819 	 */
1820 	if (end <= zp->z_size) {
1821 		zfs_range_unlock(rl);
1822 		return (0);
1823 	}
1824 	tx = dmu_tx_create(zfsvfs->z_os);
1825 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1826 	zfs_sa_upgrade_txholds(tx, zp);
1827 	if (end > zp->z_blksz &&
1828 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1829 		/*
1830 		 * We are growing the file past the current block size.
1831 		 */
1832 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1833 			/*
1834 			 * File's blocksize is already larger than the
1835 			 * "recordsize" property.  Only let it grow to
1836 			 * the next power of 2.
1837 			 */
1838 			ASSERT(!ISP2(zp->z_blksz));
1839 			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1840 		} else {
1841 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1842 		}
1843 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1844 	} else {
1845 		newblksz = 0;
1846 	}
1847 
1848 	error = dmu_tx_assign(tx, TXG_WAIT);
1849 	if (error) {
1850 		dmu_tx_abort(tx);
1851 		zfs_range_unlock(rl);
1852 		return (error);
1853 	}
1854 
1855 	if (newblksz)
1856 		zfs_grow_blocksize(zp, newblksz, tx);
1857 
1858 	zp->z_size = end;
1859 
1860 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
1861 	    &zp->z_size, sizeof (zp->z_size), tx));
1862 
1863 	vnode_pager_setsize(ZTOV(zp), end);
1864 
1865 	zfs_range_unlock(rl);
1866 
1867 	dmu_tx_commit(tx);
1868 
1869 	return (0);
1870 }
1871 
1872 /*
1873  * Free space in a file.
1874  *
1875  *	IN:	zp	- znode of file to free data in.
1876  *		off	- start of section to free.
1877  *		len	- length of section to free.
1878  *
1879  *	RETURN:	0 on success, error code on failure
1880  */
1881 static int
1882 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1883 {
1884 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1885 	rl_t *rl;
1886 	int error;
1887 
1888 	/*
1889 	 * Lock the range being freed.
1890 	 */
1891 	rl = zfs_range_lock(zp, off, len, RL_WRITER);
1892 
1893 	/*
1894 	 * Nothing to do if file already at desired length.
1895 	 */
1896 	if (off >= zp->z_size) {
1897 		zfs_range_unlock(rl);
1898 		return (0);
1899 	}
1900 
1901 	if (off + len > zp->z_size)
1902 		len = zp->z_size - off;
1903 
1904 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1905 
1906 	if (error == 0) {
1907 		/*
1908 		 * In FreeBSD we cannot free block in the middle of a file,
1909 		 * but only at the end of a file, so this code path should
1910 		 * never happen.
1911 		 */
1912 		vnode_pager_setsize(ZTOV(zp), off);
1913 	}
1914 
1915 	zfs_range_unlock(rl);
1916 
1917 	return (error);
1918 }
1919 
1920 /*
1921  * Truncate a file
1922  *
1923  *	IN:	zp	- znode of file to free data in.
1924  *		end	- new end-of-file.
1925  *
1926  *	RETURN:	0 on success, error code on failure
1927  */
1928 static int
1929 zfs_trunc(znode_t *zp, uint64_t end)
1930 {
1931 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1932 	vnode_t *vp = ZTOV(zp);
1933 	dmu_tx_t *tx;
1934 	rl_t *rl;
1935 	int error;
1936 	sa_bulk_attr_t bulk[2];
1937 	int count = 0;
1938 
1939 	/*
1940 	 * We will change zp_size, lock the whole file.
1941 	 */
1942 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1943 
1944 	/*
1945 	 * Nothing to do if file already at desired length.
1946 	 */
1947 	if (end >= zp->z_size) {
1948 		zfs_range_unlock(rl);
1949 		return (0);
1950 	}
1951 
1952 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
1953 	if (error) {
1954 		zfs_range_unlock(rl);
1955 		return (error);
1956 	}
1957 	tx = dmu_tx_create(zfsvfs->z_os);
1958 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1959 	zfs_sa_upgrade_txholds(tx, zp);
1960 	dmu_tx_mark_netfree(tx);
1961 	error = dmu_tx_assign(tx, TXG_WAIT);
1962 	if (error) {
1963 		dmu_tx_abort(tx);
1964 		zfs_range_unlock(rl);
1965 		return (error);
1966 	}
1967 
1968 	zp->z_size = end;
1969 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1970 	    NULL, &zp->z_size, sizeof (zp->z_size));
1971 
1972 	if (end == 0) {
1973 		zp->z_pflags &= ~ZFS_SPARSE;
1974 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1975 		    NULL, &zp->z_pflags, 8);
1976 	}
1977 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1978 
1979 	dmu_tx_commit(tx);
1980 
1981 	/*
1982 	 * Clear any mapped pages in the truncated region.  This has to
1983 	 * happen outside of the transaction to avoid the possibility of
1984 	 * a deadlock with someone trying to push a page that we are
1985 	 * about to invalidate.
1986 	 */
1987 	vnode_pager_setsize(vp, end);
1988 
1989 	zfs_range_unlock(rl);
1990 
1991 	return (0);
1992 }
1993 
1994 /*
1995  * Free space in a file
1996  *
1997  *	IN:	zp	- znode of file to free data in.
1998  *		off	- start of range
1999  *		len	- end of range (0 => EOF)
2000  *		flag	- current file open mode flags.
2001  *		log	- TRUE if this action should be logged
2002  *
2003  *	RETURN:	0 on success, error code on failure
2004  */
2005 int
2006 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
2007 {
2008 	vnode_t *vp = ZTOV(zp);
2009 	dmu_tx_t *tx;
2010 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2011 	zilog_t *zilog = zfsvfs->z_log;
2012 	uint64_t mode;
2013 	uint64_t mtime[2], ctime[2];
2014 	sa_bulk_attr_t bulk[3];
2015 	int count = 0;
2016 	int error;
2017 
2018 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
2019 	    sizeof (mode))) != 0)
2020 		return (error);
2021 
2022 	if (off > zp->z_size) {
2023 		error =  zfs_extend(zp, off+len);
2024 		if (error == 0 && log)
2025 			goto log;
2026 		else
2027 			return (error);
2028 	}
2029 
2030 	/*
2031 	 * Check for any locks in the region to be freed.
2032 	 */
2033 
2034 	if (MANDLOCK(vp, (mode_t)mode)) {
2035 		uint64_t length = (len ? len : zp->z_size - off);
2036 		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
2037 			return (error);
2038 	}
2039 
2040 	if (len == 0) {
2041 		error = zfs_trunc(zp, off);
2042 	} else {
2043 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
2044 		    off + len > zp->z_size)
2045 			error = zfs_extend(zp, off+len);
2046 	}
2047 	if (error || !log)
2048 		return (error);
2049 log:
2050 	tx = dmu_tx_create(zfsvfs->z_os);
2051 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2052 	zfs_sa_upgrade_txholds(tx, zp);
2053 	error = dmu_tx_assign(tx, TXG_WAIT);
2054 	if (error) {
2055 		dmu_tx_abort(tx);
2056 		return (error);
2057 	}
2058 
2059 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
2060 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
2061 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
2062 	    NULL, &zp->z_pflags, 8);
2063 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
2064 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2065 	ASSERT(error == 0);
2066 
2067 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
2068 
2069 	dmu_tx_commit(tx);
2070 	return (0);
2071 }
2072 
2073 void
2074 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
2075 {
2076 	uint64_t	moid, obj, sa_obj, version;
2077 	uint64_t	sense = ZFS_CASE_SENSITIVE;
2078 	uint64_t	norm = 0;
2079 	nvpair_t	*elem;
2080 	int		error;
2081 	int		i;
2082 	znode_t		*rootzp = NULL;
2083 	zfsvfs_t	*zfsvfs;
2084 	vattr_t		vattr;
2085 	znode_t		*zp;
2086 	zfs_acl_ids_t	acl_ids;
2087 
2088 	/*
2089 	 * First attempt to create master node.
2090 	 */
2091 	/*
2092 	 * In an empty objset, there are no blocks to read and thus
2093 	 * there can be no i/o errors (which we assert below).
2094 	 */
2095 	moid = MASTER_NODE_OBJ;
2096 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
2097 	    DMU_OT_NONE, 0, tx);
2098 	ASSERT(error == 0);
2099 
2100 	/*
2101 	 * Set starting attributes.
2102 	 */
2103 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
2104 	elem = NULL;
2105 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
2106 		/* For the moment we expect all zpl props to be uint64_ts */
2107 		uint64_t val;
2108 		char *name;
2109 
2110 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
2111 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
2112 		name = nvpair_name(elem);
2113 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
2114 			if (val < version)
2115 				version = val;
2116 		} else {
2117 			error = zap_update(os, moid, name, 8, 1, &val, tx);
2118 		}
2119 		ASSERT(error == 0);
2120 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
2121 			norm = val;
2122 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
2123 			sense = val;
2124 	}
2125 	ASSERT(version != 0);
2126 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
2127 
2128 	/*
2129 	 * Create zap object used for SA attribute registration
2130 	 */
2131 
2132 	if (version >= ZPL_VERSION_SA) {
2133 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2134 		    DMU_OT_NONE, 0, tx);
2135 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2136 		ASSERT(error == 0);
2137 	} else {
2138 		sa_obj = 0;
2139 	}
2140 	/*
2141 	 * Create a delete queue.
2142 	 */
2143 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
2144 
2145 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
2146 	ASSERT(error == 0);
2147 
2148 	/*
2149 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
2150 	 * to allow zfs_mknode to work.
2151 	 */
2152 	VATTR_NULL(&vattr);
2153 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
2154 	vattr.va_type = VDIR;
2155 	vattr.va_mode = S_IFDIR|0755;
2156 	vattr.va_uid = crgetuid(cr);
2157 	vattr.va_gid = crgetgid(cr);
2158 
2159 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
2160 
2161 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
2162 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
2163 	rootzp->z_moved = 0;
2164 	rootzp->z_unlinked = 0;
2165 	rootzp->z_atime_dirty = 0;
2166 	rootzp->z_is_sa = USE_SA(version, os);
2167 
2168 	zfsvfs->z_os = os;
2169 	zfsvfs->z_parent = zfsvfs;
2170 	zfsvfs->z_version = version;
2171 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
2172 	zfsvfs->z_use_sa = USE_SA(version, os);
2173 	zfsvfs->z_norm = norm;
2174 
2175 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
2176 	    &zfsvfs->z_attr_table);
2177 
2178 	ASSERT(error == 0);
2179 
2180 	/*
2181 	 * Fold case on file systems that are always or sometimes case
2182 	 * insensitive.
2183 	 */
2184 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
2185 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
2186 
2187 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
2188 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
2189 	    offsetof(znode_t, z_link_node));
2190 
2191 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
2192 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
2193 
2194 	rootzp->z_zfsvfs = zfsvfs;
2195 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
2196 	    cr, NULL, &acl_ids));
2197 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
2198 	ASSERT3P(zp, ==, rootzp);
2199 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
2200 	ASSERT(error == 0);
2201 	zfs_acl_ids_free(&acl_ids);
2202 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
2203 
2204 	sa_handle_destroy(rootzp->z_sa_hdl);
2205 	kmem_cache_free(znode_cache, rootzp);
2206 
2207 	/*
2208 	 * Create shares directory
2209 	 */
2210 
2211 	error = zfs_create_share_dir(zfsvfs, tx);
2212 
2213 	ASSERT(error == 0);
2214 
2215 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
2216 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
2217 	mutex_destroy(&zfsvfs->z_znodes_lock);
2218 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
2219 }
2220 #endif /* _KERNEL */
2221 
2222 static int
2223 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
2224 {
2225 	uint64_t sa_obj = 0;
2226 	int error;
2227 
2228 	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
2229 	if (error != 0 && error != ENOENT)
2230 		return (error);
2231 
2232 	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2233 	return (error);
2234 }
2235 
2236 static int
2237 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
2238     dmu_buf_t **db, void *tag)
2239 {
2240 	dmu_object_info_t doi;
2241 	int error;
2242 
2243 	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
2244 		return (error);
2245 
2246 	dmu_object_info_from_db(*db, &doi);
2247 	if ((doi.doi_bonus_type != DMU_OT_SA &&
2248 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
2249 	    doi.doi_bonus_type == DMU_OT_ZNODE &&
2250 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
2251 		sa_buf_rele(*db, tag);
2252 		return (SET_ERROR(ENOTSUP));
2253 	}
2254 
2255 	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2256 	if (error != 0) {
2257 		sa_buf_rele(*db, tag);
2258 		return (error);
2259 	}
2260 
2261 	return (0);
2262 }
2263 
2264 void
2265 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
2266 {
2267 	sa_handle_destroy(hdl);
2268 	sa_buf_rele(db, tag);
2269 }
2270 
2271 /*
2272  * Given an object number, return its parent object number and whether
2273  * or not the object is an extended attribute directory.
2274  */
2275 static int
2276 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2277     uint64_t *pobjp, int *is_xattrdir)
2278 {
2279 	uint64_t parent;
2280 	uint64_t pflags;
2281 	uint64_t mode;
2282 	uint64_t parent_mode;
2283 	sa_bulk_attr_t bulk[3];
2284 	sa_handle_t *sa_hdl;
2285 	dmu_buf_t *sa_db;
2286 	int count = 0;
2287 	int error;
2288 
2289 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2290 	    &parent, sizeof (parent));
2291 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2292 	    &pflags, sizeof (pflags));
2293 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2294 	    &mode, sizeof (mode));
2295 
2296 	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2297 		return (error);
2298 
2299 	/*
2300 	 * When a link is removed its parent pointer is not changed and will
2301 	 * be invalid.  There are two cases where a link is removed but the
2302 	 * file stays around, when it goes to the delete queue and when there
2303 	 * are additional links.
2304 	 */
2305 	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2306 	if (error != 0)
2307 		return (error);
2308 
2309 	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2310 	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2311 	if (error != 0)
2312 		return (error);
2313 
2314 	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2315 
2316 	/*
2317 	 * Extended attributes can be applied to files, directories, etc.
2318 	 * Otherwise the parent must be a directory.
2319 	 */
2320 	if (!*is_xattrdir && !S_ISDIR(parent_mode))
2321 		return (SET_ERROR(EINVAL));
2322 
2323 	*pobjp = parent;
2324 
2325 	return (0);
2326 }
2327 
2328 /*
2329  * Given an object number, return some zpl level statistics
2330  */
2331 static int
2332 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2333     zfs_stat_t *sb)
2334 {
2335 	sa_bulk_attr_t bulk[4];
2336 	int count = 0;
2337 
2338 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2339 	    &sb->zs_mode, sizeof (sb->zs_mode));
2340 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2341 	    &sb->zs_gen, sizeof (sb->zs_gen));
2342 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2343 	    &sb->zs_links, sizeof (sb->zs_links));
2344 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2345 	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2346 
2347 	return (sa_bulk_lookup(hdl, bulk, count));
2348 }
2349 
2350 static int
2351 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2352     sa_attr_type_t *sa_table, char *buf, int len)
2353 {
2354 	sa_handle_t *sa_hdl;
2355 	sa_handle_t *prevhdl = NULL;
2356 	dmu_buf_t *prevdb = NULL;
2357 	dmu_buf_t *sa_db = NULL;
2358 	char *path = buf + len - 1;
2359 	int error;
2360 
2361 	*path = '\0';
2362 	sa_hdl = hdl;
2363 
2364 	for (;;) {
2365 		uint64_t pobj;
2366 		char component[MAXNAMELEN + 2];
2367 		size_t complen;
2368 		int is_xattrdir;
2369 
2370 		if (prevdb)
2371 			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2372 
2373 		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2374 		    &is_xattrdir)) != 0)
2375 			break;
2376 
2377 		if (pobj == obj) {
2378 			if (path[0] != '/')
2379 				*--path = '/';
2380 			break;
2381 		}
2382 
2383 		component[0] = '/';
2384 		if (is_xattrdir) {
2385 			(void) sprintf(component + 1, "<xattrdir>");
2386 		} else {
2387 			error = zap_value_search(osp, pobj, obj,
2388 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2389 			if (error != 0)
2390 				break;
2391 		}
2392 
2393 		complen = strlen(component);
2394 		path -= complen;
2395 		ASSERT(path >= buf);
2396 		bcopy(component, path, complen);
2397 		obj = pobj;
2398 
2399 		if (sa_hdl != hdl) {
2400 			prevhdl = sa_hdl;
2401 			prevdb = sa_db;
2402 		}
2403 		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2404 		if (error != 0) {
2405 			sa_hdl = prevhdl;
2406 			sa_db = prevdb;
2407 			break;
2408 		}
2409 	}
2410 
2411 	if (sa_hdl != NULL && sa_hdl != hdl) {
2412 		ASSERT(sa_db != NULL);
2413 		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2414 	}
2415 
2416 	if (error == 0)
2417 		(void) memmove(buf, path, buf + len - path);
2418 
2419 	return (error);
2420 }
2421 
2422 int
2423 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2424 {
2425 	sa_attr_type_t *sa_table;
2426 	sa_handle_t *hdl;
2427 	dmu_buf_t *db;
2428 	int error;
2429 
2430 	error = zfs_sa_setup(osp, &sa_table);
2431 	if (error != 0)
2432 		return (error);
2433 
2434 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2435 	if (error != 0)
2436 		return (error);
2437 
2438 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2439 
2440 	zfs_release_sa_handle(hdl, db, FTAG);
2441 	return (error);
2442 }
2443 
2444 int
2445 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2446     char *buf, int len)
2447 {
2448 	char *path = buf + len - 1;
2449 	sa_attr_type_t *sa_table;
2450 	sa_handle_t *hdl;
2451 	dmu_buf_t *db;
2452 	int error;
2453 
2454 	*path = '\0';
2455 
2456 	error = zfs_sa_setup(osp, &sa_table);
2457 	if (error != 0)
2458 		return (error);
2459 
2460 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2461 	if (error != 0)
2462 		return (error);
2463 
2464 	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2465 	if (error != 0) {
2466 		zfs_release_sa_handle(hdl, db, FTAG);
2467 		return (error);
2468 	}
2469 
2470 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2471 
2472 	zfs_release_sa_handle(hdl, db, FTAG);
2473 	return (error);
2474 }
2475 
2476 #ifdef _KERNEL
2477 int
2478 zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
2479 {
2480 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2481 	uint64_t parent;
2482 	int is_xattrdir;
2483 	int err;
2484 
2485 	/* Extended attributes should not be visible as regular files. */
2486 	if ((zp->z_pflags & ZFS_XATTR) != 0)
2487 		return (SET_ERROR(EINVAL));
2488 
2489 	err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
2490 	    &parent, &is_xattrdir);
2491 	if (err != 0)
2492 		return (err);
2493 	ASSERT0(is_xattrdir);
2494 
2495 	/* No name as this is a root object. */
2496 	if (parent == zp->z_id)
2497 		return (SET_ERROR(EINVAL));
2498 
2499 	err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
2500 	    ZFS_DIRENT_OBJ(-1ULL), buf);
2501 	if (err != 0)
2502 		return (err);
2503 	err = zfs_zget(zfsvfs, parent, dzpp);
2504 	return (err);
2505 }
2506 #endif /* _KERNEL */
2507