xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c (revision 7f21db1c0118155e0dd40b75182e30c589d9f63e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Portions Copyright 2007 Jeremy Teo */
27 
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/kmem.h>
42 #include <sys/errno.h>
43 #include <sys/unistd.h>
44 #include <sys/atomic.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zfs_acl.h>
47 #include <sys/zfs_ioctl.h>
48 #include <sys/zfs_rlock.h>
49 #include <sys/zfs_fuid.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/kidmap.h>
52 #endif /* _KERNEL */
53 
54 #include <sys/dmu.h>
55 #include <sys/refcount.h>
56 #include <sys/stat.h>
57 #include <sys/zap.h>
58 #include <sys/zfs_znode.h>
59 
60 #include "zfs_prop.h"
61 
#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>
/*
 * genfs glue for ZFS vnodes: gop_write is routed to the generic
 * compatibility implementation.
 */
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

#endif
69 
70 extern int (**zfs_vnodeop_p)(void *);
71 extern int (**zfs_fifoop_p)(void *);
72 extern int (**zfs_specop_p)(void *);
73 
74 /*
75  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
76  * turned on when DEBUG is also defined.
77  */
78 #ifdef	DEBUG
79 #define	ZNODE_STATS
80 #endif	/* DEBUG */
81 
82 #ifdef	ZNODE_STATS
83 #define	ZNODE_STAT_ADD(stat)			((stat)++)
84 #else
85 #define	ZNODE_STAT_ADD(stat)			/* nothing */
86 #endif	/* ZNODE_STATS */
87 
88 #define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
89 #define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
90 
91 /*
92  * Functions needed for userland (ie: libzpool) are not put under
93  * #ifdef_KERNEL; the rest of the functions have dependencies
94  * (such as VFS logic) that will not compile easily in userland.
95  */
96 #ifdef _KERNEL
97 static kmem_cache_t *znode_cache = NULL;
98 
99 /*ARGSUSED*/
100 static void
101 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
102 {
103 	/*
104 	 * We should never drop all dbuf refs without first clearing
105 	 * the eviction callback.
106 	 */
107 	panic("evicting znode %p\n", user_ptr);
108 }
109 
110 /*ARGSUSED*/
111 static int
112 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
113 {
114 	znode_t *zp = arg;
115 
116 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
117 
118 	list_link_init(&zp->z_link_node);
119 
120 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
121 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
122 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
123 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
124 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
125 
126 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
127 	avl_create(&zp->z_range_avl, zfs_range_compare,
128 	    sizeof (rl_t), offsetof(rl_t, r_node));
129 
130 	zp->z_dbuf = NULL;
131 	zp->z_dirlocks = NULL;
132 	return (0);
133 }
134 
135 /*ARGSUSED*/
136 static void
137 zfs_znode_cache_destructor(void *buf, void *arg)
138 {
139 	znode_t *zp = arg;
140 
141 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
142 	ASSERT(ZTOV(zp) == NULL);
143 
144 	ASSERT(!list_link_active(&zp->z_link_node));
145 	mutex_destroy(&zp->z_lock);
146 	rw_destroy(&zp->z_map_lock);
147 	rw_destroy(&zp->z_parent_lock);
148 	rw_destroy(&zp->z_name_lock);
149 	mutex_destroy(&zp->z_acl_lock);
150 	avl_destroy(&zp->z_range_avl);
151 	mutex_destroy(&zp->z_range_lock);
152 
153 	ASSERT(zp->z_dbuf == NULL);
154 	ASSERT(zp->z_dirlocks == NULL);
155 }
156 
#ifdef	ZNODE_STATS
/*
 * Counters for the outcomes of zfs_znode_move(); each records one
 * reason a move request was refused or deferred.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck_invalid;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */
167 
/*
 * Transfer the state of znode ozp into the freshly constructed znode
 * nzp as part of a kmem cache move, then invalidate ozp so any later
 * move callback recognizes it as dead.  The caller (zfs_znode_move())
 * holds all the locks required to make this safe.
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/* Update back pointers: the dbuf's user is now nzp. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}
211 
212 /*
213  * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
214  * returns a non-zero error code.
215  */
216 static int
217 zfs_enter(zfsvfs_t *zfsvfs)
218 {
219 	ZFS_ENTER(zfsvfs);
220 	return (0);
221 }
222 
#ifndef __NetBSD__
/*ARGSUSED*/
/*
 * kmem cache move callback: relocate the znode in "buf" into "newbuf"
 * if and only if it is safe to do so.  Returns KMEM_CBRC_DONT_KNOW when
 * the znode cannot be identified or the filesystem is unmounting,
 * KMEM_CBRC_LATER when locks or extra references currently prevent the
 * move, and KMEM_CBRC_YES after a successful move.  (Compiled out on
 * NetBSD, which has no kmem move callbacks.)
 */
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the filesystem is not unmounted during the move.
	 */
	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Replace the old znode with the new one on the per-fs list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif	/* !__NetBSD__ */
312 
313 void
314 zfs_znode_init(void)
315 {
316 	/*
317 	 * Initialize zcache
318 	 */
319 	ASSERT(znode_cache == NULL);
320 	znode_cache = kmem_cache_create("zfs_znode_cache",
321 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
322 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
323 }
324 
325 void
326 zfs_znode_fini(void)
327 {
328 
329 	/*
330 	 * Cleanup zcache
331 	 */
332 	if (znode_cache)
333 		kmem_cache_destroy(znode_cache);
334 	znode_cache = NULL;
335 }
336 
#ifndef __NetBSD__
/* Solaris vnode operations vectors, built by zfs_create_op_tables(). */
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
#endif
/*
 * Tear down the vfs and vnode operations tables created by
 * zfs_create_op_tables().  On NetBSD the operations vectors are managed
 * elsewhere, so this function is a no-op.
 */
void
zfs_remove_op_tables()
{
#ifndef __NetBSD__
	/*
	 * Remove vfs ops
	 */
	ASSERT(zfsfstype);
	(void) vfs_freevfsops_by_type(zfsfstype);
	zfsfstype = 0;

	/*
	 * Remove vnode ops
	 */
	if (zfs_dvnodeops)
		vn_freevnodeops(zfs_dvnodeops);
	if (zfs_fvnodeops)
		vn_freevnodeops(zfs_fvnodeops);
	if (zfs_symvnodeops)
		vn_freevnodeops(zfs_symvnodeops);
	if (zfs_xdvnodeops)
		vn_freevnodeops(zfs_xdvnodeops);
	if (zfs_evnodeops)
		vn_freevnodeops(zfs_evnodeops);

	zfs_dvnodeops = NULL;
	zfs_fvnodeops = NULL;
	zfs_symvnodeops = NULL;
	zfs_xdvnodeops = NULL;
	zfs_evnodeops = NULL;
#endif
}
376 #ifndef __NetBSD__
377 extern const fs_operation_def_t zfs_dvnodeops_template[];
378 extern const fs_operation_def_t zfs_fvnodeops_template[];
379 extern const fs_operation_def_t zfs_xdvnodeops_template[];
380 extern const fs_operation_def_t zfs_symvnodeops_template[];
381 extern const fs_operation_def_t zfs_evnodeops_template[];
382 #endif
/*
 * Build the Solaris vnode operations tables from their templates.
 * Returns 0 on success or the first vn_make_ops() error.  On NetBSD
 * the vectors are set up elsewhere, so this simply returns 0.
 */
int
zfs_create_op_tables()
{
#ifndef __NetBSD__
	int error;

	/*
	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
	 * In this case we just return as the ops vectors are already set up.
	 */
	if (zfs_dvnodeops)
		return (0);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
	    &zfs_fvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
	    &zfs_symvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
	    &zfs_xdvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
	    &zfs_evnodeops);

	return (error);
#endif
	return 0;
}
424 
425 /*
426  * zfs_init_fs - Initialize the zfsvfs struct and the file system
427  *	incore "master" object.  Verify version compatibility.
428  */
429 int
430 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
431 {
432 	extern int zfsfstype;
433 
434 	objset_t	*os = zfsvfs->z_os;
435 	int		i, error;
436 	uint64_t fsid_guid;
437 	uint64_t zval;
438 
439 	*zpp = NULL;
440 
441 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
442 	if (error) {
443 		return (error);
444 	} else if (zfsvfs->z_version > ZPL_VERSION) {
445 		(void) printf("Mismatched versions:  File system "
446 		    "is version %llu on-disk format, which is "
447 		    "incompatible with this software version %lld!",
448 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
449 		return (ENOTSUP);
450 	}
451 
452 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
453 		return (error);
454 	zfsvfs->z_norm = (int)zval;
455 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
456 		return (error);
457 	zfsvfs->z_utf8 = (zval != 0);
458 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
459 		return (error);
460 	zfsvfs->z_case = (uint_t)zval;
461 	/*
462 	 * Fold case on file systems that are always or sometimes case
463 	 * insensitive.
464 	 */
465 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
466 	    zfsvfs->z_case == ZFS_CASE_MIXED)
467 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
468 
469 	/*
470 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
471 	 * separates our fsid from any other filesystem types, and a
472 	 * 56-bit objset unique ID.  The objset unique ID is unique to
473 	 * all objsets open on this system, provided by unique_create().
474 	 * The 8-bit fs type must be put in the low bits of fsid[1]
475 	 * because that's where other Solaris filesystems put it.
476 	 */
477 	fsid_guid = dmu_objset_fsid_guid(os);
478 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
479 	zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
480 	zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
481 	    zfsfstype & 0xFF;
482 	zfsvfs->z_vfs->mnt_stat.f_fsid = fsid_guid;
483 
484 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
485 	    &zfsvfs->z_root);
486 	if (error)
487 		return (error);
488 	ASSERT(zfsvfs->z_root != 0);
489 
490 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
491 	    &zfsvfs->z_unlinkedobj);
492 	if (error)
493 		return (error);
494 
495 	/*
496 	 * Initialize zget mutex's
497 	 */
498 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
499 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
500 
501 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
502 	if (error) {
503 		/*
504 		 * On error, we destroy the mutexes here since it's not
505 		 * possible for the caller to determine if the mutexes were
506 		 * initialized properly.
507 		 */
508 		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
509 			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
510 		return (error);
511 	}
512 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
513 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
514 	    &zfsvfs->z_fuid_obj);
515 	if (error == ENOENT)
516 		error = 0;
517 
518 	return (0);
519 }
520 
521 /*
522  * define a couple of values we need available
523  * for both 64 and 32 bit environments.
524  */
525 #ifndef NBITSMINOR64
526 #define	NBITSMINOR64	32
527 #endif
528 #ifndef MAXMAJ64
529 #define	MAXMAJ64	0xffffffffUL
530 #endif
531 #ifndef	MAXMIN64
532 #define	MAXMIN64	0xffffffffUL
533 #endif
534 
535 /*
536  * Create special expldev for ZFS private use.
537  * Can't use standard expldev since it doesn't do
538  * what we want.  The standard expldev() takes a
539  * dev32_t in LP64 and expands it to a long dev_t.
540  * We need an interface that takes a dev32_t in ILP32
541  * and expands it to a long dev_t.
542  */
543 static uint64_t
544 zfs_expldev(dev_t dev)
545 {
546 	return ((uint64_t)major(dev) << NBITSMINOR64) |
547 	    (minor_t)minor(dev);
548 }
549 
550 /*
551  * Special cmpldev for ZFS private use.
552  * Can't use standard cmpldev since it takes
553  * a long dev_t and compresses it to dev32_t in
554  * LP64.  We need to do a compaction of a long dev_t
555  * to a dev32_t in ILP32.
556  */
557 dev_t
558 zfs_cmpldev(uint64_t dev)
559 {
560 	minor_t minor = (minor_t)dev & MAXMIN64;
561 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
562 
563 	return makedev(minor, major);
564 }
565 
/*
 * Attach znode zp to its bonus dbuf db: record the dbuf, register the
 * znode as the dbuf's user (with znode_evict_error as the eviction
 * callback), and flag the vnode VROOT when this is the root object.
 * The caller must hold the per-object zget mutex for zp->z_id.
 */
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t		*nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	zp->z_dbuf = db;
	/* Returns the previous user, if any -- there must be none. */
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * there should be no
	 * concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
}
595 
/*
 * Detach znode zp from its bonus dbuf (clearing the eviction callback)
 * and drop the dbuf hold.  Legal only while the object's zget mutex is
 * held, the znode is unlinked, or the teardown lock is write-held, so
 * that no concurrent zfs_zget() can observe the half-torn-down znode.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	/* The dbuf's registered user must have been this znode. */
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}
608 
609 /*
610  * Construct a new znode/vnode and intialize.
611  *
612  * This does not do a call to dmu_set_user() that is
613  * up to the caller to do, in case you don't want to
614  * return the znode
615  */
616 
617 static znode_t *
618 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
619 {
620 	znode_t	*zp;
621 	vnode_t *vp;
622 	int error;
623 
624 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
625 	for (;;) {
626 
627 		error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
628 		    zfs_vnodeop_p, &zp->z_vnode);
629 		if (__predict_true(error == 0))
630 			break;
631 		printf("WARNING: zfs_znode_alloc: unable to get vnode, "
632 		    "error=%d\n", error);
633 		(void)kpause("zfsnewvn", false, hz, NULL);
634 	}
635 
636 	ASSERT(zp->z_dirlocks == NULL);
637 	ASSERT(zp->z_dbuf == NULL);
638 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
639 
640 	/*
641 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
642 	 * the zfs_znode_move() callback.
643 	 */
644 	zp->z_phys = NULL;
645 	zp->z_unlinked = 0;
646 	zp->z_atime_dirty = 0;
647 	zp->z_mapcnt = 0;
648 	zp->z_last_itx = 0;
649 	zp->z_id = db->db_object;
650 	zp->z_blksz = blksz;
651 	zp->z_seq = 0x7A4653;
652 	zp->z_sync_cnt = 0;
653 
654 	vp = ZTOV(zp);
655 
656 	zfs_znode_dmu_init(zfsvfs, zp, db);
657 
658 	zp->z_gen = zp->z_phys->zp_gen;
659 
660 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
661 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
662 	vp->v_data = zp;
663 	switch (vp->v_type) {
664 	case VDIR:
665 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
666 		break;
667 	case VBLK:
668 	case VCHR:
669 	/* XXX NetBSD	vp->v_op = zfs_specop_p; */
670 		spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
671 		break;
672 	case VFIFO:
673 		/* XXX NetBSD vp->v_op = zfs_fifoop_p; */
674 		break;
675 	}
676 
677 	dprintf("zfs_znode_alloc znode %p -- vnode %p\n", zp, vp);
678 	dprintf("zfs_znode_alloc z_id %ld\n", zp->z_id);
679 	//cpu_Debugger();
680 
681 	uvm_vnp_setsize(vp, zp->z_phys->zp_size);
682 
683 	mutex_enter(&zfsvfs->z_znodes_lock);
684 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
685 	membar_producer();
686 	/*
687 	 * Everything else must be valid before assigning z_zfsvfs makes the
688 	 * znode eligible for zfs_znode_move().
689 	 */
690 	zp->z_zfsvfs = zfsvfs;
691 	mutex_exit(&zfsvfs->z_znodes_lock);
692 
693 	return (zp);
694 }
695 
696 /*
697  * Create a new DMU object to hold a zfs znode.
698  *
699  *	IN:	dzp	- parent directory for new znode
700  *		vap	- file attributes for new znode
701  *		tx	- dmu transaction id for zap operations
702  *		cr	- credentials of caller
703  *		flag	- flags:
704  *			  IS_ROOT_NODE	- new object will be root
705  *			  IS_XATTR	- new object is an attribute
706  *			  IS_REPLAY	- intent log replay
707  *		bonuslen - length of bonus buffer
708  *		setaclp  - File/Dir initial ACL
709  *		fuidp	 - Tracks fuid allocation.
710  *
711  *	OUT:	zpp	- allocated znode
712  *
713  */
714 void
715 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
716     uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
717     zfs_fuid_info_t **fuidp)
718 {
719 	dmu_buf_t	*db;
720 	znode_phys_t	*pzp;
721 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
722 	timestruc_t	now;
723 	uint64_t	gen, obj;
724 	int		err;
725 
726 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
727 
728 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
729 		obj = vap->va_nodeid;
730 		flag |= IS_REPLAY;
731 		now = vap->va_ctime;		/* see zfs_replay_create() */
732 		gen = vap->va_nblocks;		/* ditto */
733 	} else {
734 		obj = 0;
735 		gethrestime(&now);
736 		gen = dmu_tx_get_txg(tx);
737 	}
738 
739 	/*
740 	 * Create a new DMU object.
741 	 */
742 	/*
743 	 * There's currently no mechanism for pre-reading the blocks that will
744 	 * be to needed allocate a new object, so we accept the small chance
745 	 * that there will be an i/o error and we will fail one of the
746 	 * assertions below.
747 	 */
748 	if (vap->va_type == VDIR) {
749 		if (flag & IS_REPLAY) {
750 			err = zap_create_claim_norm(zfsvfs->z_os, obj,
751 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
752 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
753 			ASSERT3U(err, ==, 0);
754 		} else {
755 			obj = zap_create_norm(zfsvfs->z_os,
756 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
757 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
758 		}
759 	} else {
760 		if (flag & IS_REPLAY) {
761 			err = dmu_object_claim(zfsvfs->z_os, obj,
762 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
763 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
764 			ASSERT3U(err, ==, 0);
765 		} else {
766 			obj = dmu_object_alloc(zfsvfs->z_os,
767 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
768 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
769 		}
770 	}
771 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
772 	dmu_buf_will_dirty(db, tx);
773 
774 	/*
775 	 * Initialize the znode physical data to zero.
776 	 */
777 	ASSERT(db->db_size >= sizeof (znode_phys_t));
778 	bzero(db->db_data, db->db_size);
779 	pzp = db->db_data;
780 
781 	/*
782 	 * If this is the root, fix up the half-initialized parent pointer
783 	 * to reference the just-allocated physical data area.
784 	 */
785 	if (flag & IS_ROOT_NODE) {
786 		dzp->z_dbuf = db;
787 		dzp->z_phys = pzp;
788 		dzp->z_id = obj;
789 	}
790 
791 	/*
792 	 * If parent is an xattr, so am I.
793 	 */
794 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
795 		flag |= IS_XATTR;
796 
797 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
798 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
799 	}
800 
801 	if (zfsvfs->z_use_fuids)
802 		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
803 
804 	if (vap->va_type == VDIR) {
805 		pzp->zp_size = 2;		/* contents ("." and "..") */
806 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
807 	}
808 
809 	pzp->zp_parent = dzp->z_id;
810 	if (flag & IS_XATTR)
811 		pzp->zp_flags |= ZFS_XATTR;
812 
813 	pzp->zp_gen = gen;
814 
815 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
816 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
817 
818 	if (vap->va_mask & AT_ATIME) {
819 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
820 	} else {
821 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
822 	}
823 
824 	if (vap->va_mask & AT_MTIME) {
825 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
826 	} else {
827 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
828 	}
829 
830 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
831 	if (!(flag & IS_ROOT_NODE)) {
832 		dprintf("zfs_mknode parent vp %p - zp %p\n", ZTOV(dzp), dzp);
833 		dprintf("Going to lock %p with %ld\n", ZFS_OBJ_MUTEX(zfsvfs, obj), obj);
834 
835 		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
836 		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
837 
838 		genfs_node_init(ZTOV(*zpp), &zfs_genfsops);
839 
840 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
841 	} else {
842 		/*
843 		 * If we are creating the root node, the "parent" we
844 		 * passed in is the znode for the root.
845 		 */
846 		*zpp = dzp;
847 	}
848 	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
849 }
850 
/*
 * Copy the requested optional (extended) attributes from xvap into the
 * znode's on-disk state, marking each processed attribute with
 * XVA_SET_RTN().  Most attributes map to bits in zp_flags via
 * ZFS_ATTR_SET(); the AV scanstamp is copied into the bonus area
 * immediately following the znode_phys_t.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		/* Scanstamp lives in the bonus area after the phys struct. */
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
}
915 
/*
 * Look up znode obj_num in zfsvfs, returning it in *zpp with a
 * referenced vnode.  If the znode already exists its vnode is obtained
 * with vget() (retrying if the vnode or znode is being reclaimed);
 * otherwise a new znode/vnode pair is created.  Returns 0, ENOENT for
 * an unlinked znode, EINVAL for an object that is not a valid znode,
 * or a DMU error.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t   *db;
	znode_t     *zp;
	vnode_t     *vp;
	int err, first = 1;

	*zpp = NULL;
again:
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/* Sanity-check that the object really is a znode. */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	zp = dmu_buf_get_user(db);
	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		/*
		 * Since we do immediate eviction of the z_dbuf, we
		 * should never find a dbuf with a znode that doesn't
		 * know about the dbuf.
		 */
		ASSERT3P(zp->z_dbuf, ==, db);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = ENOENT;
		} else {
			if ((vp = ZTOV(zp)) != NULL) {
				mutex_enter(&vp->v_interlock);
				mutex_exit(&zp->z_lock);
				/*
				 * NOTE(review): this retry path jumps to
				 * "again" without ZFS_OBJ_HOLD_EXIT(), so
				 * the zget mutex appears to be re-entered
				 * while still held; also, NetBSD's
				 * vget(LK_INTERLOCK) is generally expected
				 * to release the interlock itself on
				 * failure, which would make the
				 * mutex_exit() below a double release.
				 * Confirm both against this kernel's
				 * vget()/mutex semantics.
				 */
				if (vget(vp, LK_INTERLOCK) != 0) {
					dmu_buf_rele(db, NULL);
					mutex_exit(&vp->v_interlock);
					goto again;
				}
				mutex_enter(&zp->z_lock);
			} else {
				if (first) {
					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
					first = 0;
				}
				/*
				 * znode is dying so we can't reuse it, we must
				 * wait until destruction is completed.
				 */
				dmu_buf_rele(db, NULL);
				mutex_exit(&zp->z_lock);
				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
				kpause("zcollide", 0, 1, NULL);
				goto again;
			}
			*zpp = zp;
			err = 0;
		}

		dmu_buf_rele(db, NULL);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
	vp = ZTOV(zp);

	genfs_node_init(vp, &zfs_genfsops);

	/* getnewvnode() returned the vnode locked; drop the lock. */
	VOP_UNLOCK(vp, 0);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	*zpp = zp;
	return (0);
}
1005 
/*
 * Re-acquire and validate the bonus dbuf for an existing znode and
 * reinitialize its DMU state (used when the znode has been detached
 * from its dbuf, e.g. across a teardown).  Returns EINVAL if the
 * object is no longer a valid znode, EIO if the on-disk generation no
 * longer matches zp->z_gen, or a DMU error.
 */
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	/* The object number may have been reused for a different file. */
	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EIO);
	}

	zfs_znode_dmu_init(zfsvfs, zp, db);
	zp->z_unlinked = (zp->z_phys->zp_links == 0);
	zp->z_blksz = doi.doi_data_block_size;

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}
1045 
/*
 * Free the DMU object backing zp (and its external ACL object, if any)
 * in transaction tx, then detach the znode from its dbuf and free the
 * znode itself.
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj)
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}
1061 
1062 /*
1063  * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock
1064  * will be released in zfs_zinactive.
1065  */
1066 void
1067 zfs_zinactive(znode_t *zp)
1068 {
1069 	vnode_t	*vp = ZTOV(zp);
1070 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1071 
1072 	ASSERT(zp->z_dbuf && zp->z_phys);
1073 
1074 	//printf("zfs_zinactive vp %p - zp %p\n", vp, zp);
1075 	//printf("Going to lock %p with %ld\n", ZFS_OBJ_MUTEX(zfsvfs, z_id), z_id);
1076 
1077 	mutex_enter(&zp->z_lock);
1078 	/*
1079 	 * If this was the last reference to a file with no links,
1080 	 * remove the file from the file system.
1081 	 */
1082 	if (zp->z_unlinked) {
1083 		mutex_exit(&zp->z_lock);
1084 		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
1085 		zfs_rmnode(zp);
1086 		return;
1087 	}
1088 
1089 	mutex_exit(&zp->z_lock);
1090 	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
1091 	zfs_znode_free(zp);
1092 }
1093 
/*
 * Release the in-core znode: unhook it from the per-filesystem znode
 * list, return it to the kmem cache, and drop a hold on the vfs
 * (presumably the one taken when the znode was allocated — confirm
 * against zfs_znode_alloc).  The vnode must already be gone.
 */
void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	ASSERT(ZTOV(zp) == NULL);

	dprintf("destroying znode %p\n", zp);
	mutex_enter(&zfsvfs->z_znodes_lock);
	/* Poison z_zfsvfs so stale users trip an assertion. */
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}
1111 
1112 void
1113 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1114 {
1115 	timestruc_t	now;
1116 
1117 	ASSERT(MUTEX_HELD(&zp->z_lock));
1118 
1119 	gethrestime(&now);
1120 
1121 	if (tx) {
1122 		dmu_buf_will_dirty(zp->z_dbuf, tx);
1123 		zp->z_atime_dirty = 0;
1124 		zp->z_seq++;
1125 	} else {
1126 		zp->z_atime_dirty = 1;
1127 	}
1128 
1129 	if (flag & AT_ATIME)
1130 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1131 
1132 	if (flag & AT_MTIME) {
1133 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1134 		if (zp->z_zfsvfs->z_use_fuids)
1135 			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1136 	}
1137 
1138 	if (flag & AT_CTIME) {
1139 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1140 		if (zp->z_zfsvfs->z_use_fuids)
1141 			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1142 	}
1143 }
1144 
1145 /*
1146  * Update the requested znode timestamps with the current time.
1147  * If we are in a transaction, then go ahead and mark the znode
1148  * dirty in the transaction so the timestamps will go to disk.
1149  * Otherwise, we will get pushed next time the znode is updated
1150  * in a transaction, or when this znode eventually goes inactive.
1151  *
1152  * Why is this OK?
1153  *  1 - Only the ACCESS time is ever updated outside of a transaction.
1154  *  2 - Multiple consecutive updates will be collapsed into a single
1155  *	znode update by the transaction grouping semantics of the DMU.
1156  */
1157 void
1158 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1159 {
1160 	mutex_enter(&zp->z_lock);
1161 	zfs_time_stamper_locked(zp, flag, tx);
1162 	mutex_exit(&zp->z_lock);
1163 }
1164 
1165 /*
1166  * Grow the block size for a file.
1167  *
1168  *	IN:	zp	- znode of file to free data in.
1169  *		size	- requested block size
1170  *		tx	- open transaction.
1171  *
1172  * NOTE: this function assumes that the znode is write locked.
1173  */
1174 void
1175 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1176 {
1177 	int		error;
1178 	u_longlong_t	dummy;
1179 
1180 	if (size <= zp->z_blksz)
1181 		return;
1182 	/*
1183 	 * If the file size is already greater than the current blocksize,
1184 	 * we will not grow.  If there is more than one block in a file,
1185 	 * the blocksize cannot change.
1186 	 */
1187 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1188 		return;
1189 
1190 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1191 	    size, 0, tx);
1192 	if (error == ENOTSUP)
1193 		return;
1194 	ASSERT3U(error, ==, 0);
1195 
1196 	/* What blocksize did we actually get? */
1197 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1198 }
1199 
1200 /*
1201  * Increase the file length
1202  *
1203  *	IN:	zp	- znode of file to free data in.
1204  *		end	- new end-of-file
1205  *
1206  * 	RETURN:	0 if success
1207  *		error code if failure
1208  */
1209 static int
1210 zfs_extend(znode_t *zp, uint64_t end)
1211 {
1212 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1213 	dmu_tx_t *tx;
1214 	rl_t *rl;
1215 	uint64_t newblksz;
1216 	int error;
1217 
1218 	/*
1219 	 * We will change zp_size, lock the whole file.
1220 	 */
1221 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1222 
1223 	/*
1224 	 * Nothing to do if file already at desired length.
1225 	 */
1226 	if (end <= zp->z_phys->zp_size) {
1227 		zfs_range_unlock(rl);
1228 		return (0);
1229 	}
1230 top:
1231 	tx = dmu_tx_create(zfsvfs->z_os);
1232 	dmu_tx_hold_bonus(tx, zp->z_id);
1233 	if (end > zp->z_blksz &&
1234 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1235 		/*
1236 		 * We are growing the file past the current block size.
1237 		 */
1238 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1239 			ASSERT(!ISP2(zp->z_blksz));
1240 			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1241 		} else {
1242 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1243 		}
1244 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1245 	} else {
1246 		newblksz = 0;
1247 	}
1248 
1249 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1250 	if (error) {
1251 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1252 			dmu_tx_wait(tx);
1253 			dmu_tx_abort(tx);
1254 			goto top;
1255 		}
1256 		dmu_tx_abort(tx);
1257 		zfs_range_unlock(rl);
1258 		return (error);
1259 	}
1260 	dmu_buf_will_dirty(zp->z_dbuf, tx);
1261 
1262 	if (newblksz)
1263 		zfs_grow_blocksize(zp, newblksz, tx);
1264 
1265 	zp->z_phys->zp_size = end;
1266 
1267 	zfs_range_unlock(rl);
1268 
1269 	dmu_tx_commit(tx);
1270 
1271 	rw_enter(&zp->z_map_lock, RW_WRITER);
1272 	uvm_vnp_setsize(ZTOV(zp), end);
1273 	rw_exit(&zp->z_map_lock);
1274 
1275 	return (0);
1276 }
1277 
1278 /*
1279  * Free space in a file.
1280  *
1281  *	IN:	zp	- znode of file to free data in.
1282  *		off	- start of section to free.
1283  *		len	- length of section to free.
1284  *
1285  * 	RETURN:	0 if success
1286  *		error code if failure
1287  */
1288 static int
1289 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1290 {
1291 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1292 	rl_t *rl;
1293 	int error;
1294 
1295 	/*
1296 	 * Lock the range being freed.
1297 	 */
1298 	rl = zfs_range_lock(zp, off, len, RL_WRITER);
1299 
1300 	/*
1301 	 * Nothing to do if file already at desired length.
1302 	 */
1303 	if (off >= zp->z_phys->zp_size) {
1304 		zfs_range_unlock(rl);
1305 		return (0);
1306 	}
1307 
1308 	if (off + len > zp->z_phys->zp_size)
1309 		len = zp->z_phys->zp_size - off;
1310 
1311 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1312 
1313 	if (error == 0) {
1314 		/*
1315 		 * In NetBSD we cannot free block in the middle of a file,
1316 		 * but only at the end of a file.
1317 		 */
1318 		rw_enter(&zp->z_map_lock, RW_WRITER);
1319 		uvm_vnp_setsize(ZTOV(zp), off);
1320 		rw_exit(&zp->z_map_lock);
1321 	}
1322 
1323 	zfs_range_unlock(rl);
1324 
1325 	return (error);
1326 }
1327 
1328 /*
1329  * Truncate a file
1330  *
1331  *	IN:	zp	- znode of file to free data in.
1332  *		end	- new end-of-file.
1333  *
1334  * 	RETURN:	0 if success
1335  *		error code if failure
1336  */
1337 static int
1338 zfs_trunc(znode_t *zp, uint64_t end)
1339 {
1340 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1341 	vnode_t *vp = ZTOV(zp);
1342 	dmu_tx_t *tx;
1343 	rl_t *rl;
1344 	int error;
1345 
1346 	/*
1347 	 * We will change zp_size, lock the whole file.
1348 	 */
1349 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1350 
1351 	/*
1352 	 * Nothing to do if file already at desired length.
1353 	 */
1354 	if (end >= zp->z_phys->zp_size) {
1355 		zfs_range_unlock(rl);
1356 		return (0);
1357 	}
1358 
1359 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
1360 	if (error) {
1361 		zfs_range_unlock(rl);
1362 		return (error);
1363 	}
1364 top:
1365 	tx = dmu_tx_create(zfsvfs->z_os);
1366 	dmu_tx_hold_bonus(tx, zp->z_id);
1367 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1368 	if (error) {
1369 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1370 			dmu_tx_wait(tx);
1371 			dmu_tx_abort(tx);
1372 			goto top;
1373 		}
1374 		dmu_tx_abort(tx);
1375 		zfs_range_unlock(rl);
1376 		return (error);
1377 	}
1378 	dmu_buf_will_dirty(zp->z_dbuf, tx);
1379 
1380 	zp->z_phys->zp_size = end;
1381 
1382 	dmu_tx_commit(tx);
1383 
1384 	zfs_range_unlock(rl);
1385 
1386 	/*
1387 	 * Clear any mapped pages in the truncated region.  This has to
1388 	 * happen outside of the transaction to avoid the possibility of
1389 	 * a deadlock with someone trying to push a page that we are
1390 	 * about to invalidate.
1391 	 */
1392 	rw_enter(&zp->z_map_lock, RW_WRITER);
1393 	uvm_vnp_setsize(vp, end);
1394 	rw_exit(&zp->z_map_lock);
1395 
1396 	return (0);
1397 }
1398 
1399 /*
1400  * Free space in a file
1401  *
1402  *	IN:	zp	- znode of file to free data in.
1403  *		off	- start of range
1404  *		len	- end of range (0 => EOF)
1405  *		flag	- current file open mode flags.
1406  *		log	- TRUE if this action should be logged
1407  *
1408  * 	RETURN:	0 if success
1409  *		error code if failure
1410  */
1411 int
1412 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1413 {
1414 	vnode_t *vp = ZTOV(zp);
1415 	dmu_tx_t *tx;
1416 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1417 	zilog_t *zilog = zfsvfs->z_log;
1418 	int error;
1419 
1420 	if (off > zp->z_phys->zp_size) {
1421 		error =  zfs_extend(zp, off+len);
1422 		if (error == 0 && log)
1423 			goto log;
1424 		else
1425 			return (error);
1426 	}
1427 
1428 	if (len == 0) {
1429 		error = zfs_trunc(zp, off);
1430 	} else {
1431 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1432 		    off + len > zp->z_phys->zp_size)
1433 			error = zfs_extend(zp, off+len);
1434 	}
1435 	if (error || !log)
1436 		return (error);
1437 log:
1438 	tx = dmu_tx_create(zfsvfs->z_os);
1439 	dmu_tx_hold_bonus(tx, zp->z_id);
1440 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1441 	if (error) {
1442 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1443 			dmu_tx_wait(tx);
1444 			dmu_tx_abort(tx);
1445 			goto log;
1446 		}
1447 		dmu_tx_abort(tx);
1448 		return (error);
1449 	}
1450 
1451 	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1452 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1453 
1454 	dmu_tx_commit(tx);
1455 	return (0);
1456 }
1457 
/*
 * Bootstrap a brand-new ZPL filesystem inside objset 'os':
 * create the master node, record the ZPL version and the supplied
 * zpl properties, create the unlinked (delete) set, and finally
 * create the root directory znode via zfs_mknode() using a minimal
 * temporary zfsvfs.  All on-disk changes go through 'tx'.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, doid, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	znode_t		*rootzp = NULL;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION;
	else
		version = ZPL_VERSION_FUID - 1;
	error = zap_update(os, moid, ZPL_VERSION_STR,
	    8, 1, &version, tx);
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* An explicit version property overrides the default. */
			version = val;
			error = zap_update(os, moid, ZPL_VERSION_STR,
			    8, 1, &version, tx);
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		/* Remember normalization/case props for the temp zfsvfs. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);

	/*
	 * Create a delete queue.
	 */
	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr_null(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	/* NetBSD: vnode allocation may fail transiently; retry until it works. */
	for (;;) {
		error = getnewvnode(VT_ZFS, NULL, zfs_vnodeop_p,
		    &rootzp->z_vnode);
		if (error == 0)
			break;
		printf("WARNING: zfs_create_fs: unable to get vnode, "
		    "error=%d\n", error);
		kpause("zfsvn", false, hz, NULL);
	}

	vp = ZTOV(rootzp);
	vp->v_type = VDIR;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_assign = TXG_NOWAIT;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
	ASSERT3P(zp, ==, rootzp);
	/* Record the root object number in the master node. */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/* Tear down the temporary znode/vnode used only for bootstrap. */
	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	ungetnewvnode(vp);
	kmem_cache_free(znode_cache, rootzp);
}
1585 
1586 #endif /* _KERNEL */
1587 /*
1588  * Given an object number, return its parent object number and whether
1589  * or not the object is an extended attribute directory.
1590  */
1591 static int
1592 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1593 {
1594 	dmu_buf_t *db;
1595 	dmu_object_info_t doi;
1596 	znode_phys_t *zp;
1597 	int error;
1598 
1599 	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1600 		return (error);
1601 
1602 	dmu_object_info_from_db(db, &doi);
1603 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1604 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1605 		dmu_buf_rele(db, FTAG);
1606 		return (EINVAL);
1607 	}
1608 
1609 	zp = db->db_data;
1610 	*pobjp = zp->zp_parent;
1611 	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1612 	    S_ISDIR(zp->zp_mode);
1613 	dmu_buf_rele(db, FTAG);
1614 
1615 	return (0);
1616 }
1617 
/*
 * Reconstruct the pathname of object 'obj' by walking zp_parent links
 * toward the root, assembling "/component" pieces into 'buf' from the
 * end backwards.  On success the finished path is moved to the front
 * of 'buf'.
 *
 *	IN:	osp	- objset to search.
 *		obj	- object number whose path is wanted.
 *		buf/len	- caller-supplied result buffer and its size.
 *
 *	RETURN:	0 on success, error code on failure.
 */
int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	char *path = buf + len - 1;	/* build the string right-to-left */
	int error;

	*path = '\0';

	for (;;) {
		uint64_t pobj;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir;

		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
		    &is_xattrdir)) != 0)
			break;

		/* An object that is its own parent is the root directory. */
		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			/* xattr dirs have no directory entry to look up. */
			(void) sprintf(component + 1, "<xattrdir>");
		} else {
			/* Find this object's name in its parent directory. */
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		/* Prepend the component; the buffer must not underflow. */
		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);
	return (error);
}
1663