xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  */
27 
28 /* Portions Copyright 2007 Jeremy Teo */
29 /* Portions Copyright 2010 Robert Milkowski */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/vfs.h>
38 #include <sys/vm.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/taskq.h>
44 #include <sys/uio.h>
45 #include <sys/atomic.h>
46 #include <sys/namei.h>
47 #include <sys/mman.h>
48 #include <sys/cmn_err.h>
49 #include <sys/errno.h>
50 #include <sys/unistd.h>
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/spa.h>
57 #include <sys/txg.h>
58 #include <sys/dbuf.h>
59 #include <sys/zap.h>
60 #include <sys/sa.h>
61 #include <sys/dirent.h>
62 #include <sys/policy.h>
63 #include <sys/sunddi.h>
64 #include <sys/filio.h>
65 #include <sys/sid.h>
66 #include <sys/zfs_ctldir.h>
67 #include <sys/zfs_fuid.h>
68 #include <sys/zfs_sa.h>
69 #include <sys/dnlc.h>
70 #include <sys/zfs_rlock.h>
71 #include <sys/buf.h>
72 #include <sys/sched.h>
73 #include <sys/acl.h>
74 #include <sys/extdirent.h>
75 
76 #ifdef __FreeBSD__
77 #include <sys/kidmap.h>
78 #include <sys/bio.h>
79 #include <vm/vm_param.h>
80 #endif
81 
82 #ifdef __NetBSD__
83 #include <dev/mm.h>
84 #include <miscfs/fifofs/fifo.h>
85 #include <miscfs/genfs/genfs.h>
86 #include <miscfs/genfs/genfs_node.h>
87 #include <uvm/uvm_extern.h>
88 #include <sys/fstrans.h>
89 #include <sys/malloc.h>
90 
91 uint_t zfs_putpage_key;
92 #endif
93 
94 /*
95  * Programming rules.
96  *
97  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
98  * properly lock its in-core state, create a DMU transaction, do the work,
99  * record this work in the intent log (ZIL), commit the DMU transaction,
100  * and wait for the intent log to commit if it is a synchronous operation.
101  * Moreover, the vnode ops must work in both normal and log replay context.
102  * The ordering of events is important to avoid deadlocks and references
103  * to freed memory.  The example below illustrates the following Big Rules:
104  *
105  *  (1)	A check must be made in each zfs thread for a mounted file system.
106  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
107  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
108  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
109  *	can return EIO from the calling function.
110  *
111  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
112  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
113  *	First, if it's the last reference, the vnode/znode
114  *	can be freed, so the zp may point to freed memory.  Second, the last
115  *	reference will call zfs_zinactive(), which may induce a lot of work --
116  *	pushing cached pages (which acquires range locks) and syncing out
117  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
118  *	which could deadlock the system if you were already holding one.
119  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
120  *
121  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
122  *	as they can span dmu_tx_assign() calls.
123  *
124  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
125  *      dmu_tx_assign().  This is critical because we don't want to block
126  *      while holding locks.
127  *
128  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
129  *	reduces lock contention and CPU usage when we must wait (note that if
130  *	throughput is constrained by the storage, nearly every transaction
131  *	must wait).
132  *
133  *      Note, in particular, that if a lock is sometimes acquired before
134  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
135  *      to use a non-blocking assign can deadlock the system.  The scenario:
136  *
137  *	Thread A has grabbed a lock before calling dmu_tx_assign().
138  *	Thread B is in an already-assigned tx, and blocks for this lock.
139  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
140  *	forever, because the previous txg can't quiesce until B's tx commits.
141  *
142  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
143  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
144  *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
145  *	to indicate that this operation has already called dmu_tx_wait().
146  *	This will ensure that we don't retry forever, waiting a short bit
147  *	each time.
148  *
149  *  (5)	If the operation succeeded, generate the intent log entry for it
150  *	before dropping locks.  This ensures that the ordering of events
151  *	in the intent log matches the order in which they actually occurred.
152  *	During ZIL replay the zfs_log_* functions will update the sequence
153  *	number to indicate the zil transaction has replayed.
154  *
155  *  (6)	At the end of each vnode op, the DMU tx must always commit,
156  *	regardless of whether there were any errors.
157  *
158  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
159  *	to ensure that synchronous semantics are provided when necessary.
160  *
161  * In general, this is how things should be ordered in each vnode op:
162  *
163  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
164  * top:
165  *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
166  *	rw_enter(...);			// grab any other locks you need
167  *	tx = dmu_tx_create(...);	// get DMU tx
168  *	dmu_tx_hold_*();		// hold each object you might modify
169  *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
170  *	if (error) {
171  *		rw_exit(...);		// drop locks
172  *		zfs_dirent_unlock(dl);	// unlock directory entry
173  *		VN_RELE(...);		// release held vnodes
174  *		if (error == ERESTART) {
175  *			waited = B_TRUE;
176  *			dmu_tx_wait(tx);
177  *			dmu_tx_abort(tx);
178  *			goto top;
179  *		}
180  *		dmu_tx_abort(tx);	// abort DMU tx
181  *		ZFS_EXIT(zfsvfs);	// finished in zfs
182  *		return (error);		// really out of space
183  *	}
184  *	error = do_real_work();		// do whatever this VOP does
185  *	if (error == 0)
186  *		zfs_log_*(...);		// on success, make ZIL entry
187  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
188  *	rw_exit(...);			// drop locks
189  *	zfs_dirent_unlock(dl);		// unlock directory entry
190  *	VN_RELE(...);			// release held vnodes
191  *	zil_commit(zilog, foid);	// synchronous when necessary
192  *	ZFS_EXIT(zfsvfs);		// finished in zfs
193  *	return (error);			// done, report error
194  */
195 
196 /* ARGSUSED */
197 static int
198 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
199 {
200 	znode_t	*zp = VTOZ(*vpp);
201 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
202 
203 	ZFS_ENTER(zfsvfs);
204 	ZFS_VERIFY_ZP(zp);
205 
206 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
207 	    ((flag & FAPPEND) == 0)) {
208 		ZFS_EXIT(zfsvfs);
209 		return (SET_ERROR(EPERM));
210 	}
211 
212 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
213 	    ZTOV(zp)->v_type == VREG &&
214 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
215 		if (fs_vscan(*vpp, cr, 0) != 0) {
216 			ZFS_EXIT(zfsvfs);
217 			return (SET_ERROR(EACCES));
218 		}
219 	}
220 
221 	/* Keep a count of the synchronous opens in the znode */
222 	if (flag & (FSYNC | FDSYNC))
223 		atomic_inc_32(&zp->z_sync_cnt);
224 
225 	ZFS_EXIT(zfsvfs);
226 	return (0);
227 }
228 
229 /* ARGSUSED */
230 static int
231 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
232     caller_context_t *ct)
233 {
234 	znode_t	*zp = VTOZ(vp);
235 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
236 
237 	/*
238 	 * Clean up any locks held by this process on the vp.
239 	 */
240 	cleanlocks(vp, ddi_get_pid(), 0);
241 	cleanshares(vp, ddi_get_pid());
242 
243 	ZFS_ENTER(zfsvfs);
244 	ZFS_VERIFY_ZP(zp);
245 
246 	/* Decrement the synchronous opens in the znode */
247 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
248 		atomic_dec_32(&zp->z_sync_cnt);
249 
250 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
251 	    ZTOV(zp)->v_type == VREG &&
252 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
253 		VERIFY(fs_vscan(vp, cr, 1) == 0);
254 
255 	ZFS_EXIT(zfsvfs);
256 	return (0);
257 }
258 
259 /*
260  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
261  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
262  */
263 static int
264 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
265 {
266 	znode_t	*zp = VTOZ(vp);
267 	uint64_t noff = (uint64_t)*off; /* new offset */
268 	uint64_t file_sz;
269 	int error;
270 	boolean_t hole;
271 
272 	file_sz = zp->z_size;
273 	if (noff >= file_sz)  {
274 		return (SET_ERROR(ENXIO));
275 	}
276 
277 	if (cmd == _FIO_SEEK_HOLE)
278 		hole = B_TRUE;
279 	else
280 		hole = B_FALSE;
281 
282 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
283 
284 	if (error == ESRCH)
285 		return (SET_ERROR(ENXIO));
286 
287 	/*
288 	 * We could find a hole that begins after the logical end-of-file,
289 	 * because dmu_offset_next() only works on whole blocks.  If the
290 	 * EOF falls mid-block, then indicate that the "virtual hole"
291 	 * at the end of the file begins at the logical EOF, rather than
292 	 * at the end of the last block.
293 	 */
294 	if (noff > file_sz) {
295 		ASSERT(hole);
296 		noff = file_sz;
297 	}
298 
299 	if (noff < *off)
300 		return (error);
301 	*off = noff;
302 	return (error);
303 }
304 
305 /* ARGSUSED */
306 static int
307 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
308     int *rvalp, caller_context_t *ct)
309 {
310 	offset_t off;
311 	offset_t ndata;
312 	dmu_object_info_t doi;
313 	int error;
314 	zfsvfs_t *zfsvfs;
315 	znode_t *zp;
316 
317 	switch (com) {
318 	case _FIOFFS:
319 	{
320 		return (0);
321 
322 		/*
323 		 * The following two ioctls are used by bfu.  Faking out,
324 		 * necessary to avoid bfu errors.
325 		 */
326 	}
327 	case _FIOGDIO:
328 	case _FIOSDIO:
329 	{
330 		return (0);
331 	}
332 
333 	case _FIO_SEEK_DATA:
334 	case _FIO_SEEK_HOLE:
335 	{
336 #ifdef illumos
337 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
338 			return (SET_ERROR(EFAULT));
339 #else
340 		off = *(offset_t *)data;
341 #endif
342 		zp = VTOZ(vp);
343 		zfsvfs = zp->z_zfsvfs;
344 		ZFS_ENTER(zfsvfs);
345 		ZFS_VERIFY_ZP(zp);
346 
347 		/* offset parameter is in/out */
348 		error = zfs_holey(vp, com, &off);
349 		ZFS_EXIT(zfsvfs);
350 		if (error)
351 			return (error);
352 #ifdef illumos
353 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
354 			return (SET_ERROR(EFAULT));
355 #else
356 		*(offset_t *)data = off;
357 #endif
358 		return (0);
359 	}
360 #ifdef illumos
361 	case _FIO_COUNT_FILLED:
362 	{
363 		/*
364 		 * _FIO_COUNT_FILLED adds a new ioctl command which
365 		 * exposes the number of filled blocks in a
366 		 * ZFS object.
367 		 */
368 		zp = VTOZ(vp);
369 		zfsvfs = zp->z_zfsvfs;
370 		ZFS_ENTER(zfsvfs);
371 		ZFS_VERIFY_ZP(zp);
372 
373 		/*
374 		 * Wait for all dirty blocks for this object
375 		 * to get synced out to disk, and the DMU info
376 		 * updated.
377 		 */
378 		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
379 		if (error) {
380 			ZFS_EXIT(zfsvfs);
381 			return (error);
382 		}
383 
384 		/*
385 		 * Retrieve fill count from DMU object.
386 		 */
387 		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
388 		if (error) {
389 			ZFS_EXIT(zfsvfs);
390 			return (error);
391 		}
392 
393 		ndata = doi.doi_fill_count;
394 
395 		ZFS_EXIT(zfsvfs);
396 		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
397 			return (SET_ERROR(EFAULT));
398 		return (0);
399 	}
400 #endif
401 	}
402 	return (SET_ERROR(ENOTTY));
403 }
404 
405 #ifdef __FreeBSD__
406 static vm_page_t
407 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
408 {
409 	vm_object_t obj;
410 	vm_page_t pp;
411 	int64_t end;
412 
413 	/*
414 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
415 	 * aligned boundaries, if the range is not aligned.  As a result a
416 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
417 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
418 	 * the whole page would be considred clean despite have some dirty data.
419 	 * For this reason we should shrink the range to DEV_BSIZE aligned
420 	 * boundaries before calling vm_page_clear_dirty.
421 	 */
422 	end = rounddown2(off + nbytes, DEV_BSIZE);
423 	off = roundup2(off, DEV_BSIZE);
424 	nbytes = end - off;
425 
426 	obj = vp->v_object;
427 	zfs_vmobject_assert_wlocked(obj);
428 
429 	for (;;) {
430 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
431 		    pp->valid) {
432 			if (vm_page_xbusied(pp)) {
433 				/*
434 				 * Reference the page before unlocking and
435 				 * sleeping so that the page daemon is less
436 				 * likely to reclaim it.
437 				 */
438 				vm_page_reference(pp);
439 				vm_page_lock(pp);
440 				zfs_vmobject_wunlock(obj);
441 				vm_page_busy_sleep(pp, "zfsmwb", true);
442 				zfs_vmobject_wlock(obj);
443 				continue;
444 			}
445 			vm_page_sbusy(pp);
446 		} else if (pp != NULL) {
447 			ASSERT(!pp->valid);
448 			pp = NULL;
449 		}
450 
451 		if (pp != NULL) {
452 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
453 			vm_object_pip_add(obj, 1);
454 			pmap_remove_write(pp);
455 			if (nbytes != 0)
456 				vm_page_clear_dirty(pp, off, nbytes);
457 		}
458 		break;
459 	}
460 	return (pp);
461 }
462 
463 static void
464 page_unbusy(vm_page_t pp)
465 {
466 
467 	vm_page_sunbusy(pp);
468 	vm_object_pip_subtract(pp->object, 1);
469 }
470 
471 static vm_page_t
472 page_hold(vnode_t *vp, int64_t start)
473 {
474 	vm_object_t obj;
475 	vm_page_t pp;
476 
477 	obj = vp->v_object;
478 	zfs_vmobject_assert_wlocked(obj);
479 
480 	for (;;) {
481 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
482 		    pp->valid) {
483 			if (vm_page_xbusied(pp)) {
484 				/*
485 				 * Reference the page before unlocking and
486 				 * sleeping so that the page daemon is less
487 				 * likely to reclaim it.
488 				 */
489 				vm_page_reference(pp);
490 				vm_page_lock(pp);
491 				zfs_vmobject_wunlock(obj);
492 				vm_page_busy_sleep(pp, "zfsmwb", true);
493 				zfs_vmobject_wlock(obj);
494 				continue;
495 			}
496 
497 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
498 			vm_page_lock(pp);
499 			vm_page_hold(pp);
500 			vm_page_unlock(pp);
501 
502 		} else
503 			pp = NULL;
504 		break;
505 	}
506 	return (pp);
507 }
508 
509 static void
510 page_unhold(vm_page_t pp)
511 {
512 
513 	vm_page_lock(pp);
514 	vm_page_unhold(pp);
515 	vm_page_unlock(pp);
516 }
517 
518 /*
519  * When a file is memory mapped, we must keep the IO data synchronized
520  * between the DMU cache and the memory mapped pages.  What this means:
521  *
522  * On Write:	If we find a memory mapped page, we write to *both*
523  *		the page and the dmu buffer.
524  */
525 static void
526 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
527     int segflg, dmu_tx_t *tx)
528 {
529 	vm_object_t obj;
530 	struct sf_buf *sf;
531 	caddr_t va;
532 	int off;
533 
534 	ASSERT(segflg != UIO_NOCOPY);
535 	ASSERT(vp->v_mount != NULL);
536 	obj = vp->v_object;
537 	ASSERT(obj != NULL);
538 
539 	off = start & PAGEOFFSET;
540 	zfs_vmobject_wlock(obj);
541 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
542 		vm_page_t pp;
543 		int nbytes = imin(PAGESIZE - off, len);
544 
545 		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
546 			zfs_vmobject_wunlock(obj);
547 
548 			va = zfs_map_page(pp, &sf);
549 			(void) dmu_read(os, oid, start+off, nbytes,
550 			    va+off, DMU_READ_PREFETCH);;
551 			zfs_unmap_page(sf);
552 
553 			zfs_vmobject_wlock(obj);
554 			page_unbusy(pp);
555 		}
556 		len -= nbytes;
557 		off = 0;
558 	}
559 	vm_object_pip_wakeupn(obj, 0);
560 	zfs_vmobject_wunlock(obj);
561 }
562 
563 /*
564  * Read with UIO_NOCOPY flag means that sendfile(2) requests
565  * ZFS to populate a range of page cache pages with data.
566  *
567  * NOTE: this function could be optimized to pre-allocate
568  * all pages in advance, drain exclusive busy on all of them,
569  * map them into contiguous KVA region and populate them
570  * in one single dmu_read() call.
571  */
572 static int
573 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
574 {
575 	znode_t *zp = VTOZ(vp);
576 	objset_t *os = zp->z_zfsvfs->z_os;
577 	struct sf_buf *sf;
578 	vm_object_t obj;
579 	vm_page_t pp;
580 	int64_t start;
581 	caddr_t va;
582 	int len = nbytes;
583 	int off;
584 	int error = 0;
585 
586 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
587 	ASSERT(vp->v_mount != NULL);
588 	obj = vp->v_object;
589 	ASSERT(obj != NULL);
590 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
591 
592 	zfs_vmobject_wlock(obj);
593 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
594 		int bytes = MIN(PAGESIZE, len);
595 
596 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
597 		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
598 		if (pp->valid == 0) {
599 			zfs_vmobject_wunlock(obj);
600 			va = zfs_map_page(pp, &sf);
601 			error = dmu_read(os, zp->z_id, start, bytes, va,
602 			    DMU_READ_PREFETCH);
603 			if (bytes != PAGESIZE && error == 0)
604 				bzero(va + bytes, PAGESIZE - bytes);
605 			zfs_unmap_page(sf);
606 			zfs_vmobject_wlock(obj);
607 			vm_page_sunbusy(pp);
608 			vm_page_lock(pp);
609 			if (error) {
610 				if (pp->wire_count == 0 && pp->valid == 0 &&
611 				    !vm_page_busied(pp))
612 					vm_page_free(pp);
613 			} else {
614 				pp->valid = VM_PAGE_BITS_ALL;
615 				vm_page_activate(pp);
616 			}
617 			vm_page_unlock(pp);
618 		} else {
619 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
620 			vm_page_sunbusy(pp);
621 		}
622 		if (error)
623 			break;
624 		uio->uio_resid -= bytes;
625 		uio->uio_offset += bytes;
626 		len -= bytes;
627 	}
628 	zfs_vmobject_wunlock(obj);
629 	return (error);
630 }
631 
632 /*
633  * When a file is memory mapped, we must keep the IO data synchronized
634  * between the DMU cache and the memory mapped pages.  What this means:
635  *
636  * On Read:	We "read" preferentially from memory mapped pages,
637  *		else we default from the dmu buffer.
638  *
639  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
640  *	 the file is memory mapped.
641  */
642 static int
643 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
644 {
645 	znode_t *zp = VTOZ(vp);
646 	vm_object_t obj;
647 	int64_t start;
648 	caddr_t va;
649 	int len = nbytes;
650 	int off;
651 	int error = 0;
652 
653 	ASSERT(vp->v_mount != NULL);
654 	obj = vp->v_object;
655 	ASSERT(obj != NULL);
656 
657 	start = uio->uio_loffset;
658 	off = start & PAGEOFFSET;
659 	zfs_vmobject_wlock(obj);
660 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
661 		vm_page_t pp;
662 		uint64_t bytes = MIN(PAGESIZE - off, len);
663 
664 		if (pp = page_hold(vp, start)) {
665 			struct sf_buf *sf;
666 			caddr_t va;
667 
668 			zfs_vmobject_wunlock(obj);
669 			va = zfs_map_page(pp, &sf);
670 #ifdef illumos
671 			error = uiomove(va + off, bytes, UIO_READ, uio);
672 #else
673 			error = vn_io_fault_uiomove(va + off, bytes, uio);
674 #endif
675 			zfs_unmap_page(sf);
676 			zfs_vmobject_wlock(obj);
677 			page_unhold(pp);
678 		} else {
679 			zfs_vmobject_wunlock(obj);
680 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
681 			    uio, bytes);
682 			zfs_vmobject_wlock(obj);
683 		}
684 		len -= bytes;
685 		off = 0;
686 		if (error)
687 			break;
688 	}
689 	zfs_vmobject_wunlock(obj);
690 	return (error);
691 }
692 #endif /* __FreeBSD__ */
693 
694 #ifdef __NetBSD__
695 
696 caddr_t
697 zfs_map_page(page_t *pp, enum seg_rw rw)
698 {
699 	vaddr_t va;
700 	int flags;
701 
702 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
703 	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
704 		return (caddr_t)va;
705 #endif
706 
707 	flags = UVMPAGER_MAPIN_WAITOK |
708 		(rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ);
709 	va = uvm_pagermapin(&pp, 1, flags);
710 	return (caddr_t)va;
711 }
712 
713 void
714 zfs_unmap_page(page_t *pp, caddr_t addr)
715 {
716 
717 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
718 	vaddr_t va;
719 
720 	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
721 		return;
722 #endif
723 	uvm_pagermapout((vaddr_t)addr, 1);
724 }
725 
726 static int
727 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
728 {
729 	znode_t *zp = VTOZ(vp);
730 	struct uvm_object *uobj = &vp->v_uobj;
731 	kmutex_t *mtx = uobj->vmobjlock;
732 	int64_t start;
733 	caddr_t va;
734 	size_t len = nbytes;
735 	int off;
736 	int error = 0;
737 	int npages, found;
738 
739 	start = uio->uio_loffset;
740 	off = start & PAGEOFFSET;
741 
742 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
743 		page_t *pp;
744 		uint64_t bytes = MIN(PAGESIZE - off, len);
745 
746 		pp = NULL;
747 		npages = 1;
748 		mutex_enter(mtx);
749 		found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC);
750 		mutex_exit(mtx);
751 
752 		/* XXXNETBSD shouldn't access userspace with the page busy */
753 		if (found) {
754 			va = zfs_map_page(pp, S_READ);
755 			error = uiomove(va + off, bytes, UIO_READ, uio);
756 			zfs_unmap_page(pp, va);
757 		} else {
758 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
759 			    uio, bytes);
760 		}
761 
762 		mutex_enter(mtx);
763 		uvm_page_unbusy(&pp, 1);
764 		mutex_exit(mtx);
765 
766 		len -= bytes;
767 		off = 0;
768 		if (error)
769 			break;
770 	}
771 	return (error);
772 }
773 
774 static void
775 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
776     int segflg, dmu_tx_t *tx)
777 {
778 	struct uvm_object *uobj = &vp->v_uobj;
779 	kmutex_t *mtx = uobj->vmobjlock;
780 	caddr_t va;
781 	int off;
782 
783 	ASSERT(vp->v_mount != NULL);
784 
785 	mutex_enter(mtx);
786 
787 	off = start & PAGEOFFSET;
788 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
789 		page_t *pp;
790 		int nbytes = MIN(PAGESIZE - off, len);
791 		int npages, found;
792 
793 		pp = NULL;
794 		npages = 1;
795 		found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC);
796 		if (found) {
797 			mutex_exit(mtx);
798 
799 			va = zfs_map_page(pp, S_WRITE);
800 			(void) dmu_read(os, oid, start + off, nbytes,
801 			    va + off, DMU_READ_PREFETCH);
802 			zfs_unmap_page(pp, va);
803 
804 			mutex_enter(mtx);
805 			uvm_page_unbusy(&pp, 1);
806 		}
807 		len -= nbytes;
808 		off = 0;
809 	}
810 	mutex_exit(mtx);
811 }
812 #endif /* __NetBSD__ */
813 
814 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
815 
816 /*
817  * Read bytes from specified file into supplied buffer.
818  *
819  *	IN:	vp	- vnode of file to be read from.
820  *		uio	- structure supplying read location, range info,
821  *			  and return buffer.
822  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
823  *		cr	- credentials of caller.
824  *		ct	- caller context
825  *
826  *	OUT:	uio	- updated offset and range, buffer filled.
827  *
828  *	RETURN:	0 on success, error code on failure.
829  *
830  * Side Effects:
831  *	vp - atime updated if byte count > 0
832  */
833 /* ARGSUSED */
834 static int
835 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
836 {
837 	znode_t		*zp = VTOZ(vp);
838 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
839 	ssize_t		n, nbytes;
840 	int		error = 0;
841 	rl_t		*rl;
842 	xuio_t		*xuio = NULL;
843 
844 	ZFS_ENTER(zfsvfs);
845 	ZFS_VERIFY_ZP(zp);
846 
847 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
848 		ZFS_EXIT(zfsvfs);
849 		return (SET_ERROR(EACCES));
850 	}
851 
852 	/*
853 	 * Validate file offset
854 	 */
855 	if (uio->uio_loffset < (offset_t)0) {
856 		ZFS_EXIT(zfsvfs);
857 		return (SET_ERROR(EINVAL));
858 	}
859 
860 	/*
861 	 * Fasttrack empty reads
862 	 */
863 	if (uio->uio_resid == 0) {
864 		ZFS_EXIT(zfsvfs);
865 		return (0);
866 	}
867 
868 	/*
869 	 * Check for mandatory locks
870 	 */
871 	if (MANDMODE(zp->z_mode)) {
872 		if (error = chklock(vp, FREAD,
873 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
874 			ZFS_EXIT(zfsvfs);
875 			return (error);
876 		}
877 	}
878 
879 	/*
880 	 * If we're in FRSYNC mode, sync out this znode before reading it.
881 	 */
882 	if (zfsvfs->z_log &&
883 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
884 		zil_commit(zfsvfs->z_log, zp->z_id);
885 
886 	/*
887 	 * Lock the range against changes.
888 	 */
889 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
890 
891 	/*
892 	 * If we are reading past end-of-file we can skip
893 	 * to the end; but we might still need to set atime.
894 	 */
895 	if (uio->uio_loffset >= zp->z_size) {
896 		error = 0;
897 		goto out;
898 	}
899 
900 	ASSERT(uio->uio_loffset < zp->z_size);
901 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
902 
903 #ifdef illumos
904 	if ((uio->uio_extflg == UIO_XUIO) &&
905 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
906 		int nblk;
907 		int blksz = zp->z_blksz;
908 		uint64_t offset = uio->uio_loffset;
909 
910 		xuio = (xuio_t *)uio;
911 		if ((ISP2(blksz))) {
912 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
913 			    blksz)) / blksz;
914 		} else {
915 			ASSERT(offset + n <= blksz);
916 			nblk = 1;
917 		}
918 		(void) dmu_xuio_init(xuio, nblk);
919 
920 		if (vn_has_cached_data(vp)) {
921 			/*
922 			 * For simplicity, we always allocate a full buffer
923 			 * even if we only expect to read a portion of a block.
924 			 */
925 			while (--nblk >= 0) {
926 				(void) dmu_xuio_add(xuio,
927 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
928 				    blksz), 0, blksz);
929 			}
930 		}
931 	}
932 #endif	/* illumos */
933 
934 	while (n > 0) {
935 		nbytes = MIN(n, zfs_read_chunk_size -
936 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
937 
938 #ifdef __FreeBSD__
939 		if (uio->uio_segflg == UIO_NOCOPY)
940 			error = mappedread_sf(vp, nbytes, uio);
941 		else
942 #endif /* __FreeBSD__ */
943 		if (vn_has_cached_data(vp)) {
944 			error = mappedread(vp, nbytes, uio);
945 		} else {
946 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
947 			    uio, nbytes);
948 		}
949 		if (error) {
950 			/* convert checksum errors into IO errors */
951 			if (error == ECKSUM)
952 				error = SET_ERROR(EIO);
953 			break;
954 		}
955 
956 		n -= nbytes;
957 	}
958 out:
959 	zfs_range_unlock(rl);
960 
961 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
962 	ZFS_EXIT(zfsvfs);
963 	return (error);
964 }
965 
966 /*
967  * Write the bytes to a file.
968  *
969  *	IN:	vp	- vnode of file to be written to.
970  *		uio	- structure supplying write location, range info,
971  *			  and data buffer.
972  *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
973  *			  set if in append mode.
974  *		cr	- credentials of caller.
975  *		ct	- caller context (NFS/CIFS fem monitor only)
976  *
977  *	OUT:	uio	- updated offset and range.
978  *
979  *	RETURN:	0 on success, error code on failure.
980  *
981  * Timestamps:
982  *	vp - ctime|mtime updated if byte count > 0
983  */
984 
985 /* ARGSUSED */
986 static int
987 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
988 {
989 	znode_t		*zp = VTOZ(vp);
990 	rlim64_t	limit = MAXOFFSET_T;
991 	ssize_t		start_resid = uio->uio_resid;
992 	ssize_t		tx_bytes;
993 	uint64_t	end_size;
994 	dmu_tx_t	*tx;
995 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
996 	zilog_t		*zilog;
997 	offset_t	woff;
998 	ssize_t		n, nbytes;
999 	rl_t		*rl;
1000 	int		max_blksz = zfsvfs->z_max_blksz;
1001 	int		error = 0;
1002 	arc_buf_t	*abuf;
1003 	iovec_t		*aiov = NULL;
1004 	xuio_t		*xuio = NULL;
1005 	int		i_iov = 0;
1006 	int		iovcnt = uio->uio_iovcnt;
1007 	iovec_t		*iovp = uio->uio_iov;
1008 	int		write_eof;
1009 	int		count = 0;
1010 	sa_bulk_attr_t	bulk[4];
1011 	uint64_t	mtime[2], ctime[2];
1012 	int		segflg;
1013 
1014 #ifdef __NetBSD__
1015 	segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ?
1016 		UIO_SYSSPACE : UIO_USERSPACE;
1017 #else
1018 	segflg = uio->uio_segflg;
1019 #endif
1020 
1021 	/*
1022 	 * Fasttrack empty write
1023 	 */
1024 	n = start_resid;
1025 	if (n == 0)
1026 		return (0);
1027 
1028 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
1029 		limit = MAXOFFSET_T;
1030 
1031 	ZFS_ENTER(zfsvfs);
1032 	ZFS_VERIFY_ZP(zp);
1033 
1034 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1035 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1036 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1037 	    &zp->z_size, 8);
1038 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1039 	    &zp->z_pflags, 8);
1040 
1041 	/*
1042 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
1043 	 * callers might not be able to detect properly that we are read-only,
1044 	 * so check it explicitly here.
1045 	 */
1046 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
1047 		ZFS_EXIT(zfsvfs);
1048 		return (SET_ERROR(EROFS));
1049 	}
1050 
1051 	/*
1052 	 * If immutable or not appending then return EPERM
1053 	 */
1054 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
1055 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
1056 	    (uio->uio_loffset < zp->z_size))) {
1057 		ZFS_EXIT(zfsvfs);
1058 		return (SET_ERROR(EPERM));
1059 	}
1060 
1061 	zilog = zfsvfs->z_log;
1062 
1063 	/*
1064 	 * Validate file offset
1065 	 */
1066 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
1067 	if (woff < 0) {
1068 		ZFS_EXIT(zfsvfs);
1069 		return (SET_ERROR(EINVAL));
1070 	}
1071 
1072 	/*
1073 	 * Check for mandatory locks before calling zfs_range_lock()
1074 	 * in order to prevent a deadlock with locks set via fcntl().
1075 	 */
1076 	if (MANDMODE((mode_t)zp->z_mode) &&
1077 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
1078 		ZFS_EXIT(zfsvfs);
1079 		return (error);
1080 	}
1081 
1082 #ifdef illumos
1083 	/*
1084 	 * Pre-fault the pages to ensure slow (eg NFS) pages
1085 	 * don't hold up txg.
1086 	 * Skip this if uio contains loaned arc_buf.
1087 	 */
1088 	if ((uio->uio_extflg == UIO_XUIO) &&
1089 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
1090 		xuio = (xuio_t *)uio;
1091 	else
1092 		uio_prefaultpages(MIN(n, max_blksz), uio);
1093 #endif
1094 
1095 	/*
1096 	 * If in append mode, set the io offset pointer to eof.
1097 	 */
1098 	if (ioflag & FAPPEND) {
1099 		/*
1100 		 * Obtain an appending range lock to guarantee file append
1101 		 * semantics.  We reset the write offset once we have the lock.
1102 		 */
1103 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
1104 		woff = rl->r_off;
1105 		if (rl->r_len == UINT64_MAX) {
1106 			/*
1107 			 * We overlocked the file because this write will cause
1108 			 * the file block size to increase.
1109 			 * Note that zp_size cannot change with this lock held.
1110 			 */
1111 			woff = zp->z_size;
1112 		}
1113 		uio->uio_loffset = woff;
1114 	} else {
1115 		/*
1116 		 * Note that if the file block size will change as a result of
1117 		 * this write, then this range lock will lock the entire file
1118 		 * so that we can re-write the block safely.
1119 		 */
1120 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
1121 	}
1122 
1123 #ifdef illumos
1124 	if (woff >= limit) {
1125 		zfs_range_unlock(rl);
1126 		ZFS_EXIT(zfsvfs);
1127 		return (SET_ERROR(EFBIG));
1128 	}
1129 
1130 #endif
1131 #ifdef __FreeBSD__
1132 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
1133 		zfs_range_unlock(rl);
1134 		ZFS_EXIT(zfsvfs);
1135 		return (SET_ERROR(EFBIG));
1136 	}
1137 #endif
1138 #ifdef __NetBSD__
1139 	/* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */
1140 #endif
1141 
1142 	if ((woff + n) > limit || woff > (limit - n))
1143 		n = limit - woff;
1144 
1145 	/* Will this write extend the file length? */
1146 	write_eof = (woff + n > zp->z_size);
1147 
1148 	end_size = MAX(zp->z_size, woff + n);
1149 
1150 	/*
1151 	 * Write the file in reasonable size chunks.  Each chunk is written
1152 	 * in a separate transaction; this keeps the intent log records small
1153 	 * and allows us to do more fine-grained space accounting.
1154 	 */
1155 	while (n > 0) {
1156 		abuf = NULL;
1157 		woff = uio->uio_loffset;
1158 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1159 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1160 			if (abuf != NULL)
1161 				dmu_return_arcbuf(abuf);
1162 			error = SET_ERROR(EDQUOT);
1163 			break;
1164 		}
1165 
1166 		if (xuio && abuf == NULL) {
1167 			ASSERT(i_iov < iovcnt);
1168 			aiov = &iovp[i_iov];
1169 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1170 			dmu_xuio_clear(xuio, i_iov);
1171 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1172 			    iovec_t *, aiov, arc_buf_t *, abuf);
1173 			ASSERT((aiov->iov_base == abuf->b_data) ||
1174 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1175 			    aiov->iov_len == arc_buf_size(abuf)));
1176 			i_iov++;
1177 		} else if (abuf == NULL && n >= max_blksz &&
1178 		    woff >= zp->z_size &&
1179 		    P2PHASE(woff, max_blksz) == 0 &&
1180 		    zp->z_blksz == max_blksz) {
1181 			/*
1182 			 * This write covers a full block.  "Borrow" a buffer
1183 			 * from the dmu so that we can fill it before we enter
1184 			 * a transaction.  This avoids the possibility of
1185 			 * holding up the transaction if the data copy hangs
1186 			 * up on a pagefault (e.g., from an NFS server mapping).
1187 			 */
1188 #if defined(illumos) || defined(__NetBSD__)
1189 			size_t cbytes;
1190 #endif
1191 
1192 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1193 			    max_blksz);
1194 			ASSERT(abuf != NULL);
1195 			ASSERT(arc_buf_size(abuf) == max_blksz);
1196 #if defined(illumos) || defined(__NetBSD__)
1197 			if (error = uiocopy(abuf->b_data, max_blksz,
1198 			    UIO_WRITE, uio, &cbytes)) {
1199 				dmu_return_arcbuf(abuf);
1200 				break;
1201 			}
1202 			ASSERT(cbytes == max_blksz);
1203 #endif
1204 #ifdef __FreeBSD__
1205 			ssize_t resid = uio->uio_resid;
1206 
1207 			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1208 			if (error != 0) {
1209 				uio->uio_offset -= resid - uio->uio_resid;
1210 				uio->uio_resid = resid;
1211 				dmu_return_arcbuf(abuf);
1212 				break;
1213 			}
1214 #endif
1215 		}
1216 
1217 		/*
1218 		 * Start a transaction.
1219 		 */
1220 		tx = dmu_tx_create(zfsvfs->z_os);
1221 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1222 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1223 		zfs_sa_upgrade_txholds(tx, zp);
1224 		error = dmu_tx_assign(tx, TXG_WAIT);
1225 		if (error) {
1226 			dmu_tx_abort(tx);
1227 			if (abuf != NULL)
1228 				dmu_return_arcbuf(abuf);
1229 			break;
1230 		}
1231 
1232 		/*
1233 		 * If zfs_range_lock() over-locked we grow the blocksize
1234 		 * and then reduce the lock range.  This will only happen
1235 		 * on the first iteration since zfs_range_reduce() will
1236 		 * shrink down r_len to the appropriate size.
1237 		 */
1238 		if (rl->r_len == UINT64_MAX) {
1239 			uint64_t new_blksz;
1240 
1241 			if (zp->z_blksz > max_blksz) {
1242 				/*
1243 				 * File's blocksize is already larger than the
1244 				 * "recordsize" property.  Only let it grow to
1245 				 * the next power of 2.
1246 				 */
1247 				ASSERT(!ISP2(zp->z_blksz));
1248 				new_blksz = MIN(end_size,
1249 				    1 << highbit64(zp->z_blksz));
1250 			} else {
1251 				new_blksz = MIN(end_size, max_blksz);
1252 			}
1253 			zfs_grow_blocksize(zp, new_blksz, tx);
1254 			zfs_range_reduce(rl, woff, n);
1255 		}
1256 
1257 		/*
1258 		 * XXX - should we really limit each write to z_max_blksz?
1259 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1260 		 */
1261 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1262 
1263 		if (woff + nbytes > zp->z_size)
1264 			vnode_pager_setsize(vp, woff + nbytes);
1265 
1266 		if (abuf == NULL) {
1267 			tx_bytes = uio->uio_resid;
1268 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1269 			    uio, nbytes, tx);
1270 			tx_bytes -= uio->uio_resid;
1271 		} else {
1272 			tx_bytes = nbytes;
1273 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1274 			/*
1275 			 * If this is not a full block write, but we are
1276 			 * extending the file past EOF and this data starts
1277 			 * block-aligned, use assign_arcbuf().  Otherwise,
1278 			 * write via dmu_write().
1279 			 */
1280 			if (tx_bytes < max_blksz && (!write_eof ||
1281 			    aiov->iov_base != abuf->b_data)) {
1282 				ASSERT(xuio);
1283 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1284 				    aiov->iov_len, aiov->iov_base, tx);
1285 				dmu_return_arcbuf(abuf);
1286 				xuio_stat_wbuf_copied();
1287 			} else {
1288 				ASSERT(xuio || tx_bytes == max_blksz);
1289 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1290 				    woff, abuf, tx);
1291 			}
1292 #if defined(illumos) || defined(__NetBSD__)
1293 			ASSERT(tx_bytes <= uio->uio_resid);
1294 			uioskip(uio, tx_bytes);
1295 #endif
1296 		}
1297 		if (tx_bytes && vn_has_cached_data(vp)) {
1298 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1299 			    zp->z_id, segflg, tx);
1300 		}
1301 
1302 		/*
1303 		 * If we made no progress, we're done.  If we made even
1304 		 * partial progress, update the znode and ZIL accordingly.
1305 		 */
1306 		if (tx_bytes == 0) {
1307 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1308 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1309 			dmu_tx_commit(tx);
1310 			ASSERT(error != 0);
1311 			break;
1312 		}
1313 
1314 		/*
1315 		 * Clear Set-UID/Set-GID bits on successful write if not
1316 		 * privileged and at least one of the excute bits is set.
1317 		 *
1318 		 * It would be nice to to this after all writes have
1319 		 * been done, but that would still expose the ISUID/ISGID
1320 		 * to another app after the partial write is committed.
1321 		 *
1322 		 * Note: we don't call zfs_fuid_map_id() here because
1323 		 * user 0 is not an ephemeral uid.
1324 		 */
1325 		mutex_enter(&zp->z_acl_lock);
1326 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1327 		    (S_IXUSR >> 6))) != 0 &&
1328 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1329 		    secpolicy_vnode_setid_retain(vp, cr,
1330 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1331 			uint64_t newmode;
1332 			zp->z_mode &= ~(S_ISUID | S_ISGID);
1333 			newmode = zp->z_mode;
1334 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1335 			    (void *)&newmode, sizeof (uint64_t), tx);
1336 		}
1337 		mutex_exit(&zp->z_acl_lock);
1338 
1339 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1340 		    B_TRUE);
1341 
1342 		/*
1343 		 * Update the file size (zp_size) if it has changed;
1344 		 * account for possible concurrent updates.
1345 		 */
1346 		while ((end_size = zp->z_size) < uio->uio_loffset) {
1347 			(void) atomic_cas_64(&zp->z_size, end_size,
1348 			    uio->uio_loffset);
1349 #ifdef illumos
1350 			ASSERT(error == 0);
1351 #else
1352 			ASSERT(error == 0 || error == EFAULT);
1353 #endif
1354 		}
1355 		/*
1356 		 * If we are replaying and eof is non zero then force
1357 		 * the file size to the specified eof. Note, there's no
1358 		 * concurrency during replay.
1359 		 */
1360 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1361 			zp->z_size = zfsvfs->z_replay_eof;
1362 
1363 		if (error == 0)
1364 			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1365 		else
1366 			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1367 
1368 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1369 		dmu_tx_commit(tx);
1370 
1371 		if (error != 0)
1372 			break;
1373 		ASSERT(tx_bytes == nbytes);
1374 		n -= nbytes;
1375 
1376 #ifdef illumos
1377 		if (!xuio && n > 0)
1378 			uio_prefaultpages(MIN(n, max_blksz), uio);
1379 #endif
1380 	}
1381 
1382 	zfs_range_unlock(rl);
1383 
1384 	/*
1385 	 * If we're in replay mode, or we made no progress, return error.
1386 	 * Otherwise, it's at least a partial write, so it's successful.
1387 	 */
1388 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1389 		ZFS_EXIT(zfsvfs);
1390 		return (error);
1391 	}
1392 
1393 #ifdef __FreeBSD__
1394 	/*
1395 	 * EFAULT means that at least one page of the source buffer was not
1396 	 * available.  VFS will re-try remaining I/O upon this error.
1397 	 */
1398 	if (error == EFAULT) {
1399 		ZFS_EXIT(zfsvfs);
1400 		return (error);
1401 	}
1402 #endif
1403 
1404 	if (ioflag & (FSYNC | FDSYNC) ||
1405 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1406 		zil_commit(zilog, zp->z_id);
1407 
1408 	ZFS_EXIT(zfsvfs);
1409 	return (0);
1410 }
1411 
1412 void
1413 zfs_get_done(zgd_t *zgd, int error)
1414 {
1415 	znode_t *zp = zgd->zgd_private;
1416 	objset_t *os = zp->z_zfsvfs->z_os;
1417 
1418 	if (zgd->zgd_db)
1419 		dmu_buf_rele(zgd->zgd_db, zgd);
1420 
1421 	zfs_range_unlock(zgd->zgd_rl);
1422 
1423 	/*
1424 	 * Release the vnode asynchronously as we currently have the
1425 	 * txg stopped from syncing.
1426 	 */
1427 	VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1428 
1429 	if (error == 0 && zgd->zgd_bp)
1430 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1431 
1432 	kmem_free(zgd, sizeof (zgd_t));
1433 }
1434 
1435 #ifdef DEBUG
1436 static int zil_fault_io = 0;
1437 #endif
1438 
1439 /*
1440  * Get data to generate a TX_WRITE intent log record.
1441  */
1442 int
1443 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1444 {
1445 	zfsvfs_t *zfsvfs = arg;
1446 	objset_t *os = zfsvfs->z_os;
1447 	znode_t *zp;
1448 	uint64_t object = lr->lr_foid;
1449 	uint64_t offset = lr->lr_offset;
1450 	uint64_t size = lr->lr_length;
1451 	blkptr_t *bp = &lr->lr_blkptr;
1452 	dmu_buf_t *db;
1453 	zgd_t *zgd;
1454 	int error = 0;
1455 
1456 	ASSERT(zio != NULL);
1457 	ASSERT(size != 0);
1458 
1459 	/*
1460 	 * Nothing to do if the file has been removed
1461 	 */
1462 	if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0)
1463 		return (SET_ERROR(ENOENT));
1464 	if (zp->z_unlinked) {
1465 		/*
1466 		 * Release the vnode asynchronously as we currently have the
1467 		 * txg stopped from syncing.
1468 		 */
1469 		VN_RELE_CLEANER(ZTOV(zp),
1470 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1471 		return (SET_ERROR(ENOENT));
1472 	}
1473 
1474 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1475 	zgd->zgd_zilog = zfsvfs->z_log;
1476 	zgd->zgd_private = zp;
1477 
1478 	/*
1479 	 * Write records come in two flavors: immediate and indirect.
1480 	 * For small writes it's cheaper to store the data with the
1481 	 * log record (immediate); for large writes it's cheaper to
1482 	 * sync the data and get a pointer to it (indirect) so that
1483 	 * we don't have to write the data twice.
1484 	 */
1485 	if (buf != NULL) { /* immediate write */
1486 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1487 		/* test for truncation needs to be done while range locked */
1488 		if (offset >= zp->z_size) {
1489 			error = SET_ERROR(ENOENT);
1490 		} else {
1491 			error = dmu_read(os, object, offset, size, buf,
1492 			    DMU_READ_NO_PREFETCH);
1493 		}
1494 		ASSERT(error == 0 || error == ENOENT);
1495 	} else { /* indirect write */
1496 		/*
1497 		 * Have to lock the whole block to ensure when it's
1498 		 * written out and it's checksum is being calculated
1499 		 * that no one can change the data. We need to re-check
1500 		 * blocksize after we get the lock in case it's changed!
1501 		 */
1502 		for (;;) {
1503 			uint64_t blkoff;
1504 			size = zp->z_blksz;
1505 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1506 			offset -= blkoff;
1507 			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1508 			    RL_READER);
1509 			if (zp->z_blksz == size)
1510 				break;
1511 			offset += blkoff;
1512 			zfs_range_unlock(zgd->zgd_rl);
1513 		}
1514 		/* test for truncation needs to be done while range locked */
1515 		if (lr->lr_offset >= zp->z_size)
1516 			error = SET_ERROR(ENOENT);
1517 #ifdef DEBUG
1518 		if (zil_fault_io) {
1519 			error = SET_ERROR(EIO);
1520 			zil_fault_io = 0;
1521 		}
1522 #endif
1523 		if (error == 0)
1524 			error = dmu_buf_hold(os, object, offset, zgd, &db,
1525 			    DMU_READ_NO_PREFETCH);
1526 
1527 		if (error == 0) {
1528 			blkptr_t *obp = dmu_buf_get_blkptr(db);
1529 			if (obp) {
1530 				ASSERT(BP_IS_HOLE(bp));
1531 				*bp = *obp;
1532 			}
1533 
1534 			zgd->zgd_db = db;
1535 			zgd->zgd_bp = bp;
1536 
1537 			ASSERT(db->db_offset == offset);
1538 			ASSERT(db->db_size == size);
1539 
1540 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1541 			    zfs_get_done, zgd);
1542 			ASSERT(error || lr->lr_length <= zp->z_blksz);
1543 
1544 			/*
1545 			 * On success, we need to wait for the write I/O
1546 			 * initiated by dmu_sync() to complete before we can
1547 			 * release this dbuf.  We will finish everything up
1548 			 * in the zfs_get_done() callback.
1549 			 */
1550 			if (error == 0)
1551 				return (0);
1552 
1553 			if (error == EALREADY) {
1554 				lr->lr_common.lrc_txtype = TX_WRITE2;
1555 				error = 0;
1556 			}
1557 		}
1558 	}
1559 
1560 	zfs_get_done(zgd, error);
1561 
1562 	return (error);
1563 }
1564 
1565 /*ARGSUSED*/
1566 static int
1567 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1568     caller_context_t *ct)
1569 {
1570 	znode_t *zp = VTOZ(vp);
1571 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1572 	int error;
1573 
1574 	ZFS_ENTER(zfsvfs);
1575 	ZFS_VERIFY_ZP(zp);
1576 
1577 	if (flag & V_ACE_MASK)
1578 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1579 	else
1580 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1581 
1582 	ZFS_EXIT(zfsvfs);
1583 	return (error);
1584 }
1585 
1586 #ifdef __FreeBSD__
1587 static int
1588 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
1589 {
1590 	int error;
1591 
1592 	*vpp = arg;
1593 	error = vn_lock(*vpp, lkflags);
1594 	if (error != 0)
1595 		vrele(*vpp);
1596 	return (error);
1597 }
1598 
1599 static int
1600 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1601 {
1602 	znode_t *zdp = VTOZ(dvp);
1603 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1604 	int error;
1605 	int ltype;
1606 
1607 	ASSERT_VOP_LOCKED(dvp, __func__);
1608 #ifdef DIAGNOSTIC
1609 	if ((zdp->z_pflags & ZFS_XATTR) == 0)
1610 		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1611 #endif
1612 
1613 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1614 		ASSERT3P(dvp, ==, vp);
1615 		vref(dvp);
1616 		ltype = lkflags & LK_TYPE_MASK;
1617 		if (ltype != VOP_ISLOCKED(dvp)) {
1618 			if (ltype == LK_EXCLUSIVE)
1619 				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1620 			else /* if (ltype == LK_SHARED) */
1621 				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1622 
1623 			/*
1624 			 * Relock for the "." case could leave us with
1625 			 * reclaimed vnode.
1626 			 */
1627 			if (dvp->v_iflag & VI_DOOMED) {
1628 				vrele(dvp);
1629 				return (SET_ERROR(ENOENT));
1630 			}
1631 		}
1632 		return (0);
1633 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1634 		/*
1635 		 * Note that in this case, dvp is the child vnode, and we
1636 		 * are looking up the parent vnode - exactly reverse from
1637 		 * normal operation.  Unlocking dvp requires some rather
1638 		 * tricky unlock/relock dance to prevent mp from being freed;
1639 		 * use vn_vget_ino_gen() which takes care of all that.
1640 		 *
1641 		 * XXX Note that there is a time window when both vnodes are
1642 		 * unlocked.  It is possible, although highly unlikely, that
1643 		 * during that window the parent-child relationship between
1644 		 * the vnodes may change, for example, get reversed.
1645 		 * In that case we would have a wrong lock order for the vnodes.
1646 		 * All other filesystems seem to ignore this problem, so we
1647 		 * do the same here.
1648 		 * A potential solution could be implemented as follows:
1649 		 * - using LK_NOWAIT when locking the second vnode and retrying
1650 		 *   if necessary
1651 		 * - checking that the parent-child relationship still holds
1652 		 *   after locking both vnodes and retrying if it doesn't
1653 		 */
1654 		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1655 		return (error);
1656 	} else {
1657 		error = vn_lock(vp, lkflags);
1658 		if (error != 0)
1659 			vrele(vp);
1660 		return (error);
1661 	}
1662 }
1663 
1664 /*
1665  * Lookup an entry in a directory, or an extended attribute directory.
1666  * If it exists, return a held vnode reference for it.
1667  *
1668  *	IN:	dvp	- vnode of directory to search.
1669  *		nm	- name of entry to lookup.
1670  *		pnp	- full pathname to lookup [UNUSED].
1671  *		flags	- LOOKUP_XATTR set if looking for an attribute.
1672  *		rdir	- root directory vnode [UNUSED].
1673  *		cr	- credentials of caller.
1674  *		ct	- caller context
1675  *
1676  *	OUT:	vpp	- vnode of located entry, NULL if not found.
1677  *
1678  *	RETURN:	0 on success, error code on failure.
1679  *
1680  * Timestamps:
1681  *	NA
1682  */
1683 /* ARGSUSED */
1684 static int
1685 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1686     int nameiop, cred_t *cr, kthread_t *td, int flags)
1687 {
1688 	znode_t *zdp = VTOZ(dvp);
1689 	znode_t *zp;
1690 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1691 	int	error = 0;
1692 
1693 	/* fast path (should be redundant with vfs namecache) */
1694 	if (!(flags & LOOKUP_XATTR)) {
1695 		if (dvp->v_type != VDIR) {
1696 			return (SET_ERROR(ENOTDIR));
1697 		} else if (zdp->z_sa_hdl == NULL) {
1698 			return (SET_ERROR(EIO));
1699 		}
1700 	}
1701 
1702 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1703 
1704 	ZFS_ENTER(zfsvfs);
1705 	ZFS_VERIFY_ZP(zdp);
1706 
1707 	*vpp = NULL;
1708 
1709 	if (flags & LOOKUP_XATTR) {
1710 #ifdef TODO
1711 		/*
1712 		 * If the xattr property is off, refuse the lookup request.
1713 		 */
1714 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1715 			ZFS_EXIT(zfsvfs);
1716 			return (SET_ERROR(EINVAL));
1717 		}
1718 #endif
1719 
1720 		/*
1721 		 * We don't allow recursive attributes..
1722 		 * Maybe someday we will.
1723 		 */
1724 		if (zdp->z_pflags & ZFS_XATTR) {
1725 			ZFS_EXIT(zfsvfs);
1726 			return (SET_ERROR(EINVAL));
1727 		}
1728 
1729 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1730 			ZFS_EXIT(zfsvfs);
1731 			return (error);
1732 		}
1733 
1734 		/*
1735 		 * Do we have permission to get into attribute directory?
1736 		 */
1737 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1738 		    B_FALSE, cr)) {
1739 			vrele(*vpp);
1740 			*vpp = NULL;
1741 		}
1742 
1743 		ZFS_EXIT(zfsvfs);
1744 		return (error);
1745 	}
1746 
1747 	/*
1748 	 * Check accessibility of directory.
1749 	 */
1750 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1751 		ZFS_EXIT(zfsvfs);
1752 		return (error);
1753 	}
1754 
1755 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1756 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1757 		ZFS_EXIT(zfsvfs);
1758 		return (SET_ERROR(EILSEQ));
1759 	}
1760 
1761 
1762 	/*
1763 	 * First handle the special cases.
1764 	 */
1765 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1766 		/*
1767 		 * If we are a snapshot mounted under .zfs, return
1768 		 * the vp for the snapshot directory.
1769 		 */
1770 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1771 			struct componentname cn;
1772 			vnode_t *zfsctl_vp;
1773 			int ltype;
1774 
1775 			ZFS_EXIT(zfsvfs);
1776 			ltype = VOP_ISLOCKED(dvp);
1777 			VOP_UNLOCK(dvp, 0);
1778 			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1779 			    &zfsctl_vp);
1780 			if (error == 0) {
1781 				cn.cn_nameptr = "snapshot";
1782 				cn.cn_namelen = strlen(cn.cn_nameptr);
1783 				cn.cn_nameiop = cnp->cn_nameiop;
1784 				cn.cn_flags = cnp->cn_flags;
1785 				cn.cn_lkflags = cnp->cn_lkflags;
1786 				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1787 				vput(zfsctl_vp);
1788 			}
1789 			vn_lock(dvp, ltype | LK_RETRY);
1790 			return (error);
1791 		}
1792 	}
1793 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1794 		ZFS_EXIT(zfsvfs);
1795 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1796 			return (SET_ERROR(ENOTSUP));
1797 		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1798 		return (error);
1799 	}
1800 
1801 	/*
1802 	 * The loop is retry the lookup if the parent-child relationship
1803 	 * changes during the dot-dot locking complexities.
1804 	 */
1805 	for (;;) {
1806 		uint64_t parent;
1807 
1808 		error = zfs_dirlook(zdp, nm, &zp);
1809 		if (error == 0)
1810 			*vpp = ZTOV(zp);
1811 
1812 		ZFS_EXIT(zfsvfs);
1813 		if (error != 0)
1814 			break;
1815 
1816 		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1817 		if (error != 0) {
1818 			/*
1819 			 * If we've got a locking error, then the vnode
1820 			 * got reclaimed because of a force unmount.
1821 			 * We never enter doomed vnodes into the name cache.
1822 			 */
1823 			*vpp = NULL;
1824 			return (error);
1825 		}
1826 
1827 		if ((cnp->cn_flags & ISDOTDOT) == 0)
1828 			break;
1829 
1830 		ZFS_ENTER(zfsvfs);
1831 		if (zdp->z_sa_hdl == NULL) {
1832 			error = SET_ERROR(EIO);
1833 		} else {
1834 			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1835 			    &parent, sizeof (parent));
1836 		}
1837 		if (error != 0) {
1838 			ZFS_EXIT(zfsvfs);
1839 			vput(ZTOV(zp));
1840 			break;
1841 		}
1842 		if (zp->z_id == parent) {
1843 			ZFS_EXIT(zfsvfs);
1844 			break;
1845 		}
1846 		vput(ZTOV(zp));
1847 	}
1848 
1849 out:
1850 	if (error != 0)
1851 		*vpp = NULL;
1852 
1853 	/* Translate errors and add SAVENAME when needed. */
1854 	if (cnp->cn_flags & ISLASTCN) {
1855 		switch (nameiop) {
1856 		case CREATE:
1857 		case RENAME:
1858 			if (error == ENOENT) {
1859 				error = EJUSTRETURN;
1860 				cnp->cn_flags |= SAVENAME;
1861 				break;
1862 			}
1863 			/* FALLTHROUGH */
1864 		case DELETE:
1865 			if (error == 0)
1866 				cnp->cn_flags |= SAVENAME;
1867 			break;
1868 		}
1869 	}
1870 
1871 	/* Insert name into cache (as non-existent) if appropriate. */
1872 	if (zfsvfs->z_use_namecache &&
1873 	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1874 		cache_enter(dvp, NULL, cnp);
1875 
1876 	/* Insert name into cache if appropriate. */
1877 	if (zfsvfs->z_use_namecache &&
1878 	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1879 		if (!(cnp->cn_flags & ISLASTCN) ||
1880 		    (nameiop != DELETE && nameiop != RENAME)) {
1881 			cache_enter(dvp, *vpp, cnp);
1882 		}
1883 	}
1884 
1885 	return (error);
1886 }
1887 #endif /* __FreeBSD__ */
1888 
1889 #ifdef __NetBSD__
1890 /*
1891  * If vnode is for a device return a specfs vnode instead.
1892  */
1893 static int
1894 specvp_check(vnode_t **vpp, cred_t *cr)
1895 {
1896 	int error = 0;
1897 
1898 	if (IS_DEVVP(*vpp)) {
1899 		struct vnode *svp;
1900 
1901 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1902 		VN_RELE(*vpp);
1903 		if (svp == NULL)
1904 			error = ENOSYS;
1905 		*vpp = svp;
1906 	}
1907 	return (error);
1908 }
1909 
1910 /*
1911  * Lookup an entry in a directory, or an extended attribute directory.
1912  * If it exists, return a held vnode reference for it.
1913  *
1914  *	IN:	dvp	- vnode of directory to search.
1915  *		nm	- name of entry to lookup.
1916  *		pnp	- full pathname to lookup [UNUSED].
1917  *		flags	- LOOKUP_XATTR set if looking for an attribute.
1918  *		rdir	- root directory vnode [UNUSED].
1919  *		cr	- credentials of caller.
1920  *		ct	- caller context
1921  *		direntflags - directory lookup flags
1922  *		realpnp - returned pathname.
1923  *
1924  *	OUT:	vpp	- vnode of located entry, NULL if not found.
1925  *
1926  *	RETURN:	0 if success
1927  *		error code if failure
1928  *
1929  * Timestamps:
1930  *	NA
1931  */
1932 /* ARGSUSED */
1933 static int
1934 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, int flags,
1935     struct componentname *cnp, int nameiop, cred_t *cr)
1936 {
1937 	znode_t *zdp = VTOZ(dvp);
1938 	znode_t *zp;
1939 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1940 	int	error = 0;
1941 
1942 	/* fast path */
1943 	if (!(flags & LOOKUP_XATTR)) {
1944 		if (dvp->v_type != VDIR) {
1945 			return (ENOTDIR);
1946 		} else if (zdp->z_sa_hdl == NULL) {
1947 			return (SET_ERROR(EIO));
1948 		}
1949 
1950 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1951 			error = zfs_fastaccesschk_execute(zdp, cr);
1952 			if (!error) {
1953 				*vpp = dvp;
1954 				VN_HOLD(*vpp);
1955 				return (0);
1956 			}
1957 			return (error);
1958 		} else {
1959 			vnode_t *tvp = dnlc_lookup(dvp, nm);
1960 
1961 			if (tvp) {
1962 				error = zfs_fastaccesschk_execute(zdp, cr);
1963 				if (error) {
1964 					VN_RELE(tvp);
1965 					return (error);
1966 				}
1967 				if (tvp == DNLC_NO_VNODE) {
1968 					VN_RELE(tvp);
1969 					return (ENOENT);
1970 				} else {
1971 					*vpp = tvp;
1972 					return (specvp_check(vpp, cr));
1973 				}
1974 			}
1975 		}
1976 	}
1977 
1978 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1979 
1980 	ZFS_ENTER(zfsvfs);
1981 	ZFS_VERIFY_ZP(zdp);
1982 
1983 	*vpp = NULL;
1984 
1985 	if (flags & LOOKUP_XATTR) {
1986 #ifdef TODO
1987 		/*
1988 		 * If the xattr property is off, refuse the lookup request.
1989 		 */
1990 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1991 			ZFS_EXIT(zfsvfs);
1992 			return (EINVAL);
1993 		}
1994 #endif
1995 
1996 		/*
1997 		 * We don't allow recursive attributes..
1998 		 * Maybe someday we will.
1999 		 */
2000 		if (zdp->z_pflags & ZFS_XATTR) {
2001 			ZFS_EXIT(zfsvfs);
2002 			return (EINVAL);
2003 		}
2004 
2005 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
2006 			ZFS_EXIT(zfsvfs);
2007 			return (error);
2008 		}
2009 
2010 		/*
2011 		 * Do we have permission to get into attribute directory?
2012 		 */
2013 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
2014 		    B_FALSE, cr)) {
2015 			VN_RELE(*vpp);
2016 			*vpp = NULL;
2017 		}
2018 
2019 		ZFS_EXIT(zfsvfs);
2020 		return (error);
2021 	}
2022 
2023 	if (dvp->v_type != VDIR) {
2024 		ZFS_EXIT(zfsvfs);
2025 		return (ENOTDIR);
2026 	}
2027 
2028 	/*
2029 	 * Check accessibility of directory.
2030 	 */
2031 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
2032 		ZFS_EXIT(zfsvfs);
2033 		return (error);
2034 	}
2035 
2036 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
2037 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2038 		ZFS_EXIT(zfsvfs);
2039 		return (EILSEQ);
2040 	}
2041 
2042 	/*
2043 	 * First handle the special cases.
2044 	 */
2045 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
2046 		/*
2047 		 * If we are a snapshot mounted under .zfs, return
2048 		 * the vp for the snapshot directory.
2049 		 */
2050 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
2051 			ZFS_EXIT(zfsvfs);
2052 			error = zfsctl_snapshot(zfsvfs->z_parent, vpp);
2053 
2054 			return (error);
2055 		}
2056 	}
2057 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
2058 		ZFS_EXIT(zfsvfs);
2059 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
2060 			return (SET_ERROR(ENOTSUP));
2061 		error = zfsctl_root(zfsvfs, vpp);
2062 		return (error);
2063 	}
2064 
2065 	error = zfs_dirlook(zdp, nm, &zp);
2066 	if (error == 0) {
2067 		*vpp = ZTOV(zp);
2068 		error = specvp_check(vpp, cr);
2069 	}
2070 
2071 	ZFS_EXIT(zfsvfs);
2072 	return (error);
2073 }
2074 #endif
2075 
2076 /*
2077  * Attempt to create a new entry in a directory.  If the entry
2078  * already exists, truncate the file if permissible, else return
2079  * an error.  Return the vp of the created or trunc'd file.
2080  *
2081  *	IN:	dvp	- vnode of directory to put new file entry in.
2082  *		name	- name of new file entry.
2083  *		vap	- attributes of new file.
2084  *		excl	- flag indicating exclusive or non-exclusive mode.
2085  *		mode	- mode to open file with.
2086  *		cr	- credentials of caller.
2087  *		flag	- large file flag [UNUSED].
2088  *		ct	- caller context
2089  *		vsecp	- ACL to be set
2090  *
2091  *	OUT:	vpp	- vnode of created or trunc'd entry.
2092  *
2093  *	RETURN:	0 on success, error code on failure.
2094  *
2095  * Timestamps:
2096  *	dvp - ctime|mtime updated if new entry created
2097  *	 vp - ctime|mtime always, atime if new
2098  */
2099 
2100 /* ARGSUSED */
2101 static int
2102 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
2103     vnode_t **vpp, cred_t *cr, kthread_t *td)
2104 {
2105 	znode_t		*zp, *dzp = VTOZ(dvp);
2106 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2107 	zilog_t		*zilog;
2108 	objset_t	*os;
2109 	dmu_tx_t	*tx;
2110 	int		error;
2111 	ksid_t		*ksid;
2112 	uid_t		uid;
2113 	gid_t		gid = crgetgid(cr);
2114 	zfs_acl_ids_t   acl_ids;
2115 	boolean_t	fuid_dirtied;
2116 	void		*vsecp = NULL;
2117 	int		flag = 0;
2118 	uint64_t	txtype;
2119 
2120 	/*
2121 	 * If we have an ephemeral id, ACL, or XVATTR then
2122 	 * make sure file system is at proper version
2123 	 */
2124 
2125 	ksid = crgetsid(cr, KSID_OWNER);
2126 	if (ksid)
2127 		uid = ksid_getid(ksid);
2128 	else
2129 		uid = crgetuid(cr);
2130 
2131 	if (zfsvfs->z_use_fuids == B_FALSE &&
2132 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2133 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2134 		return (SET_ERROR(EINVAL));
2135 
2136 	ZFS_ENTER(zfsvfs);
2137 	ZFS_VERIFY_ZP(dzp);
2138 	os = zfsvfs->z_os;
2139 	zilog = zfsvfs->z_log;
2140 
2141 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
2142 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2143 		ZFS_EXIT(zfsvfs);
2144 		return (SET_ERROR(EILSEQ));
2145 	}
2146 
2147 	if (vap->va_mask & AT_XVATTR) {
2148 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2149 		    crgetuid(cr), cr, vap->va_type)) != 0) {
2150 			ZFS_EXIT(zfsvfs);
2151 			return (error);
2152 		}
2153 	}
2154 
2155 	*vpp = NULL;
2156 
2157 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
2158 		vap->va_mode &= ~S_ISVTX;
2159 
2160 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
2161 	if (error) {
2162 		ZFS_EXIT(zfsvfs);
2163 		return (error);
2164 	}
2165 	ASSERT3P(zp, ==, NULL);
2166 
2167 	/*
2168 	 * Create a new file object and update the directory
2169 	 * to reference it.
2170 	 */
2171 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
2172 		goto out;
2173 	}
2174 
2175 	/*
2176 	 * We only support the creation of regular files in
2177 	 * extended attribute directories.
2178 	 */
2179 
2180 	if ((dzp->z_pflags & ZFS_XATTR) &&
2181 	    (vap->va_type != VREG)) {
2182 		error = SET_ERROR(EINVAL);
2183 		goto out;
2184 	}
2185 
2186 	if ((error = zfs_acl_ids_create(dzp, 0, vap,
2187 	    cr, vsecp, &acl_ids)) != 0)
2188 		goto out;
2189 
2190 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2191 		zfs_acl_ids_free(&acl_ids);
2192 		error = SET_ERROR(EDQUOT);
2193 		goto out;
2194 	}
2195 
2196 	getnewvnode_reserve(1);
2197 
2198 	tx = dmu_tx_create(os);
2199 
2200 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2201 	    ZFS_SA_BASE_ATTR_SIZE);
2202 
2203 	fuid_dirtied = zfsvfs->z_fuid_dirty;
2204 	if (fuid_dirtied)
2205 		zfs_fuid_txhold(zfsvfs, tx);
2206 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2207 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
2208 	if (!zfsvfs->z_use_sa &&
2209 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2210 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2211 		    0, acl_ids.z_aclp->z_acl_bytes);
2212 	}
2213 	error = dmu_tx_assign(tx, TXG_WAIT);
2214 	if (error) {
2215 		zfs_acl_ids_free(&acl_ids);
2216 		dmu_tx_abort(tx);
2217 		getnewvnode_drop_reserve();
2218 		ZFS_EXIT(zfsvfs);
2219 		return (error);
2220 	}
2221 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2222 
2223 	if (fuid_dirtied)
2224 		zfs_fuid_sync(zfsvfs, tx);
2225 
2226 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
2227 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
2228 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
2229 	    vsecp, acl_ids.z_fuidp, vap);
2230 	zfs_acl_ids_free(&acl_ids);
2231 	dmu_tx_commit(tx);
2232 
2233 	getnewvnode_drop_reserve();
2234 
2235 out:
2236 	if (error == 0) {
2237 		*vpp = ZTOV(zp);
2238 	}
2239 
2240 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2241 		zil_commit(zilog, 0);
2242 
2243 	ZFS_EXIT(zfsvfs);
2244 	return (error);
2245 }
2246 
2247 /*
2248  * Remove an entry from a directory.
2249  *
2250  *	IN:	dvp	- vnode of directory to remove entry from.
2251  *		name	- name of entry to remove.
2252  *		cr	- credentials of caller.
2253  *		ct	- caller context
2254  *		flags	- case flags
2255  *
2256  *	RETURN:	0 on success, error code on failure.
2257  *
2258  * Timestamps:
2259  *	dvp - ctime|mtime
2260  *	 vp - ctime (if nlink > 0)
2261  */
2262 
2263 /*ARGSUSED*/
2264 static int
2265 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2266 {
2267 	znode_t		*dzp = VTOZ(dvp);
2268 	znode_t		*zp = VTOZ(vp);
2269 	znode_t		*xzp;
2270 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2271 	zilog_t		*zilog;
2272 	uint64_t	acl_obj, xattr_obj;
2273 	uint64_t	obj = 0;
2274 	dmu_tx_t	*tx;
2275 	boolean_t	unlinked, toobig = FALSE;
2276 	uint64_t	txtype;
2277 	int		error;
2278 
2279 	ZFS_ENTER(zfsvfs);
2280 	ZFS_VERIFY_ZP(dzp);
2281 	ZFS_VERIFY_ZP(zp);
2282 	zilog = zfsvfs->z_log;
2283 	zp = VTOZ(vp);
2284 
2285 	xattr_obj = 0;
2286 	xzp = NULL;
2287 
2288 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2289 		goto out;
2290 	}
2291 
2292 	/*
2293 	 * Need to use rmdir for removing directories.
2294 	 */
2295 	if (vp->v_type == VDIR) {
2296 		error = SET_ERROR(EPERM);
2297 		goto out;
2298 	}
2299 
2300 	vnevent_remove(vp, dvp, name, ct);
2301 
2302 	obj = zp->z_id;
2303 
2304 	/* are there any extended attributes? */
2305 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2306 	    &xattr_obj, sizeof (xattr_obj));
2307 	if (error == 0 && xattr_obj) {
2308 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
2309 		ASSERT0(error);
2310 	}
2311 
2312 	/*
2313 	 * We may delete the znode now, or we may put it in the unlinked set;
2314 	 * it depends on whether we're the last link, and on whether there are
2315 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
2316 	 * allow for either case.
2317 	 */
2318 	tx = dmu_tx_create(zfsvfs->z_os);
2319 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2320 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2321 	zfs_sa_upgrade_txholds(tx, zp);
2322 	zfs_sa_upgrade_txholds(tx, dzp);
2323 
2324 	if (xzp) {
2325 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2326 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2327 	}
2328 
2329 	/* charge as an update -- would be nice not to charge at all */
2330 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2331 
2332 	/*
2333 	 * Mark this transaction as typically resulting in a net free of space
2334 	 */
2335 	dmu_tx_mark_netfree(tx);
2336 
2337 	error = dmu_tx_assign(tx, TXG_WAIT);
2338 	if (error) {
2339 		dmu_tx_abort(tx);
2340 		ZFS_EXIT(zfsvfs);
2341 		return (error);
2342 	}
2343 
2344 	/*
2345 	 * Remove the directory entry.
2346 	 */
2347 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2348 
2349 	if (error) {
2350 		dmu_tx_commit(tx);
2351 		goto out;
2352 	}
2353 
2354 	if (unlinked) {
2355 		zfs_unlinked_add(zp, tx);
2356 		vp->v_vflag |= VV_NOSYNC;
2357 	}
2358 
2359 	txtype = TX_REMOVE;
2360 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2361 
2362 	dmu_tx_commit(tx);
2363 out:
2364 
2365 	if (xzp)
2366 		vrele(ZTOV(xzp));
2367 
2368 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2369 		zil_commit(zilog, 0);
2370 
2371 	ZFS_EXIT(zfsvfs);
2372 	return (error);
2373 }
2374 
2375 /*
2376  * Create a new directory and insert it into dvp using the name
2377  * provided.  Return a pointer to the inserted directory.
2378  *
2379  *	IN:	dvp	- vnode of directory to add subdir to.
2380  *		dirname	- name of new directory.
2381  *		vap	- attributes of new directory.
2382  *		cr	- credentials of caller.
2383  *		ct	- caller context
2384  *		flags	- case flags
2385  *		vsecp	- ACL to be set
2386  *
2387  *	OUT:	vpp	- vnode of created directory.
2388  *
2389  *	RETURN:	0 on success, error code on failure.
2390  *
2391  * Timestamps:
2392  *	dvp - ctime|mtime updated
2393  *	 vp - ctime|mtime|atime updated
2394  */
2395 /*ARGSUSED*/
2396 static int
2397 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2398 {
2399 	znode_t		*zp, *dzp = VTOZ(dvp);
2400 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2401 	zilog_t		*zilog;
2402 	uint64_t	txtype;
2403 	dmu_tx_t	*tx;
2404 	int		error;
2405 	ksid_t		*ksid;
2406 	uid_t		uid;
2407 	gid_t		gid = crgetgid(cr);
2408 	zfs_acl_ids_t   acl_ids;
2409 	boolean_t	fuid_dirtied;
2410 
2411 	ASSERT(vap->va_type == VDIR);
2412 
2413 	/*
2414 	 * If we have an ephemeral id, ACL, or XVATTR then
2415 	 * make sure file system is at proper version
2416 	 */
2417 
2418 	ksid = crgetsid(cr, KSID_OWNER);
2419 	if (ksid)
2420 		uid = ksid_getid(ksid);
2421 	else
2422 		uid = crgetuid(cr);
2423 	if (zfsvfs->z_use_fuids == B_FALSE &&
2424 	    ((vap->va_mask & AT_XVATTR) ||
2425 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2426 		return (SET_ERROR(EINVAL));
2427 
2428 	ZFS_ENTER(zfsvfs);
2429 	ZFS_VERIFY_ZP(dzp);
2430 	zilog = zfsvfs->z_log;
2431 
2432 	if (dzp->z_pflags & ZFS_XATTR) {
2433 		ZFS_EXIT(zfsvfs);
2434 		return (SET_ERROR(EINVAL));
2435 	}
2436 
2437 	if (zfsvfs->z_utf8 && u8_validate(dirname,
2438 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2439 		ZFS_EXIT(zfsvfs);
2440 		return (SET_ERROR(EILSEQ));
2441 	}
2442 
2443 	if (vap->va_mask & AT_XVATTR) {
2444 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2445 		    crgetuid(cr), cr, vap->va_type)) != 0) {
2446 			ZFS_EXIT(zfsvfs);
2447 			return (error);
2448 		}
2449 	}
2450 
2451 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2452 	    NULL, &acl_ids)) != 0) {
2453 		ZFS_EXIT(zfsvfs);
2454 		return (error);
2455 	}
2456 
2457 	/*
2458 	 * First make sure the new directory doesn't exist.
2459 	 *
2460 	 * Existence is checked first to make sure we don't return
2461 	 * EACCES instead of EEXIST which can cause some applications
2462 	 * to fail.
2463 	 */
2464 	*vpp = NULL;
2465 
2466 	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2467 		zfs_acl_ids_free(&acl_ids);
2468 		ZFS_EXIT(zfsvfs);
2469 		return (error);
2470 	}
2471 	ASSERT3P(zp, ==, NULL);
2472 
2473 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2474 		zfs_acl_ids_free(&acl_ids);
2475 		ZFS_EXIT(zfsvfs);
2476 		return (error);
2477 	}
2478 
2479 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2480 		zfs_acl_ids_free(&acl_ids);
2481 		ZFS_EXIT(zfsvfs);
2482 		return (SET_ERROR(EDQUOT));
2483 	}
2484 
2485 	/*
2486 	 * Add a new entry to the directory.
2487 	 */
2488 	getnewvnode_reserve(1);
2489 	tx = dmu_tx_create(zfsvfs->z_os);
2490 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2491 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2492 	fuid_dirtied = zfsvfs->z_fuid_dirty;
2493 	if (fuid_dirtied)
2494 		zfs_fuid_txhold(zfsvfs, tx);
2495 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2496 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2497 		    acl_ids.z_aclp->z_acl_bytes);
2498 	}
2499 
2500 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2501 	    ZFS_SA_BASE_ATTR_SIZE);
2502 
2503 	error = dmu_tx_assign(tx, TXG_WAIT);
2504 	if (error) {
2505 		zfs_acl_ids_free(&acl_ids);
2506 		dmu_tx_abort(tx);
2507 		getnewvnode_drop_reserve();
2508 		ZFS_EXIT(zfsvfs);
2509 		return (error);
2510 	}
2511 
2512 	/*
2513 	 * Create new node.
2514 	 */
2515 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2516 
2517 	if (fuid_dirtied)
2518 		zfs_fuid_sync(zfsvfs, tx);
2519 
2520 	/*
2521 	 * Now put new name in parent dir.
2522 	 */
2523 	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2524 
2525 	*vpp = ZTOV(zp);
2526 
2527 	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2528 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2529 	    acl_ids.z_fuidp, vap);
2530 
2531 	zfs_acl_ids_free(&acl_ids);
2532 
2533 	dmu_tx_commit(tx);
2534 
2535 	getnewvnode_drop_reserve();
2536 
2537 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2538 		zil_commit(zilog, 0);
2539 
2540 	ZFS_EXIT(zfsvfs);
2541 	return (0);
2542 }
2543 
2544 /*
2545  * Remove a directory subdir entry.  If the current working
2546  * directory is the same as the subdir to be removed, the
2547  * remove will fail.
2548  *
2549  *	IN:	dvp	- vnode of directory to remove from.
2550  *		name	- name of directory to be removed.
2551  *		cwd	- vnode of current working directory.
2552  *		cr	- credentials of caller.
2553  *		ct	- caller context
2554  *		flags	- case flags
2555  *
2556  *	RETURN:	0 on success, error code on failure.
2557  *
2558  * Timestamps:
2559  *	dvp - ctime|mtime updated
2560  */
2561 /*ARGSUSED*/
2562 static int
2563 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2564 {
2565 	znode_t		*dzp = VTOZ(dvp);
2566 	znode_t		*zp = VTOZ(vp);
2567 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2568 	zilog_t		*zilog;
2569 	dmu_tx_t	*tx;
2570 	int		error;
2571 
2572 	ZFS_ENTER(zfsvfs);
2573 	ZFS_VERIFY_ZP(dzp);
2574 	ZFS_VERIFY_ZP(zp);
2575 	zilog = zfsvfs->z_log;
2576 
2577 
2578 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2579 		goto out;
2580 	}
2581 
2582 	if (vp->v_type != VDIR) {
2583 		error = SET_ERROR(ENOTDIR);
2584 		goto out;
2585 	}
2586 
2587 	vnevent_rmdir(vp, dvp, name, ct);
2588 
2589 	tx = dmu_tx_create(zfsvfs->z_os);
2590 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2591 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2592 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2593 	zfs_sa_upgrade_txholds(tx, zp);
2594 	zfs_sa_upgrade_txholds(tx, dzp);
2595 	dmu_tx_mark_netfree(tx);
2596 	error = dmu_tx_assign(tx, TXG_WAIT);
2597 	if (error) {
2598 		dmu_tx_abort(tx);
2599 		ZFS_EXIT(zfsvfs);
2600 		return (error);
2601 	}
2602 
2603 	cache_purge(dvp);
2604 
2605 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2606 
2607 	if (error == 0) {
2608 		uint64_t txtype = TX_RMDIR;
2609 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2610 	}
2611 
2612 	dmu_tx_commit(tx);
2613 
2614 	cache_purge(vp);
2615 out:
2616 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2617 		zil_commit(zilog, 0);
2618 
2619 	ZFS_EXIT(zfsvfs);
2620 	return (error);
2621 }
2622 
2623 /*
2624  * Read as many directory entries as will fit into the provided
2625  * buffer from the given directory cursor position (specified in
2626  * the uio structure).
2627  *
2628  *	IN:	vp	- vnode of directory to read.
2629  *		uio	- structure supplying read location, range info,
2630  *			  and return buffer.
2631  *		cr	- credentials of caller.
2632  *		ct	- caller context
2633  *		flags	- case flags
2634  *
2635  *	OUT:	uio	- updated offset and range, buffer filled.
2636  *		eofp	- set to true if end-of-file detected.
2637  *
2638  *	RETURN:	0 on success, error code on failure.
2639  *
2640  * Timestamps:
2641  *	vp - atime updated
2642  *
2643  * Note that the low 4 bits of the cookie returned by zap is always zero.
2644  * This allows us to use the low range for "special" directory entries:
2645  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2646  * we use the offset 2 for the '.zfs' directory.
2647  */
2648 /* ARGSUSED */
2649 static int
2650 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies)
2651 {
2652 	znode_t		*zp = VTOZ(vp);
2653 	iovec_t		*iovp;
2654 	edirent_t	*eodp;
2655 	dirent64_t	*odp;
2656 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2657 	objset_t	*os;
2658 	caddr_t		outbuf;
2659 	size_t		bufsize;
2660 	zap_cursor_t	zc;
2661 	zap_attribute_t	zap;
2662 	uint_t		bytes_wanted;
2663 	uint64_t	offset; /* must be unsigned; checks for < 1 */
2664 	uint64_t	parent;
2665 	int		local_eof;
2666 	int		outcount;
2667 	int		error;
2668 	uint8_t		prefetch;
2669 	boolean_t	check_sysattrs;
2670 	uint8_t		type;
2671 	int		ncooks = 0;
2672 	off_t		*cooks = NULL;
2673 	int		flags = 0;
2674 #ifdef __FreeBSD__
2675 	boolean_t user = uio->uio_segflg != UIO_SYSSPACE;
2676 #endif
2677 #ifdef __NetBSD__
2678 	boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace);
2679 #endif
2680 
2681 	ZFS_ENTER(zfsvfs);
2682 	ZFS_VERIFY_ZP(zp);
2683 
2684 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2685 	    &parent, sizeof (parent))) != 0) {
2686 		ZFS_EXIT(zfsvfs);
2687 		return (error);
2688 	}
2689 
2690 	/*
2691 	 * If we are not given an eof variable,
2692 	 * use a local one.
2693 	 */
2694 	if (eofp == NULL)
2695 		eofp = &local_eof;
2696 
2697 	/*
2698 	 * Check for valid iov_len.
2699 	 */
2700 	if (uio->uio_iov->iov_len <= 0) {
2701 		ZFS_EXIT(zfsvfs);
2702 		return (SET_ERROR(EINVAL));
2703 	}
2704 
2705 	/*
2706 	 * Quit if directory has been removed (posix)
2707 	 */
2708 	if ((*eofp = zp->z_unlinked) != 0) {
2709 		ZFS_EXIT(zfsvfs);
2710 		return (0);
2711 	}
2712 
2713 	error = 0;
2714 	os = zfsvfs->z_os;
2715 	offset = uio->uio_loffset;
2716 	prefetch = zp->z_zn_prefetch;
2717 
2718 	/*
2719 	 * Initialize the iterator cursor.
2720 	 */
2721 	if (offset <= 3) {
2722 		/*
2723 		 * Start iteration from the beginning of the directory.
2724 		 */
2725 		zap_cursor_init(&zc, os, zp->z_id);
2726 	} else {
2727 		/*
2728 		 * The offset is a serialized cursor.
2729 		 */
2730 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2731 	}
2732 
2733 	/*
2734 	 * Get space to change directory entries into fs independent format.
2735 	 */
2736 	iovp = uio->uio_iov;
2737 	bytes_wanted = iovp->iov_len;
2738 	if (user || uio->uio_iovcnt != 1) {
2739 		bufsize = bytes_wanted;
2740 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2741 		odp = (struct dirent64 *)outbuf;
2742 	} else {
2743 		bufsize = bytes_wanted;
2744 		outbuf = NULL;
2745 		odp = (struct dirent64 *)iovp->iov_base;
2746 	}
2747 	eodp = (struct edirent *)odp;
2748 
2749 	if (ncookies != NULL) {
2750 		/*
2751 		 * Minimum entry size is dirent size and 1 byte for a file name.
2752 		 */
2753 #ifdef __FreeBSD__
2754 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2755 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2756 #endif
2757 #ifdef __NetBSD__
2758 		ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2759 		cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK);
2760 #endif
2761 		*cookies = cooks;
2762 		*ncookies = ncooks;
2763 	}
2764 
2765 	/*
2766 	 * If this VFS supports the system attribute view interface; and
2767 	 * we're looking at an extended attribute directory; and we care
2768 	 * about normalization conflicts on this vfs; then we must check
2769 	 * for normalization conflicts with the sysattr name space.
2770 	 */
2771 #ifdef TODO
2772 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2773 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2774 	    (flags & V_RDDIR_ENTFLAGS);
2775 #else
2776 	check_sysattrs = 0;
2777 #endif
2778 
2779 	/*
2780 	 * Transform to file-system independent format
2781 	 */
2782 	outcount = 0;
2783 	while (outcount < bytes_wanted) {
2784 		ino64_t objnum;
2785 		ushort_t reclen;
2786 		off64_t *next = NULL;
2787 
2788 		/*
2789 		 * Special case `.', `..', and `.zfs'.
2790 		 */
2791 		if (offset == 0) {
2792 			(void) strcpy(zap.za_name, ".");
2793 			zap.za_normalization_conflict = 0;
2794 			objnum = zp->z_id;
2795 			type = DT_DIR;
2796 		} else if (offset == 1) {
2797 			(void) strcpy(zap.za_name, "..");
2798 			zap.za_normalization_conflict = 0;
2799 			objnum = parent;
2800 			type = DT_DIR;
2801 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2802 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2803 			zap.za_normalization_conflict = 0;
2804 			objnum = ZFSCTL_INO_ROOT;
2805 			type = DT_DIR;
2806 		} else {
2807 			/*
2808 			 * Grab next entry.
2809 			 */
2810 			if (error = zap_cursor_retrieve(&zc, &zap)) {
2811 				if ((*eofp = (error == ENOENT)) != 0)
2812 					break;
2813 				else
2814 					goto update;
2815 			}
2816 
2817 			if (zap.za_integer_length != 8 ||
2818 			    zap.za_num_integers != 1) {
2819 				cmn_err(CE_WARN, "zap_readdir: bad directory "
2820 				    "entry, obj = %lld, offset = %lld\n",
2821 				    (u_longlong_t)zp->z_id,
2822 				    (u_longlong_t)offset);
2823 				error = SET_ERROR(ENXIO);
2824 				goto update;
2825 			}
2826 
2827 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2828 			/*
2829 			 * MacOS X can extract the object type here such as:
2830 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2831 			 */
2832 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2833 
2834 			if (check_sysattrs && !zap.za_normalization_conflict) {
2835 #ifdef TODO
2836 				zap.za_normalization_conflict =
2837 				    xattr_sysattr_casechk(zap.za_name);
2838 #else
2839 				panic("%s:%u: TODO", __func__, __LINE__);
2840 #endif
2841 			}
2842 		}
2843 
2844 		if (flags & V_RDDIR_ACCFILTER) {
2845 			/*
2846 			 * If we have no access at all, don't include
2847 			 * this entry in the returned information
2848 			 */
2849 			znode_t	*ezp;
2850 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2851 				goto skip_entry;
2852 			if (!zfs_has_access(ezp, cr)) {
2853 				vrele(ZTOV(ezp));
2854 				goto skip_entry;
2855 			}
2856 			vrele(ZTOV(ezp));
2857 		}
2858 
2859 		if (flags & V_RDDIR_ENTFLAGS)
2860 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2861 		else
2862 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2863 
2864 		/*
2865 		 * Will this entry fit in the buffer?
2866 		 */
2867 		if (outcount + reclen > bufsize) {
2868 			/*
2869 			 * Did we manage to fit anything in the buffer?
2870 			 */
2871 			if (!outcount) {
2872 				error = SET_ERROR(EINVAL);
2873 				goto update;
2874 			}
2875 			break;
2876 		}
2877 		if (flags & V_RDDIR_ENTFLAGS) {
2878 			/*
2879 			 * Add extended flag entry:
2880 			 */
2881 			eodp->ed_ino = objnum;
2882 			eodp->ed_reclen = reclen;
2883 			/* NOTE: ed_off is the offset for the *next* entry */
2884 			next = &(eodp->ed_off);
2885 			eodp->ed_eflags = zap.za_normalization_conflict ?
2886 			    ED_CASE_CONFLICT : 0;
2887 			(void) strncpy(eodp->ed_name, zap.za_name,
2888 			    EDIRENT_NAMELEN(reclen));
2889 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2890 		} else {
2891 			/*
2892 			 * Add normal entry:
2893 			 */
2894 			odp->d_ino = objnum;
2895 			odp->d_reclen = reclen;
2896 			odp->d_namlen = strlen(zap.za_name);
2897 			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2898 			odp->d_type = type;
2899 			odp = (dirent64_t *)((intptr_t)odp + reclen);
2900 		}
2901 		outcount += reclen;
2902 
2903 		ASSERT(outcount <= bufsize);
2904 
2905 		/* Prefetch znode */
2906 		if (prefetch)
2907 			dmu_prefetch(os, objnum, 0, 0, 0,
2908 			    ZIO_PRIORITY_SYNC_READ);
2909 
2910 	skip_entry:
2911 		/*
2912 		 * Move to the next entry, fill in the previous offset.
2913 		 */
2914 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2915 			zap_cursor_advance(&zc);
2916 			offset = zap_cursor_serialize(&zc);
2917 		} else {
2918 			offset += 1;
2919 		}
2920 
2921 		if (cooks != NULL) {
2922 			*cooks++ = offset;
2923 			ncooks--;
2924 #ifdef __FreeBSD__
2925 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2926 #endif
2927 #ifdef __NetBSD__
2928 			KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks);
2929 #endif
2930 		}
2931 	}
2932 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2933 
2934 	/* Subtract unused cookies */
2935 	if (ncookies != NULL)
2936 		*ncookies -= ncooks;
2937 
2938 	if (!user && uio->uio_iovcnt == 1) {
2939 		iovp->iov_base += outcount;
2940 		iovp->iov_len -= outcount;
2941 		uio->uio_resid -= outcount;
2942 	} else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) {
2943 		/*
2944 		 * Reset the pointer.
2945 		 */
2946 		offset = uio->uio_loffset;
2947 	}
2948 
2949 update:
2950 	zap_cursor_fini(&zc);
2951 	if (user || uio->uio_iovcnt != 1)
2952 		kmem_free(outbuf, bufsize);
2953 
2954 	if (error == ENOENT)
2955 		error = 0;
2956 
2957 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2958 
2959 	uio->uio_loffset = offset;
2960 	ZFS_EXIT(zfsvfs);
2961 	if (error != 0 && cookies != NULL) {
2962 #ifdef __FreeBSD__
2963 		free(*cookies, M_TEMP);
2964 #endif
2965 #ifdef __NetBSD__
2966 		kmem_free(*cookies, ncooks * sizeof(off_t));
2967 #endif
2968 		*cookies = NULL;
2969 		*ncookies = 0;
2970 	}
2971 	return (error);
2972 }
2973 
2974 ulong_t zfs_fsync_sync_cnt = 4;
2975 
2976 static int
2977 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2978 {
2979 	znode_t	*zp = VTOZ(vp);
2980 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2981 
2982 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2983 
2984 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2985 		ZFS_ENTER(zfsvfs);
2986 		ZFS_VERIFY_ZP(zp);
2987 
2988 #ifdef __NetBSD__
2989 		if (!zp->z_unlinked)
2990 #endif
2991 		zil_commit(zfsvfs->z_log, zp->z_id);
2992 		ZFS_EXIT(zfsvfs);
2993 	}
2994 	return (0);
2995 }
2996 
2997 
2998 /*
2999  * Get the requested file attributes and place them in the provided
3000  * vattr structure.
3001  *
3002  *	IN:	vp	- vnode of file.
3003  *		vap	- va_mask identifies requested attributes.
3004  *			  If AT_XVATTR set, then optional attrs are requested
3005  *		flags	- ATTR_NOACLCHECK (CIFS server context)
3006  *		cr	- credentials of caller.
3007  *		ct	- caller context
3008  *
3009  *	OUT:	vap	- attribute values.
3010  *
3011  *	RETURN:	0 (always succeeds).
3012  */
3013 /* ARGSUSED */
3014 static int
3015 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3016     caller_context_t *ct)
3017 {
3018 	znode_t *zp = VTOZ(vp);
3019 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3020 	int	error = 0;
3021 	uint32_t blksize;
3022 	u_longlong_t nblocks;
3023 	uint64_t links;
3024 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
3025 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3026 	xoptattr_t *xoap = NULL;
3027 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3028 	sa_bulk_attr_t bulk[4];
3029 	int count = 0;
3030 
3031 	ZFS_ENTER(zfsvfs);
3032 	ZFS_VERIFY_ZP(zp);
3033 
3034 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
3035 
3036 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3037 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3038 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
3039 	if (vp->v_type == VBLK || vp->v_type == VCHR)
3040 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
3041 		    &rdev, 8);
3042 
3043 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
3044 		ZFS_EXIT(zfsvfs);
3045 		return (error);
3046 	}
3047 
3048 	/*
3049 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
3050 	 * Also, if we are the owner don't bother, since owner should
3051 	 * always be allowed to read basic attributes of file.
3052 	 */
3053 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
3054 	    (vap->va_uid != crgetuid(cr))) {
3055 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
3056 		    skipaclchk, cr)) {
3057 			ZFS_EXIT(zfsvfs);
3058 			return (error);
3059 		}
3060 	}
3061 
3062 	/*
3063 	 * Return all attributes.  It's cheaper to provide the answer
3064 	 * than to determine whether we were asked the question.
3065 	 */
3066 
3067 	vap->va_type = IFTOVT(zp->z_mode);
3068 	vap->va_mode = zp->z_mode & ~S_IFMT;
3069 #ifdef illumos
3070 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
3071 #endif
3072 #ifdef __FreeBSD__
3073 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
3074 	vap->va_nodeid = zp->z_id;
3075 #endif
3076 #ifdef __NetBSD__
3077 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid;
3078 	vap->va_nodeid = zp->z_id;
3079 	/*
3080 	 * If we are a snapshot mounted under .zfs, return
3081 	 * the object id of the snapshot to make getcwd happy.
3082 	 */
3083 	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
3084 		vnode_t *cvp = vp->v_mount->mnt_vnodecovered;
3085 
3086 		if (cvp && zfsctl_is_node(cvp))
3087 			vap->va_nodeid = dmu_objset_id(zfsvfs->z_os);
3088 	}
3089 #endif
3090 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
3091 		links = zp->z_links + 1;
3092 	else
3093 		links = zp->z_links;
3094 	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
3095 	vap->va_size = zp->z_size;
3096 #ifdef illumos
3097 	vap->va_rdev = vp->v_rdev;
3098 #else
3099 	if (vp->v_type == VBLK || vp->v_type == VCHR)
3100 		vap->va_rdev = zfs_cmpldev(rdev);
3101 #endif
3102 	vap->va_seq = zp->z_seq;
3103 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
3104      	vap->va_filerev = zp->z_seq;
3105 
3106 	/*
3107 	 * Add in any requested optional attributes and the create time.
3108 	 * Also set the corresponding bits in the returned attribute bitmap.
3109 	 */
3110 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
3111 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
3112 			xoap->xoa_archive =
3113 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
3114 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
3115 		}
3116 
3117 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
3118 			xoap->xoa_readonly =
3119 			    ((zp->z_pflags & ZFS_READONLY) != 0);
3120 			XVA_SET_RTN(xvap, XAT_READONLY);
3121 		}
3122 
3123 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
3124 			xoap->xoa_system =
3125 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
3126 			XVA_SET_RTN(xvap, XAT_SYSTEM);
3127 		}
3128 
3129 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
3130 			xoap->xoa_hidden =
3131 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
3132 			XVA_SET_RTN(xvap, XAT_HIDDEN);
3133 		}
3134 
3135 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3136 			xoap->xoa_nounlink =
3137 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
3138 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
3139 		}
3140 
3141 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3142 			xoap->xoa_immutable =
3143 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
3144 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
3145 		}
3146 
3147 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3148 			xoap->xoa_appendonly =
3149 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
3150 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
3151 		}
3152 
3153 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3154 			xoap->xoa_nodump =
3155 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
3156 			XVA_SET_RTN(xvap, XAT_NODUMP);
3157 		}
3158 
3159 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
3160 			xoap->xoa_opaque =
3161 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
3162 			XVA_SET_RTN(xvap, XAT_OPAQUE);
3163 		}
3164 
3165 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3166 			xoap->xoa_av_quarantined =
3167 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
3168 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
3169 		}
3170 
3171 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3172 			xoap->xoa_av_modified =
3173 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
3174 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
3175 		}
3176 
3177 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
3178 		    vp->v_type == VREG) {
3179 			zfs_sa_get_scanstamp(zp, xvap);
3180 		}
3181 
3182 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3183 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
3184 			XVA_SET_RTN(xvap, XAT_REPARSE);
3185 		}
3186 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
3187 			xoap->xoa_generation = zp->z_gen;
3188 			XVA_SET_RTN(xvap, XAT_GEN);
3189 		}
3190 
3191 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
3192 			xoap->xoa_offline =
3193 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
3194 			XVA_SET_RTN(xvap, XAT_OFFLINE);
3195 		}
3196 
3197 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
3198 			xoap->xoa_sparse =
3199 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
3200 			XVA_SET_RTN(xvap, XAT_SPARSE);
3201 		}
3202 	}
3203 
3204 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
3205 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
3206 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
3207 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
3208 
3209 
3210 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3211 	vap->va_blksize = blksize;
3212 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
3213 
3214 	if (zp->z_blksz == 0) {
3215 		/*
3216 		 * Block size hasn't been set; suggest maximal I/O transfers.
3217 		 */
3218 		vap->va_blksize = zfsvfs->z_max_blksz;
3219 	}
3220 
3221 	ZFS_EXIT(zfsvfs);
3222 	return (0);
3223 }
3224 
3225 /*
3226  * Set the file attributes to the values contained in the
3227  * vattr structure.
3228  *
3229  *	IN:	vp	- vnode of file to be modified.
3230  *		vap	- new attribute values.
3231  *			  If AT_XVATTR set, then optional attrs are being set
3232  *		flags	- ATTR_UTIME set if non-default time values provided.
3233  *			- ATTR_NOACLCHECK (CIFS context only).
3234  *		cr	- credentials of caller.
3235  *		ct	- caller context
3236  *
3237  *	RETURN:	0 on success, error code on failure.
3238  *
3239  * Timestamps:
3240  *	vp - ctime updated, mtime updated if size changed.
3241  */
3242 /* ARGSUSED */
3243 static int
3244 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3245     caller_context_t *ct)
3246 {
3247 	znode_t		*zp = VTOZ(vp);
3248 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3249 	zilog_t		*zilog;
3250 	dmu_tx_t	*tx;
3251 	vattr_t		oldva;
3252 	xvattr_t	tmpxvattr;
3253 	uint_t		mask = vap->va_mask;
3254 	uint_t		saved_mask = 0;
3255 	uint64_t	saved_mode;
3256 	int		trim_mask = 0;
3257 	uint64_t	new_mode;
3258 	uint64_t	new_uid, new_gid;
3259 	uint64_t	xattr_obj;
3260 	uint64_t	mtime[2], ctime[2];
3261 	znode_t		*attrzp;
3262 	int		need_policy = FALSE;
3263 	int		err, err2;
3264 	zfs_fuid_info_t *fuidp = NULL;
3265 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3266 	xoptattr_t	*xoap;
3267 	zfs_acl_t	*aclp;
3268 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3269 	boolean_t	fuid_dirtied = B_FALSE;
3270 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
3271 	int		count = 0, xattr_count = 0;
3272 
3273 	if (mask == 0)
3274 		return (0);
3275 
3276 	if (mask & AT_NOSET)
3277 		return (SET_ERROR(EINVAL));
3278 
3279 	ZFS_ENTER(zfsvfs);
3280 	ZFS_VERIFY_ZP(zp);
3281 
3282 	zilog = zfsvfs->z_log;
3283 
3284 	/*
3285 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3286 	 * that file system is at proper version level
3287 	 */
3288 
3289 	if (zfsvfs->z_use_fuids == B_FALSE &&
3290 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3291 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3292 	    (mask & AT_XVATTR))) {
3293 		ZFS_EXIT(zfsvfs);
3294 		return (SET_ERROR(EINVAL));
3295 	}
3296 
3297 	if (mask & AT_SIZE && vp->v_type == VDIR) {
3298 		ZFS_EXIT(zfsvfs);
3299 		return (SET_ERROR(EISDIR));
3300 	}
3301 
3302 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3303 		ZFS_EXIT(zfsvfs);
3304 		return (SET_ERROR(EINVAL));
3305 	}
3306 
3307 	/*
3308 	 * If this is an xvattr_t, then get a pointer to the structure of
3309 	 * optional attributes.  If this is NULL, then we have a vattr_t.
3310 	 */
3311 	xoap = xva_getxoptattr(xvap);
3312 
3313 	xva_init(&tmpxvattr);
3314 
3315 	/*
3316 	 * Immutable files can only alter immutable bit and atime
3317 	 */
3318 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3319 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3320 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3321 		ZFS_EXIT(zfsvfs);
3322 		return (SET_ERROR(EPERM));
3323 	}
3324 
3325 	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3326 		ZFS_EXIT(zfsvfs);
3327 		return (SET_ERROR(EPERM));
3328 	}
3329 
3330 	/*
3331 	 * Verify timestamps doesn't overflow 32 bits.
3332 	 * ZFS can handle large timestamps, but 32bit syscalls can't
3333 	 * handle times greater than 2039.  This check should be removed
3334 	 * once large timestamps are fully supported.
3335 	 */
3336 	if (mask & (AT_ATIME | AT_MTIME)) {
3337 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3338 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3339 			ZFS_EXIT(zfsvfs);
3340 			return (SET_ERROR(EOVERFLOW));
3341 		}
3342 	}
3343 	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
3344 	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
3345 		ZFS_EXIT(zfsvfs);
3346 		return (SET_ERROR(EOVERFLOW));
3347 	}
3348 
3349 	attrzp = NULL;
3350 	aclp = NULL;
3351 
3352 	/* Can this be moved to before the top label? */
3353 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3354 		ZFS_EXIT(zfsvfs);
3355 		return (SET_ERROR(EROFS));
3356 	}
3357 
3358 	/*
3359 	 * First validate permissions
3360 	 */
3361 
3362 	if (mask & AT_SIZE) {
3363 		/*
3364 		 * XXX - Note, we are not providing any open
3365 		 * mode flags here (like FNDELAY), so we may
3366 		 * block if there are locks present... this
3367 		 * should be addressed in openat().
3368 		 */
3369 		/* XXX - would it be OK to generate a log record here? */
3370 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3371 		if (err) {
3372 			ZFS_EXIT(zfsvfs);
3373 			return (err);
3374 		}
3375 	}
3376 
3377 	if (mask & (AT_ATIME|AT_MTIME) ||
3378 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3379 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3380 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3381 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3382 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3383 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3384 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3385 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3386 		    skipaclchk, cr);
3387 	}
3388 
3389 	if (mask & (AT_UID|AT_GID)) {
3390 		int	idmask = (mask & (AT_UID|AT_GID));
3391 		int	take_owner;
3392 		int	take_group;
3393 
3394 		/*
3395 		 * NOTE: even if a new mode is being set,
3396 		 * we may clear S_ISUID/S_ISGID bits.
3397 		 */
3398 
3399 		if (!(mask & AT_MODE))
3400 			vap->va_mode = zp->z_mode;
3401 
3402 		/*
3403 		 * Take ownership or chgrp to group we are a member of
3404 		 */
3405 
3406 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3407 		take_group = (mask & AT_GID) &&
3408 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3409 
3410 		/*
3411 		 * If both AT_UID and AT_GID are set then take_owner and
3412 		 * take_group must both be set in order to allow taking
3413 		 * ownership.
3414 		 *
3415 		 * Otherwise, send the check through secpolicy_vnode_setattr()
3416 		 *
3417 		 */
3418 
3419 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3420 		    ((idmask == AT_UID) && take_owner) ||
3421 		    ((idmask == AT_GID) && take_group)) {
3422 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3423 			    skipaclchk, cr) == 0) {
3424 				/*
3425 				 * Remove setuid/setgid for non-privileged users
3426 				 */
3427 				secpolicy_setid_clear(vap, vp, cr);
3428 				trim_mask = (mask & (AT_UID|AT_GID));
3429 			} else {
3430 				need_policy =  TRUE;
3431 			}
3432 		} else {
3433 			need_policy =  TRUE;
3434 		}
3435 	}
3436 
3437 	oldva.va_mode = zp->z_mode;
3438 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3439 	if (mask & AT_XVATTR) {
3440 		/*
3441 		 * Update xvattr mask to include only those attributes
3442 		 * that are actually changing.
3443 		 *
3444 		 * the bits will be restored prior to actually setting
3445 		 * the attributes so the caller thinks they were set.
3446 		 */
3447 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3448 			if (xoap->xoa_appendonly !=
3449 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3450 				need_policy = TRUE;
3451 			} else {
3452 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3453 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3454 			}
3455 		}
3456 
3457 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3458 			if (xoap->xoa_nounlink !=
3459 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3460 				need_policy = TRUE;
3461 			} else {
3462 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3463 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3464 			}
3465 		}
3466 
3467 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3468 			if (xoap->xoa_immutable !=
3469 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3470 				need_policy = TRUE;
3471 			} else {
3472 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3473 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3474 			}
3475 		}
3476 
3477 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3478 			if (xoap->xoa_nodump !=
3479 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3480 				need_policy = TRUE;
3481 			} else {
3482 				XVA_CLR_REQ(xvap, XAT_NODUMP);
3483 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3484 			}
3485 		}
3486 
3487 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3488 			if (xoap->xoa_av_modified !=
3489 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3490 				need_policy = TRUE;
3491 			} else {
3492 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3493 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3494 			}
3495 		}
3496 
3497 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3498 			if ((vp->v_type != VREG &&
3499 			    xoap->xoa_av_quarantined) ||
3500 			    xoap->xoa_av_quarantined !=
3501 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3502 				need_policy = TRUE;
3503 			} else {
3504 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3505 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3506 			}
3507 		}
3508 
3509 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3510 			ZFS_EXIT(zfsvfs);
3511 			return (SET_ERROR(EPERM));
3512 		}
3513 
3514 		if (need_policy == FALSE &&
3515 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3516 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3517 			need_policy = TRUE;
3518 		}
3519 	}
3520 
3521 	if (mask & AT_MODE) {
3522 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3523 			err = secpolicy_setid_setsticky_clear(vp, vap,
3524 			    &oldva, cr);
3525 			if (err) {
3526 				ZFS_EXIT(zfsvfs);
3527 				return (err);
3528 			}
3529 			trim_mask |= AT_MODE;
3530 		} else {
3531 			need_policy = TRUE;
3532 		}
3533 	}
3534 
3535 	if (need_policy) {
3536 		/*
3537 		 * If trim_mask is set then take ownership
3538 		 * has been granted or write_acl is present and user
3539 		 * has the ability to modify mode.  In that case remove
3540 		 * UID|GID and or MODE from mask so that
3541 		 * secpolicy_vnode_setattr() doesn't revoke it.
3542 		 */
3543 
3544 		if (trim_mask) {
3545 			saved_mask = vap->va_mask;
3546 			vap->va_mask &= ~trim_mask;
3547 			if (trim_mask & AT_MODE) {
3548 				/*
3549 				 * Save the mode, as secpolicy_vnode_setattr()
3550 				 * will overwrite it with ova.va_mode.
3551 				 */
3552 				saved_mode = vap->va_mode;
3553 			}
3554 		}
3555 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3556 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3557 		if (err) {
3558 			ZFS_EXIT(zfsvfs);
3559 			return (err);
3560 		}
3561 
3562 		if (trim_mask) {
3563 			vap->va_mask |= saved_mask;
3564 			if (trim_mask & AT_MODE) {
3565 				/*
3566 				 * Recover the mode after
3567 				 * secpolicy_vnode_setattr().
3568 				 */
3569 				vap->va_mode = saved_mode;
3570 			}
3571 		}
3572 	}
3573 
3574 	/*
3575 	 * secpolicy_vnode_setattr, or take ownership may have
3576 	 * changed va_mask
3577 	 */
3578 	mask = vap->va_mask;
3579 
3580 	if ((mask & (AT_UID | AT_GID))) {
3581 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3582 		    &xattr_obj, sizeof (xattr_obj));
3583 
3584 		if (err == 0 && xattr_obj) {
3585 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3586 			if (err == 0) {
3587 				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3588 				if (err != 0)
3589 					vrele(ZTOV(attrzp));
3590 			}
3591 			if (err)
3592 				goto out2;
3593 		}
3594 		if (mask & AT_UID) {
3595 			new_uid = zfs_fuid_create(zfsvfs,
3596 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3597 			if (new_uid != zp->z_uid &&
3598 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3599 				if (attrzp)
3600 					vput(ZTOV(attrzp));
3601 				err = SET_ERROR(EDQUOT);
3602 				goto out2;
3603 			}
3604 		}
3605 
3606 		if (mask & AT_GID) {
3607 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3608 			    cr, ZFS_GROUP, &fuidp);
3609 			if (new_gid != zp->z_gid &&
3610 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3611 				if (attrzp)
3612 					vput(ZTOV(attrzp));
3613 				err = SET_ERROR(EDQUOT);
3614 				goto out2;
3615 			}
3616 		}
3617 	}
3618 	tx = dmu_tx_create(zfsvfs->z_os);
3619 
3620 	if (mask & AT_MODE) {
3621 		uint64_t pmode = zp->z_mode;
3622 		uint64_t acl_obj;
3623 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3624 
3625 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3626 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3627 			err = SET_ERROR(EPERM);
3628 			goto out;
3629 		}
3630 
3631 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3632 			goto out;
3633 
3634 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3635 			/*
3636 			 * Are we upgrading ACL from old V0 format
3637 			 * to V1 format?
3638 			 */
3639 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3640 			    zfs_znode_acl_version(zp) ==
3641 			    ZFS_ACL_VERSION_INITIAL) {
3642 				dmu_tx_hold_free(tx, acl_obj, 0,
3643 				    DMU_OBJECT_END);
3644 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3645 				    0, aclp->z_acl_bytes);
3646 			} else {
3647 				dmu_tx_hold_write(tx, acl_obj, 0,
3648 				    aclp->z_acl_bytes);
3649 			}
3650 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3651 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3652 			    0, aclp->z_acl_bytes);
3653 		}
3654 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3655 	} else {
3656 		if ((mask & AT_XVATTR) &&
3657 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3658 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3659 		else
3660 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3661 	}
3662 
3663 	if (attrzp) {
3664 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3665 	}
3666 
3667 	fuid_dirtied = zfsvfs->z_fuid_dirty;
3668 	if (fuid_dirtied)
3669 		zfs_fuid_txhold(zfsvfs, tx);
3670 
3671 	zfs_sa_upgrade_txholds(tx, zp);
3672 
3673 	err = dmu_tx_assign(tx, TXG_WAIT);
3674 	if (err)
3675 		goto out;
3676 
3677 	count = 0;
3678 	/*
3679 	 * Set each attribute requested.
3680 	 * We group settings according to the locks they need to acquire.
3681 	 *
3682 	 * Note: you cannot set ctime directly, although it will be
3683 	 * updated as a side-effect of calling this function.
3684 	 */
3685 
3686 	if (mask & (AT_UID|AT_GID|AT_MODE))
3687 		mutex_enter(&zp->z_acl_lock);
3688 
3689 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3690 	    &zp->z_pflags, sizeof (zp->z_pflags));
3691 
3692 	if (attrzp) {
3693 		if (mask & (AT_UID|AT_GID|AT_MODE))
3694 			mutex_enter(&attrzp->z_acl_lock);
3695 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3696 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3697 		    sizeof (attrzp->z_pflags));
3698 	}
3699 
3700 	if (mask & (AT_UID|AT_GID)) {
3701 
3702 		if (mask & AT_UID) {
3703 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3704 			    &new_uid, sizeof (new_uid));
3705 			zp->z_uid = new_uid;
3706 			if (attrzp) {
3707 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3708 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3709 				    sizeof (new_uid));
3710 				attrzp->z_uid = new_uid;
3711 			}
3712 		}
3713 
3714 		if (mask & AT_GID) {
3715 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3716 			    NULL, &new_gid, sizeof (new_gid));
3717 			zp->z_gid = new_gid;
3718 			if (attrzp) {
3719 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3720 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3721 				    sizeof (new_gid));
3722 				attrzp->z_gid = new_gid;
3723 			}
3724 		}
3725 		if (!(mask & AT_MODE)) {
3726 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3727 			    NULL, &new_mode, sizeof (new_mode));
3728 			new_mode = zp->z_mode;
3729 		}
3730 		err = zfs_acl_chown_setattr(zp);
3731 		ASSERT(err == 0);
3732 		if (attrzp) {
3733 			err = zfs_acl_chown_setattr(attrzp);
3734 			ASSERT(err == 0);
3735 		}
3736 	}
3737 
3738 	if (mask & AT_MODE) {
3739 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3740 		    &new_mode, sizeof (new_mode));
3741 		zp->z_mode = new_mode;
3742 		ASSERT3U((uintptr_t)aclp, !=, 0);
3743 		err = zfs_aclset_common(zp, aclp, cr, tx);
3744 		ASSERT0(err);
3745 		if (zp->z_acl_cached)
3746 			zfs_acl_free(zp->z_acl_cached);
3747 		zp->z_acl_cached = aclp;
3748 		aclp = NULL;
3749 	}
3750 
3751 
3752 	if (mask & AT_ATIME) {
3753 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3754 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3755 		    &zp->z_atime, sizeof (zp->z_atime));
3756 	}
3757 
3758 	if (mask & AT_MTIME) {
3759 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3760 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3761 		    mtime, sizeof (mtime));
3762 	}
3763 
3764 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3765 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3766 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3767 		    NULL, mtime, sizeof (mtime));
3768 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3769 		    &ctime, sizeof (ctime));
3770 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3771 		    B_TRUE);
3772 	} else if (mask != 0) {
3773 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3774 		    &ctime, sizeof (ctime));
3775 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3776 		    B_TRUE);
3777 		if (attrzp) {
3778 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3779 			    SA_ZPL_CTIME(zfsvfs), NULL,
3780 			    &ctime, sizeof (ctime));
3781 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3782 			    mtime, ctime, B_TRUE);
3783 		}
3784 	}
3785 	/*
3786 	 * Do this after setting timestamps to prevent timestamp
3787 	 * update from toggling bit
3788 	 */
3789 
3790 	if (xoap && (mask & AT_XVATTR)) {
3791 
3792 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3793 			xoap->xoa_createtime = vap->va_birthtime;
3794 		/*
3795 		 * restore trimmed off masks
3796 		 * so that return masks can be set for caller.
3797 		 */
3798 
3799 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3800 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3801 		}
3802 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3803 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3804 		}
3805 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3806 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3807 		}
3808 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3809 			XVA_SET_REQ(xvap, XAT_NODUMP);
3810 		}
3811 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3812 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3813 		}
3814 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3815 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3816 		}
3817 
3818 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3819 			ASSERT(vp->v_type == VREG);
3820 
3821 		zfs_xvattr_set(zp, xvap, tx);
3822 	}
3823 
3824 	if (fuid_dirtied)
3825 		zfs_fuid_sync(zfsvfs, tx);
3826 
3827 	if (mask != 0)
3828 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3829 
3830 	if (mask & (AT_UID|AT_GID|AT_MODE))
3831 		mutex_exit(&zp->z_acl_lock);
3832 
3833 	if (attrzp) {
3834 		if (mask & (AT_UID|AT_GID|AT_MODE))
3835 			mutex_exit(&attrzp->z_acl_lock);
3836 	}
3837 out:
3838 	if (err == 0 && attrzp) {
3839 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3840 		    xattr_count, tx);
3841 		ASSERT(err2 == 0);
3842 	}
3843 
3844 	if (attrzp)
3845 		vput(ZTOV(attrzp));
3846 
3847 	if (aclp)
3848 		zfs_acl_free(aclp);
3849 
3850 	if (fuidp) {
3851 		zfs_fuid_info_free(fuidp);
3852 		fuidp = NULL;
3853 	}
3854 
3855 	if (err) {
3856 		dmu_tx_abort(tx);
3857 	} else {
3858 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3859 		dmu_tx_commit(tx);
3860 	}
3861 
3862 out2:
3863 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3864 		zil_commit(zilog, 0);
3865 
3866 	ZFS_EXIT(zfsvfs);
3867 	return (err);
3868 }
3869 
3870 /*
3871  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3872  * fail to acquire any lock in the path we will drop all held locks,
3873  * acquire the new lock in a blocking fashion, and then release it and
3874  * restart the rename.  This acquire/release step ensures that we do not
3875  * spin on a lock waiting for release.  On error release all vnode locks
3876  * and decrement references the way tmpfs_rename() would do.
3877  */
3878 static int
3879 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3880     struct vnode *tdvp, struct vnode **tvpp,
3881     const struct componentname *scnp, const struct componentname *tcnp)
3882 {
3883 	zfsvfs_t	*zfsvfs;
3884 	struct vnode	*nvp, *svp, *tvp;
3885 	znode_t		*sdzp, *tdzp, *szp, *tzp;
3886 #ifdef __FreeBSD__
3887 	const char	*snm = scnp->cn_nameptr;
3888 	const char	*tnm = tcnp->cn_nameptr;
3889 #endif
3890 #ifdef __NetBSD__
3891 	char *snm, *tnm;
3892 #endif
3893 	int error;
3894 
3895 #ifdef __FreeBSD__
3896 	VOP_UNLOCK(tdvp, 0);
3897 	if (*tvpp != NULL && *tvpp != tdvp)
3898 		VOP_UNLOCK(*tvpp, 0);
3899 #endif
3900 
3901 relock:
3902 	error = vn_lock(sdvp, LK_EXCLUSIVE);
3903 	if (error)
3904 		goto out;
3905 	sdzp = VTOZ(sdvp);
3906 
3907 #ifdef __NetBSD__
3908 	if (tdvp == sdvp) {
3909 	} else {
3910 #endif
3911 	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3912 	if (error != 0) {
3913 		VOP_UNLOCK(sdvp, 0);
3914 		if (error != EBUSY)
3915 			goto out;
3916 		error = vn_lock(tdvp, LK_EXCLUSIVE);
3917 		if (error)
3918 			goto out;
3919 		VOP_UNLOCK(tdvp, 0);
3920 		goto relock;
3921 	}
3922 #ifdef __NetBSD__
3923 	} /* end if (tdvp == sdvp) */
3924 #endif
3925 
3926 	tdzp = VTOZ(tdvp);
3927 
3928 	/*
3929 	 * Before using sdzp and tdzp we must ensure that they are live.
3930 	 * As a porting legacy from illumos we have two things to worry
3931 	 * about.  One is typical for FreeBSD and it is that the vnode is
3932 	 * not reclaimed (doomed).  The other is that the znode is live.
3933 	 * The current code can invalidate the znode without acquiring the
3934 	 * corresponding vnode lock if the object represented by the znode
3935 	 * and vnode is no longer valid after a rollback or receive operation.
3936 	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3937 	 * that protects the znodes from the invalidation.
3938 	 */
3939 	zfsvfs = sdzp->z_zfsvfs;
3940 	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3941 	ZFS_ENTER(zfsvfs);
3942 
3943 	/*
3944 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3945 	 * bypassing the cleanup code in the case of an error.
3946 	 */
3947 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3948 		ZFS_EXIT(zfsvfs);
3949 		VOP_UNLOCK(sdvp, 0);
3950 #ifdef __NetBSD__
3951 		if (tdvp != sdvp)
3952 #endif
3953 		VOP_UNLOCK(tdvp, 0);
3954 		error = SET_ERROR(EIO);
3955 		goto out;
3956 	}
3957 
3958 	/*
3959 	 * Re-resolve svp to be certain it still exists and fetch the
3960 	 * correct vnode.
3961 	 */
3962 #ifdef __NetBSD__
3963 	/* ZFS wants a null-terminated name. */
3964 	snm = PNBUF_GET();
3965 	strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
3966 #endif
3967 	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3968 #ifdef __NetBSD__
3969 	PNBUF_PUT(snm);
3970 #endif
3971 	if (error != 0) {
3972 		/* Source entry invalid or not there. */
3973 		ZFS_EXIT(zfsvfs);
3974 		VOP_UNLOCK(sdvp, 0);
3975 #ifdef __NetBSD__
3976 		if (tdvp != sdvp)
3977 #endif
3978 		VOP_UNLOCK(tdvp, 0);
3979 		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3980 		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3981 			error = SET_ERROR(EINVAL);
3982 		goto out;
3983 	}
3984 	svp = ZTOV(szp);
3985 
3986 	/*
3987 	 * Re-resolve tvp, if it disappeared we just carry on.
3988 	 */
3989 #ifdef __NetBSD__
3990 	/* ZFS wants a null-terminated name. */
3991 	tnm = PNBUF_GET();
3992 	strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
3993 #endif
3994 	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3995 #ifdef __NetBSD__
3996 	PNBUF_PUT(tnm);
3997 #endif
3998 	if (error != 0) {
3999 		ZFS_EXIT(zfsvfs);
4000 		VOP_UNLOCK(sdvp, 0);
4001 #ifdef __NetBSD__
4002 		if (tdvp != sdvp)
4003 #endif
4004 		VOP_UNLOCK(tdvp, 0);
4005 		vrele(svp);
4006 		if ((tcnp->cn_flags & ISDOTDOT) != 0)
4007 			error = SET_ERROR(EINVAL);
4008 		goto out;
4009 	}
4010 	if (tzp != NULL)
4011 		tvp = ZTOV(tzp);
4012 	else
4013 		tvp = NULL;
4014 
4015 	/*
4016 	 * At present the vnode locks must be acquired before z_teardown_lock,
4017 	 * although it would be more logical to use the opposite order.
4018 	 */
4019 	ZFS_EXIT(zfsvfs);
4020 
4021 	/*
4022 	 * Now try acquire locks on svp and tvp.
4023 	 */
4024 	nvp = svp;
4025 	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4026 	if (error != 0) {
4027 		VOP_UNLOCK(sdvp, 0);
4028 #ifdef __NetBSD__
4029 		if (tdvp != sdvp)
4030 #endif
4031 		VOP_UNLOCK(tdvp, 0);
4032 		if (tvp != NULL)
4033 			vrele(tvp);
4034 		if (error != EBUSY) {
4035 			vrele(nvp);
4036 			goto out;
4037 		}
4038 		error = vn_lock(nvp, LK_EXCLUSIVE);
4039 		if (error != 0) {
4040 			vrele(nvp);
4041 			goto out;
4042 		}
4043 		VOP_UNLOCK(nvp, 0);
4044 		/*
4045 		 * Concurrent rename race.
4046 		 * XXX ?
4047 		 */
4048 		if (nvp == tdvp) {
4049 			vrele(nvp);
4050 			error = SET_ERROR(EINVAL);
4051 			goto out;
4052 		}
4053 #ifdef __NetBSD__
4054 		if (*svpp != NULL)
4055 #endif
4056 		vrele(*svpp);
4057 		*svpp = nvp;
4058 		goto relock;
4059 	}
4060 #ifdef __NetBSD__
4061 	if (*svpp != NULL)
4062 #endif
4063 	vrele(*svpp);
4064 	*svpp = nvp;
4065 
4066 	if (*tvpp != NULL)
4067 		vrele(*tvpp);
4068 	*tvpp = NULL;
4069 	if (tvp != NULL) {
4070 		nvp = tvp;
4071 
4072 #ifdef __NetBSD__
4073 		if (tvp == svp || tvp == sdvp) {
4074 		} else {
4075 #endif
4076 		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4077 		if (error != 0) {
4078 			VOP_UNLOCK(sdvp, 0);
4079 #ifdef __NetBSD__
4080 			if (tdvp != sdvp)
4081 #endif
4082 			VOP_UNLOCK(tdvp, 0);
4083 #ifdef __NetBSD__
4084 			if (*svpp != tdvp)
4085 #endif
4086 			VOP_UNLOCK(*svpp, 0);
4087 			if (error != EBUSY) {
4088 				vrele(nvp);
4089 				goto out;
4090 			}
4091 			error = vn_lock(nvp, LK_EXCLUSIVE);
4092 			if (error != 0) {
4093 				vrele(nvp);
4094 				goto out;
4095 			}
4096 			vput(nvp);
4097 			goto relock;
4098 		}
4099 #ifdef __NetBSD__
4100 		} /* end if (tvp == svp || tvp == sdvp) */
4101 #endif
4102 
4103 		*tvpp = nvp;
4104 	}
4105 
4106 	KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE);
4107 	KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE);
4108 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4109 	KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE);
4110 
4111 	return (0);
4112 
4113 out:
4114 	return (error);
4115 }
4116 
4117 /*
4118  * Note that we must use VRELE_ASYNC in this function as it walks
4119  * up the directory tree and vrele may need to acquire an exclusive
4120  * lock if a last reference to a vnode is dropped.
4121  */
4122 static int
4123 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
4124 {
4125 	zfsvfs_t	*zfsvfs;
4126 	znode_t		*zp, *zp1;
4127 	uint64_t	parent;
4128 	int		error;
4129 
4130 	zfsvfs = tdzp->z_zfsvfs;
4131 	if (tdzp == szp)
4132 		return (SET_ERROR(EINVAL));
4133 	if (tdzp == sdzp)
4134 		return (0);
4135 	if (tdzp->z_id == zfsvfs->z_root)
4136 		return (0);
4137 	zp = tdzp;
4138 	for (;;) {
4139 		ASSERT(!zp->z_unlinked);
4140 		if ((error = sa_lookup(zp->z_sa_hdl,
4141 		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
4142 			break;
4143 
4144 		if (parent == szp->z_id) {
4145 			error = SET_ERROR(EINVAL);
4146 			break;
4147 		}
4148 		if (parent == zfsvfs->z_root)
4149 			break;
4150 		if (parent == sdzp->z_id)
4151 			break;
4152 
4153 		error = zfs_zget(zfsvfs, parent, &zp1);
4154 		if (error != 0)
4155 			break;
4156 
4157 		if (zp != tdzp)
4158 			VN_RELE_ASYNC(ZTOV(zp),
4159 			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4160 		zp = zp1;
4161 	}
4162 
4163 	if (error == ENOTDIR)
4164 		panic("checkpath: .. not a directory\n");
4165 	if (zp != tdzp)
4166 		VN_RELE_ASYNC(ZTOV(zp),
4167 		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4168 	return (error);
4169 }
4170 
4171 /*
4172  * Move an entry from the provided source directory to the target
4173  * directory.  Change the entry name as indicated.
4174  *
4175  *	IN:	sdvp	- Source directory containing the "old entry".
4176  *		snm	- Old entry name.
4177  *		tdvp	- Target directory to contain the "new entry".
4178  *		tnm	- New entry name.
4179  *		cr	- credentials of caller.
4180  *		ct	- caller context
4181  *		flags	- case flags
4182  *
4183  *	RETURN:	0 on success, error code on failure.
4184  *
4185  * Timestamps:
4186  *	sdvp,tdvp - ctime|mtime updated
4187  */
4188 /*ARGSUSED*/
4189 static int
4190 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
4191     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
4192     cred_t *cr)
4193 {
4194 	zfsvfs_t	*zfsvfs;
4195 	znode_t		*sdzp, *tdzp, *szp, *tzp;
4196 	zilog_t		*zilog = NULL;
4197 	dmu_tx_t	*tx;
4198 #ifdef __FreeBSD__
4199 	char		*snm = __UNCONST(scnp->cn_nameptr);
4200 	char		*tnm = __UNCONST(tcnp->cn_nameptr);
4201 #endif
4202 #ifdef __NetBSD__
4203 	char *snm, *tnm;
4204 #endif
4205 	int		error = 0;
4206 
4207 	/* Reject renames across filesystems. */
4208 	if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) ||
4209 	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
4210 		error = SET_ERROR(EXDEV);
4211 		goto out;
4212 	}
4213 
4214 	if (zfsctl_is_node(tdvp)) {
4215 		error = SET_ERROR(EXDEV);
4216 		goto out;
4217 	}
4218 
4219 	/*
4220 	 * Lock all four vnodes to ensure safety and semantics of renaming.
4221 	 */
4222 	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
4223 	if (error != 0) {
4224 		/* no vnodes are locked in the case of error here */
4225 		return (error);
4226 	}
4227 
4228 	tdzp = VTOZ(tdvp);
4229 	sdzp = VTOZ(sdvp);
4230 	zfsvfs = tdzp->z_zfsvfs;
4231 	zilog = zfsvfs->z_log;
4232 #ifdef __NetBSD__
4233 	/* ZFS wants a null-terminated name. */
4234 	snm = PNBUF_GET();
4235 	strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4236 	tnm = PNBUF_GET();
4237 	strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4238 #endif
4239 
4240 	/*
4241 	 * After we re-enter ZFS_ENTER() we will have to revalidate all
4242 	 * znodes involved.
4243 	 */
4244 	ZFS_ENTER(zfsvfs);
4245 
4246 	if (zfsvfs->z_utf8 && u8_validate(tnm,
4247 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4248 		error = SET_ERROR(EILSEQ);
4249 		goto unlockout;
4250 	}
4251 
4252 #ifndef __NetBSD__
4253 	/* If source and target are the same file, there is nothing to do. */
4254 	if ((*svpp) == (*tvpp)) {
4255 		error = 0;
4256 		goto unlockout;
4257 	}
4258 #endif
4259 
4260 	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
4261 	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
4262 	    (*tvpp)->v_mountedhere != NULL)) {
4263 		error = SET_ERROR(EXDEV);
4264 		goto unlockout;
4265 	}
4266 
4267 	/*
4268 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
4269 	 * bypassing the cleanup code in the case of an error.
4270 	 */
4271 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
4272 		error = SET_ERROR(EIO);
4273 		goto unlockout;
4274 	}
4275 
4276 	szp = VTOZ(*svpp);
4277 	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
4278 	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
4279 		error = SET_ERROR(EIO);
4280 		goto unlockout;
4281 	}
4282 
4283 	/*
4284 	 * This is to prevent the creation of links into attribute space
4285 	 * by renaming a linked file into/outof an attribute directory.
4286 	 * See the comment in zfs_link() for why this is considered bad.
4287 	 */
4288 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
4289 		error = SET_ERROR(EINVAL);
4290 		goto unlockout;
4291 	}
4292 
4293 	/*
4294 	 * Must have write access at the source to remove the old entry
4295 	 * and write access at the target to create the new entry.
4296 	 * Note that if target and source are the same, this can be
4297 	 * done in a single check.
4298 	 */
4299 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
4300 		goto unlockout;
4301 
4302 	if ((*svpp)->v_type == VDIR) {
4303 		/*
4304 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
4305 		 */
4306 		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
4307 		    sdzp == szp ||
4308 		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
4309 			error = SET_ERROR(EINVAL);
4310 			goto unlockout;
4311 		}
4312 
4313 		/*
4314 		 * Check to make sure rename is valid.
4315 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
4316 		 */
4317 		if (error = zfs_rename_check(szp, sdzp, tdzp))
4318 			goto unlockout;
4319 	}
4320 
4321 	/*
4322 	 * Does target exist?
4323 	 */
4324 	if (tzp) {
4325 		/*
4326 		 * Source and target must be the same type.
4327 		 */
4328 		if ((*svpp)->v_type == VDIR) {
4329 			if ((*tvpp)->v_type != VDIR) {
4330 				error = SET_ERROR(ENOTDIR);
4331 				goto unlockout;
4332 			} else {
4333 				cache_purge(tdvp);
4334 				if (sdvp != tdvp)
4335 					cache_purge(sdvp);
4336 			}
4337 		} else {
4338 			if ((*tvpp)->v_type == VDIR) {
4339 				error = SET_ERROR(EISDIR);
4340 				goto unlockout;
4341 			}
4342 		}
4343 
4344 		/*
4345 		 * POSIX dictates that when the source and target
4346 		 * entries refer to the same file object, rename
4347 		 * must do nothing and exit without error.
4348 		 */
4349 #ifndef __NetBSD__
4350 		/*
4351 		 * But on NetBSD we have a different system call to do
4352 		 * this, posix_rename, which sorta kinda handles this
4353 		 * case (modulo races), and our tests expect BSD
4354 		 * semantics for rename, so we'll do that until we can
4355 		 * push the choice between BSD and POSIX semantics into
4356 		 * the VOP_RENAME protocol as a flag.
4357 		 */
4358 		if (szp->z_id == tzp->z_id) {
4359 			error = 0;
4360 			goto unlockout;
4361 		}
4362 #endif
4363 	}
4364 
4365 	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
4366 	if (tzp)
4367 		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
4368 
4369 	/*
4370 	 * notify the target directory if it is not the same
4371 	 * as source directory.
4372 	 */
4373 	if (tdvp != sdvp) {
4374 		vnevent_rename_dest_dir(tdvp, ct);
4375 	}
4376 
4377 	tx = dmu_tx_create(zfsvfs->z_os);
4378 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4379 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4380 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4381 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4382 	if (sdzp != tdzp) {
4383 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4384 		zfs_sa_upgrade_txholds(tx, tdzp);
4385 	}
4386 	if (tzp) {
4387 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4388 		zfs_sa_upgrade_txholds(tx, tzp);
4389 	}
4390 
4391 	zfs_sa_upgrade_txholds(tx, szp);
4392 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4393 	error = dmu_tx_assign(tx, TXG_WAIT);
4394 	if (error) {
4395 		dmu_tx_abort(tx);
4396 		goto unlockout;
4397 	}
4398 
4399 
4400 	if (tzp && (tzp->z_id != szp->z_id))
4401 		/* Attempt to remove the existing target */
4402 		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
4403 
4404 	if (error == 0) {
4405 		if (!tzp || (tzp->z_id != szp->z_id))
4406 			error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
4407 		if (error == 0) {
4408 			szp->z_pflags |= ZFS_AV_MODIFIED;
4409 
4410 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4411 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4412 			ASSERT0(error);
4413 
4414 			error = zfs_link_destroy(sdzp, snm, szp, tx,
4415 			    /* Kludge for BSD rename semantics.  */
4416 			    tzp && tzp->z_id == szp->z_id ? 0: ZRENAMING, NULL);
4417 			if (error == 0) {
4418 				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
4419 				    snm, tdzp, tnm, szp);
4420 
4421 				/*
4422 				 * Update path information for the target vnode
4423 				 */
4424 				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
4425 			} else {
4426 				/*
4427 				 * At this point, we have successfully created
4428 				 * the target name, but have failed to remove
4429 				 * the source name.  Since the create was done
4430 				 * with the ZRENAMING flag, there are
4431 				 * complications; for one, the link count is
4432 				 * wrong.  The easiest way to deal with this
4433 				 * is to remove the newly created target, and
4434 				 * return the original error.  This must
4435 				 * succeed; fortunately, it is very unlikely to
4436 				 * fail, since we just created it.
4437 				 */
4438 				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
4439 				    ZRENAMING, NULL), ==, 0);
4440 			}
4441 		}
4442 		if (error == 0) {
4443 			cache_purge(*svpp);
4444 			if (*tvpp != NULL)
4445 				cache_purge(*tvpp);
4446 			cache_purge_negative(tdvp);
4447 		}
4448 	}
4449 
4450 	dmu_tx_commit(tx);
4451 
4452 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4453 		zil_commit(zilog, 0);
4454 
4455 unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
4456 	ZFS_EXIT(zfsvfs);
4457 
4458 	VOP_UNLOCK(*svpp, 0);
4459 	VOP_UNLOCK(sdvp, 0);
4460 #ifdef __NetBSD__
4461 	PNBUF_PUT(snm);
4462 	PNBUF_PUT(tnm);
4463 #endif
4464 
4465 	if (*tvpp != sdvp && *tvpp != *svpp)
4466 	if (*tvpp != NULL)
4467 		VOP_UNLOCK(*tvpp, 0);
4468 	if (tdvp != sdvp && tdvp != *svpp)
4469 	if (tdvp != *tvpp)
4470 		VOP_UNLOCK(tdvp, 0);
4471 
4472 out:
4473 	return (error);
4474 }
4475 
4476 /*
4477  * Insert the indicated symbolic reference entry into the directory.
4478  *
4479  *	IN:	dvp	- Directory to contain new symbolic link.
4480  *		link	- Name for new symlink entry.
4481  *		vap	- Attributes of new entry.
4482  *		cr	- credentials of caller.
4483  *		ct	- caller context
4484  *		flags	- case flags
4485  *
4486  *	RETURN:	0 on success, error code on failure.
4487  *
4488  * Timestamps:
4489  *	dvp - ctime|mtime updated
4490  */
4491 /*ARGSUSED*/
4492 static int
4493 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4494     cred_t *cr, kthread_t *td)
4495 {
4496 	znode_t		*zp, *dzp = VTOZ(dvp);
4497 	dmu_tx_t	*tx;
4498 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4499 	zilog_t		*zilog;
4500 	uint64_t	len = strlen(link);
4501 	int		error;
4502 	zfs_acl_ids_t	acl_ids;
4503 	boolean_t	fuid_dirtied;
4504 	uint64_t	txtype = TX_SYMLINK;
4505 	int		flags = 0;
4506 
4507 	ASSERT(vap->va_type == VLNK);
4508 
4509 	ZFS_ENTER(zfsvfs);
4510 	ZFS_VERIFY_ZP(dzp);
4511 	zilog = zfsvfs->z_log;
4512 
4513 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4514 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4515 		ZFS_EXIT(zfsvfs);
4516 		return (SET_ERROR(EILSEQ));
4517 	}
4518 
4519 	if (len > MAXPATHLEN) {
4520 		ZFS_EXIT(zfsvfs);
4521 		return (SET_ERROR(ENAMETOOLONG));
4522 	}
4523 
4524 	if ((error = zfs_acl_ids_create(dzp, 0,
4525 	    vap, cr, NULL, &acl_ids)) != 0) {
4526 		ZFS_EXIT(zfsvfs);
4527 		return (error);
4528 	}
4529 
4530 	/*
4531 	 * Attempt to lock directory; fail if entry already exists.
4532 	 */
4533 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4534 	if (error) {
4535 		zfs_acl_ids_free(&acl_ids);
4536 		ZFS_EXIT(zfsvfs);
4537 		return (error);
4538 	}
4539 
4540 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4541 		zfs_acl_ids_free(&acl_ids);
4542 		ZFS_EXIT(zfsvfs);
4543 		return (error);
4544 	}
4545 
4546 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4547 		zfs_acl_ids_free(&acl_ids);
4548 		ZFS_EXIT(zfsvfs);
4549 		return (SET_ERROR(EDQUOT));
4550 	}
4551 
4552 	getnewvnode_reserve(1);
4553 	tx = dmu_tx_create(zfsvfs->z_os);
4554 	fuid_dirtied = zfsvfs->z_fuid_dirty;
4555 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4556 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4557 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4558 	    ZFS_SA_BASE_ATTR_SIZE + len);
4559 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4560 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4561 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4562 		    acl_ids.z_aclp->z_acl_bytes);
4563 	}
4564 	if (fuid_dirtied)
4565 		zfs_fuid_txhold(zfsvfs, tx);
4566 	error = dmu_tx_assign(tx, TXG_WAIT);
4567 	if (error) {
4568 		zfs_acl_ids_free(&acl_ids);
4569 		dmu_tx_abort(tx);
4570 		getnewvnode_drop_reserve();
4571 		ZFS_EXIT(zfsvfs);
4572 		return (error);
4573 	}
4574 
4575 	/*
4576 	 * Create a new object for the symlink.
4577 	 * for version 4 ZPL datsets the symlink will be an SA attribute
4578 	 */
4579 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4580 
4581 	if (fuid_dirtied)
4582 		zfs_fuid_sync(zfsvfs, tx);
4583 
4584 	if (zp->z_is_sa)
4585 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4586 		    link, len, tx);
4587 	else
4588 		zfs_sa_symlink(zp, link, len, tx);
4589 
4590 	zp->z_size = len;
4591 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4592 	    &zp->z_size, sizeof (zp->z_size), tx);
4593 	/*
4594 	 * Insert the new object into the directory.
4595 	 */
4596 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4597 
4598 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4599 	*vpp = ZTOV(zp);
4600 
4601 	zfs_acl_ids_free(&acl_ids);
4602 
4603 	dmu_tx_commit(tx);
4604 
4605 	getnewvnode_drop_reserve();
4606 
4607 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4608 		zil_commit(zilog, 0);
4609 
4610 	ZFS_EXIT(zfsvfs);
4611 	return (error);
4612 }
4613 
4614 /*
4615  * Return, in the buffer contained in the provided uio structure,
4616  * the symbolic path referred to by vp.
4617  *
4618  *	IN:	vp	- vnode of symbolic link.
4619  *		uio	- structure to contain the link path.
4620  *		cr	- credentials of caller.
4621  *		ct	- caller context
4622  *
4623  *	OUT:	uio	- structure containing the link path.
4624  *
4625  *	RETURN:	0 on success, error code on failure.
4626  *
4627  * Timestamps:
4628  *	vp - atime updated
4629  */
4630 /* ARGSUSED */
4631 static int
4632 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4633 {
4634 	znode_t		*zp = VTOZ(vp);
4635 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4636 	int		error;
4637 
4638 	ZFS_ENTER(zfsvfs);
4639 	ZFS_VERIFY_ZP(zp);
4640 
4641 	if (zp->z_is_sa)
4642 		error = sa_lookup_uio(zp->z_sa_hdl,
4643 		    SA_ZPL_SYMLINK(zfsvfs), uio);
4644 	else
4645 		error = zfs_sa_readlink(zp, uio);
4646 
4647 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4648 
4649 	ZFS_EXIT(zfsvfs);
4650 	return (error);
4651 }
4652 
4653 /*
4654  * Insert a new entry into directory tdvp referencing svp.
4655  *
4656  *	IN:	tdvp	- Directory to contain new entry.
4657  *		svp	- vnode of new entry.
4658  *		name	- name of new entry.
4659  *		cr	- credentials of caller.
4660  *		ct	- caller context
4661  *
4662  *	RETURN:	0 on success, error code on failure.
4663  *
4664  * Timestamps:
4665  *	tdvp - ctime|mtime updated
4666  *	 svp - ctime updated
4667  */
4668 /* ARGSUSED */
4669 static int
4670 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4671     caller_context_t *ct, int flags)
4672 {
4673 	znode_t		*dzp = VTOZ(tdvp);
4674 	znode_t		*tzp, *szp;
4675 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4676 	zilog_t		*zilog;
4677 	dmu_tx_t	*tx;
4678 	int		error;
4679 	uint64_t	parent;
4680 	uid_t		owner;
4681 
4682 	ASSERT(tdvp->v_type == VDIR);
4683 
4684 	ZFS_ENTER(zfsvfs);
4685 	ZFS_VERIFY_ZP(dzp);
4686 	zilog = zfsvfs->z_log;
4687 
4688 	/*
4689 	 * POSIX dictates that we return EPERM here.
4690 	 * Better choices include ENOTSUP or EISDIR.
4691 	 */
4692 	if (svp->v_type == VDIR) {
4693 		ZFS_EXIT(zfsvfs);
4694 		return (SET_ERROR(EPERM));
4695 	}
4696 
4697 	szp = VTOZ(svp);
4698 	ZFS_VERIFY_ZP(szp);
4699 
4700 	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4701 		ZFS_EXIT(zfsvfs);
4702 		return (SET_ERROR(EPERM));
4703 	}
4704 
4705 	/* Prevent links to .zfs/shares files */
4706 
4707 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4708 	    &parent, sizeof (uint64_t))) != 0) {
4709 		ZFS_EXIT(zfsvfs);
4710 		return (error);
4711 	}
4712 	if (parent == zfsvfs->z_shares_dir) {
4713 		ZFS_EXIT(zfsvfs);
4714 		return (SET_ERROR(EPERM));
4715 	}
4716 
4717 	if (zfsvfs->z_utf8 && u8_validate(name,
4718 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4719 		ZFS_EXIT(zfsvfs);
4720 		return (SET_ERROR(EILSEQ));
4721 	}
4722 
4723 	/*
4724 	 * We do not support links between attributes and non-attributes
4725 	 * because of the potential security risk of creating links
4726 	 * into "normal" file space in order to circumvent restrictions
4727 	 * imposed in attribute space.
4728 	 */
4729 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4730 		ZFS_EXIT(zfsvfs);
4731 		return (SET_ERROR(EINVAL));
4732 	}
4733 
4734 
4735 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4736 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4737 		ZFS_EXIT(zfsvfs);
4738 		return (SET_ERROR(EPERM));
4739 	}
4740 
4741 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4742 		ZFS_EXIT(zfsvfs);
4743 		return (error);
4744 	}
4745 
4746 	/*
4747 	 * Attempt to lock directory; fail if entry already exists.
4748 	 */
4749 	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4750 	if (error) {
4751 		ZFS_EXIT(zfsvfs);
4752 		return (error);
4753 	}
4754 
4755 	tx = dmu_tx_create(zfsvfs->z_os);
4756 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4757 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4758 	zfs_sa_upgrade_txholds(tx, szp);
4759 	zfs_sa_upgrade_txholds(tx, dzp);
4760 	error = dmu_tx_assign(tx, TXG_WAIT);
4761 	if (error) {
4762 		dmu_tx_abort(tx);
4763 		ZFS_EXIT(zfsvfs);
4764 		return (error);
4765 	}
4766 
4767 	error = zfs_link_create(dzp, name, szp, tx, 0);
4768 
4769 	if (error == 0) {
4770 		uint64_t txtype = TX_LINK;
4771 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4772 	}
4773 
4774 	dmu_tx_commit(tx);
4775 
4776 	if (error == 0) {
4777 		vnevent_link(svp, ct);
4778 	}
4779 
4780 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4781 		zil_commit(zilog, 0);
4782 
4783 	ZFS_EXIT(zfsvfs);
4784 	return (error);
4785 }
4786 
4787 
4788 /*ARGSUSED*/
4789 void
4790 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4791 {
4792 	znode_t	*zp = VTOZ(vp);
4793 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4794 	int error;
4795 
4796 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4797 	if (zp->z_sa_hdl == NULL) {
4798 		/*
4799 		 * The fs has been unmounted, or we did a
4800 		 * suspend/resume and this file no longer exists.
4801 		 */
4802 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4803 		vrecycle(vp);
4804 		return;
4805 	}
4806 
4807 	if (zp->z_unlinked) {
4808 		/*
4809 		 * Fast path to recycle a vnode of a removed file.
4810 		 */
4811 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4812 		vrecycle(vp);
4813 		return;
4814 	}
4815 
4816 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4817 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4818 
4819 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4820 		zfs_sa_upgrade_txholds(tx, zp);
4821 		error = dmu_tx_assign(tx, TXG_WAIT);
4822 		if (error) {
4823 			dmu_tx_abort(tx);
4824 		} else {
4825 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4826 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4827 			zp->z_atime_dirty = 0;
4828 			dmu_tx_commit(tx);
4829 		}
4830 	}
4831 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4832 }
4833 
4834 
4835 #ifdef __FreeBSD__
4836 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4837 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4838 #endif
4839 
4840 /*ARGSUSED*/
4841 static int
4842 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4843 {
4844 	znode_t		*zp = VTOZ(vp);
4845 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4846 	uint32_t	gen;
4847 	uint64_t	gen64;
4848 	uint64_t	object = zp->z_id;
4849 	zfid_short_t	*zfid;
4850 	int		size, i, error;
4851 
4852 	ZFS_ENTER(zfsvfs);
4853 	ZFS_VERIFY_ZP(zp);
4854 
4855 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4856 	    &gen64, sizeof (uint64_t))) != 0) {
4857 		ZFS_EXIT(zfsvfs);
4858 		return (error);
4859 	}
4860 
4861 	gen = (uint32_t)gen64;
4862 
4863 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4864 
4865 #ifdef illumos
4866 	if (fidp->fid_len < size) {
4867 		fidp->fid_len = size;
4868 		ZFS_EXIT(zfsvfs);
4869 		return (SET_ERROR(ENOSPC));
4870 	}
4871 #else
4872 	fidp->fid_len = size;
4873 #endif
4874 
4875 	zfid = (zfid_short_t *)fidp;
4876 
4877 	zfid->zf_len = size;
4878 
4879 	for (i = 0; i < sizeof (zfid->zf_object); i++)
4880 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4881 
4882 	/* Must have a non-zero generation number to distinguish from .zfs */
4883 	if (gen == 0)
4884 		gen = 1;
4885 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4886 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4887 
4888 	if (size == LONG_FID_LEN) {
4889 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4890 		zfid_long_t	*zlfid;
4891 
4892 		zlfid = (zfid_long_t *)fidp;
4893 
4894 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4895 			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4896 
4897 		/* XXX - this should be the generation number for the objset */
4898 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4899 			zlfid->zf_setgen[i] = 0;
4900 	}
4901 
4902 	ZFS_EXIT(zfsvfs);
4903 	return (0);
4904 }
4905 
4906 static int
4907 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4908     caller_context_t *ct)
4909 {
4910 	znode_t		*zp, *xzp;
4911 	zfsvfs_t	*zfsvfs;
4912 	int		error;
4913 
4914 	switch (cmd) {
4915 	case _PC_LINK_MAX:
4916 		*valp = INT_MAX;
4917 		return (0);
4918 
4919 	case _PC_FILESIZEBITS:
4920 		*valp = 64;
4921 		return (0);
4922 #ifdef illumos
4923 	case _PC_XATTR_EXISTS:
4924 		zp = VTOZ(vp);
4925 		zfsvfs = zp->z_zfsvfs;
4926 		ZFS_ENTER(zfsvfs);
4927 		ZFS_VERIFY_ZP(zp);
4928 		*valp = 0;
4929 		error = zfs_dirent_lookup(zp, "", &xzp,
4930 		    ZXATTR | ZEXISTS | ZSHARED);
4931 		if (error == 0) {
4932 			if (!zfs_dirempty(xzp))
4933 				*valp = 1;
4934 			vrele(ZTOV(xzp));
4935 		} else if (error == ENOENT) {
4936 			/*
4937 			 * If there aren't extended attributes, it's the
4938 			 * same as having zero of them.
4939 			 */
4940 			error = 0;
4941 		}
4942 		ZFS_EXIT(zfsvfs);
4943 		return (error);
4944 
4945 	case _PC_SATTR_ENABLED:
4946 	case _PC_SATTR_EXISTS:
4947 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4948 		    (vp->v_type == VREG || vp->v_type == VDIR);
4949 		return (0);
4950 
4951 	case _PC_ACCESS_FILTERING:
4952 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4953 		    vp->v_type == VDIR;
4954 		return (0);
4955 
4956 	case _PC_ACL_ENABLED:
4957 		*valp = _ACL_ACE_ENABLED;
4958 		return (0);
4959 #endif	/* illumos */
4960 	case _PC_MIN_HOLE_SIZE:
4961 		*valp = (int)SPA_MINBLOCKSIZE;
4962 		return (0);
4963 #ifdef illumos
4964 	case _PC_TIMESTAMP_RESOLUTION:
4965 		/* nanosecond timestamp resolution */
4966 		*valp = 1L;
4967 		return (0);
4968 #endif
4969 	case _PC_ACL_EXTENDED:
4970 		*valp = 0;
4971 		return (0);
4972 
4973 #ifndef __NetBSD__
4974 	case _PC_ACL_NFS4:
4975 		*valp = 1;
4976 		return (0);
4977 
4978 	case _PC_ACL_PATH_MAX:
4979 		*valp = ACL_MAX_ENTRIES;
4980 		return (0);
4981 #endif
4982 
4983 	default:
4984 		return (EOPNOTSUPP);
4985 	}
4986 }
4987 
4988 /*ARGSUSED*/
4989 static int
4990 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4991     caller_context_t *ct)
4992 {
4993 	znode_t *zp = VTOZ(vp);
4994 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4995 	int error;
4996 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4997 
4998 	ZFS_ENTER(zfsvfs);
4999 	ZFS_VERIFY_ZP(zp);
5000 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5001 	ZFS_EXIT(zfsvfs);
5002 
5003 	return (error);
5004 }
5005 
5006 /*ARGSUSED*/
5007 int
5008 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5009     caller_context_t *ct)
5010 {
5011 	znode_t *zp = VTOZ(vp);
5012 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5013 	int error;
5014 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5015 	zilog_t	*zilog = zfsvfs->z_log;
5016 
5017 	ZFS_ENTER(zfsvfs);
5018 	ZFS_VERIFY_ZP(zp);
5019 
5020 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5021 
5022 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5023 		zil_commit(zilog, 0);
5024 
5025 	ZFS_EXIT(zfsvfs);
5026 	return (error);
5027 }
5028 
5029 static int
5030 ioflags(int ioflags)
5031 {
5032 	int flags = 0;
5033 
5034 	if (ioflags & IO_APPEND)
5035 		flags |= FAPPEND;
5036 	if (ioflags & IO_NDELAY)
5037 		flags |= FNONBLOCK;
5038 	if (ioflags & IO_SYNC)
5039 		flags |= (FSYNC | FDSYNC | FRSYNC);
5040 
5041 	return (flags);
5042 }
5043 
5044 #ifdef __NetBSD__
5045 
5046 static int
5047 zfs_netbsd_open(void *v)
5048 {
5049 	struct vop_open_args *ap = v;
5050 
5051 	return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL));
5052 }
5053 
5054 static int
5055 zfs_netbsd_close(void *v)
5056 {
5057 	struct vop_close_args *ap = v;
5058 
5059 	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5060 }
5061 
5062 static int
5063 zfs_netbsd_ioctl(void *v)
5064 {
5065 	struct vop_ioctl_args *ap = v;
5066 
5067 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5068 		ap->a_fflag, ap->a_cred, NULL, NULL));
5069 }
5070 
5071 
5072 static int
5073 zfs_netbsd_read(void *v)
5074 {
5075 	struct vop_read_args *ap = v;
5076 	vnode_t *vp = ap->a_vp;
5077 	znode_t *zp = VTOZ(vp);
5078 
5079 	switch (vp->v_type) {
5080 	case VBLK:
5081 	case VCHR:
5082 		ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5083 		return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap));
5084 	case VFIFO:
5085 		ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5086 		return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap));
5087 	}
5088 
5089 	return (zfs_read(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
5090 }
5091 
5092 static int
5093 zfs_netbsd_write(void *v)
5094 {
5095 	struct vop_write_args *ap = v;
5096 	vnode_t *vp = ap->a_vp;
5097 
5098 	switch (vp->v_type) {
5099 	case VBLK:
5100 	case VCHR:
5101 		GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5102 		return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap));
5103 	case VFIFO:
5104 		GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5105 		return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap));
5106 	}
5107 
5108 	return (zfs_write(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
5109 }
5110 
5111 static int
5112 zfs_netbsd_access(void *v)
5113 {
5114 	struct vop_access_args /* {
5115 		struct vnode *a_vp;
5116 		int a_mode;
5117 		kauth_cred_t a_cred;
5118 	} */ *ap = v;
5119 	struct vnode *vp = ap->a_vp;
5120 	int mode = ap->a_mode;
5121 	mode_t zfs_mode = 0;
5122 	kauth_cred_t cred = ap->a_cred;
5123 	int error;
5124 
5125 	/*
5126 	 * XXX This is really random, especially the left shift by six,
5127 	 * and it exists only because of randomness in zfs_unix_to_v4
5128 	 * and zfs_zaccess_rwx in zfs_acl.c.
5129 	 */
5130 	if (mode & VREAD)
5131 		zfs_mode |= S_IROTH;
5132 	if (mode & VWRITE)
5133 		zfs_mode |= S_IWOTH;
5134 	if (mode & VEXEC)
5135 		zfs_mode |= S_IXOTH;
5136 	zfs_mode <<= 6;
5137 
5138 	KASSERT(VOP_ISLOCKED(vp));
5139 	error = zfs_access(vp, zfs_mode, 0, cred, NULL);
5140 
5141 	/* We expect EACCES as common error. */
5142 	if (error == EPERM)
5143 		error = EACCES;
5144 
5145 	return (error);
5146 }
5147 
5148 static int
5149 zfs_netbsd_lookup(void *v)
5150 {
5151 	struct vop_lookup_v2_args /* {
5152 		struct vnode *a_dvp;
5153 		struct vnode **a_vpp;
5154 		struct componentname *a_cnp;
5155 	} */ *ap = v;
5156 	struct vnode *dvp = ap->a_dvp;
5157 	struct vnode **vpp = ap->a_vpp;
5158 	struct componentname *cnp = ap->a_cnp;
5159 	char *nm, short_nm[31];
5160 	int error;
5161 	int iswhiteout;
5162 
5163 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5164 
5165 	*vpp = NULL;
5166 
5167 	/*
5168 	 * Do an access check before the cache lookup.  zfs_lookup does
5169 	 * an access check too, but it's too scary to contemplate
5170 	 * injecting our namecache stuff into zfs internals.
5171 	 *
5172 	 * XXX Is this the correct access check?
5173 	 */
5174 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
5175 		goto out;
5176 
5177 	/*
5178 	 * Check the namecache before entering zfs_lookup.
5179 	 * cache_lookup does the locking dance for us.
5180 	 */
5181 	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
5182 	    cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
5183 		if (iswhiteout) {
5184 			cnp->cn_flags |= ISWHITEOUT;
5185 		}
5186 		return *vpp == NULL ? ENOENT : 0;
5187 	}
5188 
5189 	/*
5190 	 * zfs_lookup wants a null-terminated component name, but namei
5191 	 * gives us a pointer into the full pathname.
5192 	 */
5193 	ASSERT(cnp->cn_namelen < PATH_MAX - 1);
5194 	if (cnp->cn_namelen + 1 > sizeof(short_nm))
5195 		nm = PNBUF_GET();
5196 	else
5197 		nm = short_nm;
5198 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5199 
5200 	error = zfs_lookup(dvp, nm, vpp, 0, cnp, cnp->cn_nameiop, cnp->cn_cred);
5201 
5202 	if (nm != short_nm)
5203 		PNBUF_PUT(nm);
5204 
5205 	/*
5206 	 * Translate errors to match our namei insanity.  Also, if the
5207 	 * caller wants to create an entry here, it's apparently our
5208 	 * responsibility as lookup to make sure that's permissible.
5209 	 * Go figure.
5210 	 */
5211 	if (cnp->cn_flags & ISLASTCN) {
5212 		switch (cnp->cn_nameiop) {
5213 		case CREATE:
5214 		case RENAME:
5215 			if (error == ENOENT) {
5216 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5217 				if (error)
5218 					break;
5219 				error = EJUSTRETURN;
5220 				break;
5221 			}
5222 			break;
5223 		case DELETE:
5224 			if (error == 0) {
5225 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5226 				if (error) {
5227 					VN_RELE(*vpp);
5228 					*vpp = NULL;
5229 				}
5230 			}
5231 			break;
5232 		}
5233 	}
5234 
5235 	if (error) {
5236 		KASSERT(*vpp == NULL);
5237 		goto out;
5238 	}
5239 	KASSERT(*vpp != NULL);
5240 
5241 	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
5242 		KASSERT(!(cnp->cn_flags & ISDOTDOT));
5243 		KASSERT(dvp == *vpp);
5244 	} else if ((cnp->cn_namelen == 2) &&
5245 	    (cnp->cn_nameptr[0] == '.') &&
5246 	    (cnp->cn_nameptr[1] == '.')) {
5247 		KASSERT(cnp->cn_flags & ISDOTDOT);
5248 	} else {
5249 		KASSERT(!(cnp->cn_flags & ISDOTDOT));
5250 	}
5251 
5252 out:
5253 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5254 
5255 	/*
5256 	 * Insert name into cache if appropriate.
5257 	 */
5258 
5259 	if (error == 0 || (error == ENOENT && cnp->cn_nameiop != CREATE))
5260 		cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
5261 		    cnp->cn_flags);
5262 
5263 	return (error);
5264 }
5265 
5266 static int
5267 zfs_netbsd_create(void *v)
5268 {
5269 	struct vop_create_v3_args /* {
5270 		struct vnode *a_dvp;
5271 		struct vnode **a_vpp;
5272 		struct componentname *a_cnp;
5273 		struct vattr *a_vap;
5274 	} */ *ap = v;
5275 	struct vnode *dvp = ap->a_dvp;
5276 	struct vnode **vpp = ap->a_vpp;
5277 	struct componentname *cnp = ap->a_cnp;
5278 	struct vattr *vap = ap->a_vap;
5279 	char *nm;
5280 	int mode;
5281 	int error;
5282 
5283 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5284 
5285 	vattr_init_mask(vap);
5286 	mode = vap->va_mode & ALLPERMS;
5287 
5288 	/* ZFS wants a null-terminated name. */
5289 	nm = PNBUF_GET();
5290 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5291 
5292 	/* XXX !EXCL is wrong here...  */
5293 	error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5294 
5295 	PNBUF_PUT(nm);
5296 
5297 	KASSERT((error == 0) == (*vpp != NULL));
5298 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5299 	VOP_UNLOCK(*vpp, 0);
5300 
5301 	return (error);
5302 }
5303 
5304 static int
5305 zfs_netbsd_mknod(void *v)
5306 {
5307 	struct vop_mknod_v3_args /* {
5308 		struct vnode *a_dvp;
5309 		struct vnode **a_vpp;
5310 		struct componentname *a_cnp;
5311 		struct vattr *a_vap;
5312 	} */ *ap = v;
5313 	struct vnode *dvp = ap->a_dvp;
5314 	struct vnode **vpp = ap->a_vpp;
5315 	struct componentname *cnp = ap->a_cnp;
5316 	struct vattr *vap = ap->a_vap;
5317 	char *nm;
5318 	int mode;
5319 	int error;
5320 
5321 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5322 
5323 	vattr_init_mask(vap);
5324 	mode = vap->va_mode & ALLPERMS;
5325 
5326 	/* ZFS wants a null-terminated name. */
5327 	nm = PNBUF_GET();
5328 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5329 
5330 	/* XXX !EXCL is wrong here...  */
5331 	error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5332 
5333 	PNBUF_PUT(nm);
5334 
5335 	KASSERT((error == 0) == (*vpp != NULL));
5336 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5337 	VOP_UNLOCK(*vpp, 0);
5338 
5339 	return (error);
5340 }
5341 
5342 static int
5343 zfs_netbsd_remove(void *v)
5344 {
5345 	struct vop_remove_v2_args /* {
5346 		struct vnode *a_dvp;
5347 		struct vnode *a_vp;
5348 		struct componentname *a_cnp;
5349 	} */ *ap = v;
5350 	struct vnode *dvp = ap->a_dvp;
5351 	struct vnode *vp = ap->a_vp;
5352 	struct componentname *cnp = ap->a_cnp;
5353 	char *nm;
5354 	int error;
5355 
5356 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5357 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5358 
5359 	/* ZFS wants a null-terminated name. */
5360 	nm = PNBUF_GET();
5361 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5362 
5363 	error = zfs_remove(dvp, vp, nm, cnp->cn_cred);
5364 
5365 	PNBUF_PUT(nm);
5366 	vput(vp);
5367 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5368 	return (error);
5369 }
5370 
5371 static int
5372 zfs_netbsd_mkdir(void *v)
5373 {
5374 	struct vop_mkdir_v3_args /* {
5375 		struct vnode *a_dvp;
5376 		struct vnode **a_vpp;
5377 		struct componentname *a_cnp;
5378 		struct vattr *a_vap;
5379 	} */ *ap = v;
5380 	struct vnode *dvp = ap->a_dvp;
5381 	struct vnode **vpp = ap->a_vpp;
5382 	struct componentname *cnp = ap->a_cnp;
5383 	struct vattr *vap = ap->a_vap;
5384 	char *nm;
5385 	int error;
5386 
5387 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5388 
5389 	vattr_init_mask(vap);
5390 
5391 	/* ZFS wants a null-terminated name. */
5392 	nm = PNBUF_GET();
5393 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5394 
5395 	error = zfs_mkdir(dvp, nm, vap, vpp, cnp->cn_cred);
5396 
5397 	PNBUF_PUT(nm);
5398 
5399 	KASSERT((error == 0) == (*vpp != NULL));
5400 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5401 	VOP_UNLOCK(*vpp, 0);
5402 
5403 	return (error);
5404 }
5405 
5406 static int
5407 zfs_netbsd_rmdir(void *v)
5408 {
5409 	struct vop_rmdir_v2_args /* {
5410 		struct vnode *a_dvp;
5411 		struct vnode *a_vp;
5412 		struct componentname *a_cnp;
5413 	} */ *ap = v;
5414 	struct vnode *dvp = ap->a_dvp;
5415 	struct vnode *vp = ap->a_vp;
5416 	struct componentname *cnp = ap->a_cnp;
5417 	char *nm;
5418 	int error;
5419 
5420 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5421 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5422 
5423 	/* ZFS wants a null-terminated name. */
5424 	nm = PNBUF_GET();
5425 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5426 
5427 	error = zfs_rmdir(dvp, vp, nm, cnp->cn_cred);
5428 
5429 	PNBUF_PUT(nm);
5430 	vput(vp);
5431 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5432 	return error;
5433 }
5434 
5435 static int
5436 zfs_netbsd_readdir(void *v)
5437 {
5438 	struct vop_readdir_args *ap = v;
5439 
5440 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5441 		ap->a_ncookies, ap->a_cookies));
5442 }
5443 
5444 static int
5445 zfs_netbsd_fsync(void *v)
5446 {
5447 	struct vop_fsync_args *ap = v;
5448 
5449 	return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5450 }
5451 
5452 static int
5453 zfs_netbsd_getattr(void *v)
5454 {
5455 	struct vop_getattr_args *ap = v;
5456 	vattr_t *vap = ap->a_vap;
5457 	xvattr_t xvap;
5458 	u_long fflags = 0;
5459 	int error;
5460 
5461 	xva_init(&xvap);
5462 	xvap.xva_vattr = *vap;
5463 	xvap.xva_vattr.va_mask |= AT_XVATTR;
5464 
5465 	/* Convert chflags into ZFS-type flags. */
5466 	/* XXX: what about SF_SETTABLE?. */
5467 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5468 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5469 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5470 	XVA_SET_REQ(&xvap, XAT_NODUMP);
5471 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5472 	if (error != 0)
5473 		return (error);
5474 
5475 	/* Convert ZFS xattr into chflags. */
5476 #define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5477 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5478 		fflags |= (fflag);					\
5479 } while (0)
5480 	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5481 	    xvap.xva_xoptattrs.xoa_immutable);
5482 	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5483 	    xvap.xva_xoptattrs.xoa_appendonly);
5484 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5485 	    xvap.xva_xoptattrs.xoa_nounlink);
5486 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5487 	    xvap.xva_xoptattrs.xoa_nodump);
5488 #undef	FLAG_CHECK
5489 	*vap = xvap.xva_vattr;
5490 	vap->va_flags = fflags;
5491 	return (0);
5492 }
5493 
5494 static int
5495 zfs_netbsd_setattr(void *v)
5496 {
5497 	struct vop_setattr_args *ap = v;
5498 	vnode_t *vp = ap->a_vp;
5499 	vattr_t *vap = ap->a_vap;
5500 	cred_t *cred = ap->a_cred;
5501 	znode_t *zp = VTOZ(vp);
5502 	xvattr_t xvap;
5503 	kauth_action_t action;
5504 	u_long fflags, sfflags = 0;
5505 	uint64_t zflags;
5506 	int error, flags = 0;
5507 	bool changing_sysflags;
5508 
5509 	vattr_init_mask(vap);
5510 	vap->va_mask &= ~AT_NOSET;
5511 	if (ISSET(vap->va_vaflags, VA_UTIMES_NULL))
5512 		flags |= ATTR_UTIME;
5513 
5514 	xva_init(&xvap);
5515 	xvap.xva_vattr = *vap;
5516 
5517 	zflags = VTOZ(vp)->z_pflags;
5518 
5519 	if (vap->va_flags != VNOVAL) {
5520 		int error;
5521 
5522 		fflags = vap->va_flags;
5523 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5524 			return (EOPNOTSUPP);
5525 
5526 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5527 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5528 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5529 		XVA_SET_REQ(&xvap, (xflag));				\
5530 		(xfield) = ((fflags & (fflag)) != 0);			\
5531 		if (((fflag) & SF_SETTABLE) != 0)			\
5532 			sfflags |= (fflag);				\
5533 	}								\
5534 } while (0)
5535 		/* Convert chflags into ZFS-type flags. */
5536 		/* XXX: what about SF_SETTABLE?. */
5537 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5538 		    xvap.xva_xoptattrs.xoa_immutable);
5539 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5540 		    xvap.xva_xoptattrs.xoa_appendonly);
5541 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5542 		    xvap.xva_xoptattrs.xoa_nounlink);
5543 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5544 		    xvap.xva_xoptattrs.xoa_nodump);
5545 #undef	FLAG_CHANGE
5546 
5547 		action = KAUTH_VNODE_WRITE_FLAGS;
5548 		changing_sysflags = false;
5549 
5550 		if (zflags & (ZFS_IMMUTABLE|ZFS_APPENDONLY|ZFS_NOUNLINK)) {
5551 			action |= KAUTH_VNODE_HAS_SYSFLAGS;
5552 		}
5553 		if (sfflags != 0) {
5554 			action |= KAUTH_VNODE_WRITE_SYSFLAGS;
5555 			changing_sysflags = true;
5556 		}
5557 
5558 		error = kauth_authorize_vnode(cred, action, vp, NULL,
5559 		    genfs_can_chflags(cred, vp->v_type, zp->z_uid,
5560 		    changing_sysflags));
5561 		if (error)
5562 			return error;
5563 	}
5564 
5565 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
5566 	    vap->va_birthtime.tv_sec != VNOVAL) {
5567 		error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp,
5568 		     NULL, genfs_can_chtimes(vp, vap->va_vaflags, zp->z_uid,
5569 		     cred));
5570 		if (error)
5571 			return error;
5572 	}
5573 
5574 	return (zfs_setattr(vp, (vattr_t *)&xvap, flags, cred, NULL));
5575 }
5576 
5577 static int
5578 zfs_netbsd_rename(void *v)
5579 {
5580 	struct vop_rename_args  /* {
5581 		struct vnode *a_fdvp;
5582 		struct vnode *a_fvp;
5583 		struct componentname *a_fcnp;
5584 		struct vnode *a_tdvp;
5585 		struct vnode *a_tvp;
5586 		struct componentname *a_tcnp;
5587 	} */ *ap = v;
5588 	vnode_t *fdvp = ap->a_fdvp;
5589 	vnode_t *fvp = ap->a_fvp;
5590 	struct componentname *fcnp = ap->a_fcnp;
5591 	vnode_t *tdvp = ap->a_tdvp;
5592 	vnode_t *tvp = ap->a_tvp;
5593 	struct componentname *tcnp = ap->a_tcnp;
5594 	kauth_cred_t cred;
5595 	int error;
5596 
5597 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
5598 	KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
5599 	KASSERT(fdvp->v_type == VDIR);
5600 	KASSERT(tdvp->v_type == VDIR);
5601 
5602 	cred = fcnp->cn_cred;
5603 
5604 	/*
5605 	 * XXX Want a better equality test.  `tcnp->cn_cred == cred'
5606 	 * hoses p2k because puffs transmits the creds separately and
5607 	 * allocates distinct but equivalent structures for them.
5608 	 */
5609 	KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
5610 
5611 	/*
5612 	 * Drop the insane locks.
5613 	 */
5614 	VOP_UNLOCK(tdvp, 0);
5615 	if (tvp != NULL && tvp != tdvp)
5616 		VOP_UNLOCK(tvp, 0);
5617 
5618 	/*
5619 	 * Release the source and target nodes; zfs_rename will look
5620 	 * them up again once the locking situation is sane.
5621 	 */
5622 	VN_RELE(fvp);
5623 	if (tvp != NULL)
5624 		VN_RELE(tvp);
5625 	fvp = NULL;
5626 	tvp = NULL;
5627 
5628 	/*
5629 	 * Do the rename ZFSly.
5630 	 */
5631 	error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred);
5632 
5633 	/*
5634 	 * Release the directories now too, because the VOP_RENAME
5635 	 * protocol is insane.
5636 	 */
5637 
5638 	VN_RELE(fdvp);
5639 	VN_RELE(tdvp);
5640 	if (fvp != NULL)
5641 		VN_RELE(fvp);
5642 	if (tvp != NULL)
5643 		VN_RELE(tvp);
5644 
5645 	return (error);
5646 }
5647 
5648 static int
5649 zfs_netbsd_symlink(void *v)
5650 {
5651 	struct vop_symlink_v3_args /* {
5652 		struct vnode *a_dvp;
5653 		struct vnode **a_vpp;
5654 		struct componentname *a_cnp;
5655 		struct vattr *a_vap;
5656 		char *a_target;
5657 	} */ *ap = v;
5658 	struct vnode *dvp = ap->a_dvp;
5659 	struct vnode **vpp = ap->a_vpp;
5660 	struct componentname *cnp = ap->a_cnp;
5661 	struct vattr *vap = ap->a_vap;
5662 	char *target = ap->a_target;
5663 	char *nm;
5664 	int error;
5665 
5666 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5667 
5668 	vap->va_type = VLNK;	/* Netbsd: Syscall only sets va_mode. */
5669 	vattr_init_mask(vap);
5670 
5671 	/* ZFS wants a null-terminated name. */
5672 	nm = PNBUF_GET();
5673 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5674 
5675 	error = zfs_symlink(dvp, vpp, nm, vap, target, cnp->cn_cred, 0);
5676 
5677 	PNBUF_PUT(nm);
5678 
5679 	KASSERT((error == 0) == (*vpp != NULL));
5680 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5681 	VOP_UNLOCK(*vpp, 0);
5682 
5683 	return (error);
5684 }
5685 
5686 static int
5687 zfs_netbsd_readlink(void *v)
5688 {
5689 	struct vop_readlink_args *ap = v;
5690 
5691 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5692 }
5693 
5694 static int
5695 zfs_netbsd_link(void *v)
5696 {
5697 	struct vop_link_v2_args /* {
5698 		struct vnode *a_dvp;
5699 		struct vnode *a_vp;
5700 		struct componentname *a_cnp;
5701 	} */ *ap = v;
5702 	struct vnode *dvp = ap->a_dvp;
5703 	struct vnode *vp = ap->a_vp;
5704 	struct componentname *cnp = ap->a_cnp;
5705 	char *nm;
5706 	int error;
5707 
5708 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5709 
5710 	/* ZFS wants a null-terminated name. */
5711 	nm = PNBUF_GET();
5712 	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5713 
5714 	vn_lock(vp, LK_EXCLUSIVE);
5715 	error = zfs_link(dvp, vp, nm, cnp->cn_cred,
5716 	    NULL, 0);
5717 
5718 	PNBUF_PUT(nm);
5719 	VOP_UNLOCK(vp, 0);
5720 	return error;
5721 }
5722 
5723 static int
5724 zfs_netbsd_inactive(void *v)
5725 {
5726 	struct vop_inactive_v2_args *ap = v;
5727 	vnode_t *vp = ap->a_vp;
5728 	znode_t	*zp = VTOZ(vp);
5729 
5730 	/*
5731 	 * NetBSD: nothing to do here, other than indicate if the
5732 	 * vnode should be reclaimed.  No need to lock, if we race
5733 	 * vrele() will call us again.
5734 	 */
5735 	*ap->a_recycle = (zp->z_unlinked != 0);
5736 
5737 	return (0);
5738 }
5739 
5740 static int
5741 zfs_netbsd_reclaim(void *v)
5742 {
5743 	struct vop_reclaim_v2_args /* {
5744 		struct vnode *a_vp;
5745 	} */ *ap = v;
5746 	struct vnode *vp = ap->a_vp;
5747 	znode_t	*zp;
5748 	zfsvfs_t *zfsvfs;
5749 	int error;
5750 
5751 	VOP_UNLOCK(vp, 0);
5752 	zp = VTOZ(vp);
5753 	zfsvfs = zp->z_zfsvfs;
5754 
5755 	KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp);
5756 
5757 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5758 
5759 	/*
5760 	 * Process a deferred atime update.
5761 	 */
5762 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
5763 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
5764 
5765 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5766 		zfs_sa_upgrade_txholds(tx, zp);
5767 		error = dmu_tx_assign(tx, TXG_WAIT);
5768 		if (error) {
5769 			dmu_tx_abort(tx);
5770 		} else {
5771 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
5772 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
5773 			zp->z_atime_dirty = 0;
5774 			dmu_tx_commit(tx);
5775 		}
5776 	}
5777 
5778 	if (zfsvfs->z_log)
5779 		zil_commit(zfsvfs->z_log, zp->z_id);
5780 
5781 	if (zp->z_sa_hdl == NULL)
5782 		zfs_znode_free(zp);
5783 	else
5784 		zfs_zinactive(zp);
5785 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5786 	return 0;
5787 }
5788 
5789 static int
5790 zfs_netbsd_fid(void *v)
5791 {
5792 	struct vop_fid_args *ap = v;
5793 
5794 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5795 }
5796 
5797 static int
5798 zfs_netbsd_pathconf(void *v)
5799 {
5800 	struct vop_pathconf_args *ap = v;
5801 	ulong_t val;
5802 	int error;
5803 
5804 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
5805 	if (error == 0)
5806 		*ap->a_retval = val;
5807 	else if (error == EOPNOTSUPP) {
5808 		switch (ap->a_name) {
5809 		case _PC_NAME_MAX:
5810 			*ap->a_retval = NAME_MAX;
5811 			return (0);
5812 		case _PC_PATH_MAX:
5813 			*ap->a_retval = PATH_MAX;
5814 			return (0);
5815 		case _PC_LINK_MAX:
5816 			*ap->a_retval = LINK_MAX;
5817 			return (0);
5818 		case _PC_MAX_CANON:
5819 			*ap->a_retval = MAX_CANON;
5820 			return (0);
5821 		case _PC_MAX_INPUT:
5822 			*ap->a_retval = MAX_INPUT;
5823 			return (0);
5824 		case _PC_PIPE_BUF:
5825 			*ap->a_retval = PIPE_BUF;
5826 			return (0);
5827 		case _PC_CHOWN_RESTRICTED:
5828 			*ap->a_retval = 1;
5829 			return (0);
5830 		case _PC_NO_TRUNC:
5831 			*ap->a_retval = 1;
5832 			return (0);
5833 		case _PC_VDISABLE:
5834 			*ap->a_retval = _POSIX_VDISABLE;
5835 			return (0);
5836 		default:
5837 			return (EINVAL);
5838 		}
5839 		/* NOTREACHED */
5840 	}
5841 	return (error);
5842 }
5843 
5844 static int
5845 zfs_netbsd_advlock(void *v)
5846 {
5847 	struct vop_advlock_args /* {
5848 		struct vnode *a_vp;
5849 		void *a_id;
5850 		int a_op;
5851 		struct flock *a_fl;
5852 		int a_flags;
5853 	} */ *ap = v;
5854 	struct vnode *vp;
5855 	struct znode *zp;
5856 	struct zfsvfs *zfsvfs;
5857 	int error;
5858 
5859 	vp = ap->a_vp;
5860 	zp = VTOZ(vp);
5861 	zfsvfs = zp->z_zfsvfs;
5862 
5863 	ZFS_ENTER(zfsvfs);
5864 	ZFS_VERIFY_ZP(zp);
5865 	error = lf_advlock(ap, &zp->z_lockf, zp->z_size);
5866 	ZFS_EXIT(zfsvfs);
5867 
5868 	return error;
5869 }
5870 
5871 static int
5872 zfs_netbsd_getpages(void *v)
5873 {
5874 	struct vop_getpages_args /* {
5875 		struct vnode *a_vp;
5876 		voff_t a_offset;
5877 		struct vm_page **a_m;
5878 		int *a_count;
5879 		int a_centeridx;
5880 		vm_prot_t a_access_type;
5881 		int a_advice;
5882 		int a_flags;
5883 	} */ * const ap = v;
5884 
5885 	vnode_t *const vp = ap->a_vp;
5886 	off_t offset = ap->a_offset + (ap->a_centeridx << PAGE_SHIFT);
5887 	const int flags = ap->a_flags;
5888 	const bool async = (flags & PGO_SYNCIO) == 0;
5889 	const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
5890 
5891 	struct uvm_object * const uobj = &vp->v_uobj;
5892 	kmutex_t * const mtx = uobj->vmobjlock;
5893 	znode_t *zp = VTOZ(vp);
5894 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5895 	struct vm_page *pg;
5896 	caddr_t va;
5897 	int npages, found, err = 0;
5898 
5899 	if (flags & PGO_LOCKED) {
5900 		*ap->a_count = 0;
5901 		ap->a_m[ap->a_centeridx] = NULL;
5902 		return EBUSY;
5903 	}
5904 	mutex_exit(mtx);
5905 
5906 	if (async) {
5907 		return 0;
5908 	}
5909 	if (*ap->a_count != 1) {
5910 		return EBUSY;
5911 	}
5912 
5913 	ZFS_ENTER(zfsvfs);
5914 	ZFS_VERIFY_ZP(zp);
5915 
5916 	mutex_enter(mtx);
5917 	npages = 1;
5918 	pg = NULL;
5919 	uvn_findpages(uobj, offset, &npages, &pg, UFP_ALL);
5920 
5921 	if (pg->flags & PG_FAKE) {
5922 		mutex_exit(mtx);
5923 
5924 		va = zfs_map_page(pg, S_WRITE);
5925 		err = dmu_read(zfsvfs->z_os, zp->z_id, offset, PAGE_SIZE,
5926 		    va, DMU_READ_PREFETCH);
5927 		zfs_unmap_page(pg, va);
5928 
5929 		mutex_enter(mtx);
5930 		pg->flags &= ~(PG_FAKE);
5931 		pmap_clear_modify(pg);
5932 	}
5933 
5934 	if (memwrite) {
5935 		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
5936 			vn_syncer_add_to_worklist(vp, filedelay);
5937 		}
5938 		if ((vp->v_iflag & (VI_WRMAP|VI_WRMAPDIRTY)) == VI_WRMAP) {
5939 			vp->v_iflag |= VI_WRMAPDIRTY;
5940 		}
5941 	}
5942 	mutex_exit(mtx);
5943 	ap->a_m[ap->a_centeridx] = pg;
5944 
5945 	ZFS_EXIT(zfsvfs);
5946 
5947 	return (err);
5948 }
5949 
5950 static int
5951 zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags)
5952 {
5953 	znode_t		*zp = VTOZ(vp);
5954 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5955 	dmu_tx_t	*tx;
5956 	voff_t		off, koff;
5957 	voff_t		len, klen;
5958 	int		err;
5959 
5960 	bool async = (flags & PGO_SYNCIO) == 0;
5961 	bool *cleanedp;
5962 	struct uvm_object *uobj = &vp->v_uobj;
5963 	kmutex_t *mtx = uobj->vmobjlock;
5964 
5965 	if (zp->z_sa_hdl == NULL) {
5966 		err = 0;
5967 		goto out_unbusy;
5968 	}
5969 
5970 	off = pp[0]->offset;
5971 	len = count * PAGESIZE;
5972 	KASSERT(off + len <= round_page(zp->z_size));
5973 
5974 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
5975 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
5976 		err = SET_ERROR(EDQUOT);
5977 		goto out;
5978 	}
5979 	tx = dmu_tx_create(zfsvfs->z_os);
5980 	dmu_tx_hold_write(tx, zp->z_id, off, len);
5981 
5982 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5983 	zfs_sa_upgrade_txholds(tx, zp);
5984 	err = dmu_tx_assign(tx, TXG_WAIT);
5985 	if (err != 0) {
5986 		dmu_tx_abort(tx);
5987 		goto out;
5988 	}
5989 
5990 	if (zp->z_blksz <= PAGESIZE) {
5991 		KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count);
5992 		caddr_t va = zfs_map_page(*pp, S_READ);
5993 		ASSERT3U(len, <=, PAGESIZE);
5994 		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
5995 		zfs_unmap_page(*pp, va);
5996 	} else {
5997 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
5998 	}
5999 	cleanedp = tsd_get(zfs_putpage_key);
6000 	*cleanedp = true;
6001 
6002 	if (err == 0) {
6003 		uint64_t mtime[2], ctime[2];
6004 		sa_bulk_attr_t bulk[3];
6005 		int count = 0;
6006 
6007 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
6008 		    &mtime, 16);
6009 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
6010 		    &ctime, 16);
6011 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
6012 		    &zp->z_pflags, 8);
6013 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
6014 		    B_TRUE);
6015 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
6016 		ASSERT0(err);
6017 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
6018 	}
6019 	dmu_tx_commit(tx);
6020 
6021 out_unbusy:
6022 	mutex_enter(mtx);
6023 	mutex_enter(&uvm_pageqlock);
6024 	uvm_page_unbusy(pp, count);
6025 	mutex_exit(&uvm_pageqlock);
6026 	mutex_exit(mtx);
6027 
6028 out:
6029 	return (err);
6030 }
6031 
6032 static void
6033 zfs_netbsd_gop_markupdate(vnode_t *vp, int flags)
6034 {
6035 	znode_t		*zp = VTOZ(vp);
6036 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
6037 	dmu_tx_t	*tx;
6038 	sa_bulk_attr_t	bulk[2];
6039 	uint64_t	mtime[2], ctime[2];
6040 	int		count = 0, err;
6041 
6042 	KASSERT(flags == GOP_UPDATE_MODIFIED);
6043 
6044 	tx = dmu_tx_create(zfsvfs->z_os);
6045 	err = dmu_tx_assign(tx, TXG_WAIT);
6046 	if (err != 0) {
6047 		dmu_tx_abort(tx);
6048 		return;
6049 	}
6050 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
6051 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
6052 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
6053 	dmu_tx_commit(tx);
6054 }
6055 
6056 static int
6057 zfs_netbsd_putpages(void *v)
6058 {
6059 	struct vop_putpages_args /* {
6060 		struct vnode *a_vp;
6061 		voff_t a_offlo;
6062 		voff_t a_offhi;
6063 		int a_flags;
6064 	} */ * const ap = v;
6065 
6066 	struct vnode *vp = ap->a_vp;
6067 	voff_t offlo = ap->a_offlo;
6068 	voff_t offhi = ap->a_offhi;
6069 	int flags = ap->a_flags;
6070 
6071 	znode_t *zp = VTOZ(vp);
6072 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6073 	rl_t *rl = NULL;
6074 	uint64_t len;
6075 	int error;
6076 	bool cleaned = false;
6077 
6078 	bool async = (flags & PGO_SYNCIO) == 0;
6079 	bool cleaning = (flags & PGO_CLEANIT) != 0;
6080 
6081 	if (cleaning) {
6082 		ASSERT((offlo & PAGE_MASK) == 0 && (offhi & PAGE_MASK) == 0);
6083 		ASSERT(offlo < offhi || offhi == 0);
6084 		if (offhi == 0)
6085 			len = UINT64_MAX;
6086 		else
6087 			len = offhi - offlo;
6088 		mutex_exit(vp->v_interlock);
6089 		if (curlwp == uvm.pagedaemon_lwp) {
6090 			error = fstrans_start_nowait(vp->v_mount);
6091 			if (error)
6092 				return error;
6093 		} else {
6094 			vfs_t *mp = vp->v_mount;
6095 			fstrans_start(mp);
6096 			if (vp->v_mount != mp) {
6097 				fstrans_done(mp);
6098 				ASSERT(!vn_has_cached_data(vp));
6099 				return 0;
6100 			}
6101 		}
6102 		/*
6103 		 * Cannot use ZFS_ENTER() here as it returns with error
6104 		 * if z_unmounted.  The next statement is equivalent.
6105 		 */
6106 		rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
6107 
6108 		rl = zfs_range_lock(zp, offlo, len, RL_WRITER);
6109 		mutex_enter(vp->v_interlock);
6110 		tsd_set(zfs_putpage_key, &cleaned);
6111 	}
6112 	error = genfs_putpages(v);
6113 	if (cleaning) {
6114 		tsd_set(zfs_putpage_key, NULL);
6115 		zfs_range_unlock(rl);
6116 
6117 		/*
6118 		 * Only zil_commit() if we cleaned something.  This avoids
6119 		 * deadlock if we're called from zfs_netbsd_setsize().
6120 		 */
6121 
6122 		if (cleaned)
6123 		if (!async || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
6124 			zil_commit(zfsvfs->z_log, zp->z_id);
6125 		ZFS_EXIT(zfsvfs);
6126 		fstrans_done(vp->v_mount);
6127 	}
6128 	return error;
6129 }
6130 
6131 /*
6132  * Restrict the putpages range to the ZFS block containing the offset.
6133  */
6134 static void
6135 zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
6136 {
6137 	znode_t *zp = VTOZ(vp);
6138 
6139 	*lop = trunc_page(rounddown2(off, zp->z_blksz));
6140 	*hip = round_page(*lop + zp->z_blksz);
6141 }
6142 
6143 void
6144 zfs_netbsd_setsize(vnode_t *vp, off_t size)
6145 {
6146 	struct uvm_object *uobj = &vp->v_uobj;
6147 	kmutex_t *mtx = uobj->vmobjlock;
6148 	page_t *pg;
6149 	int count, pgoff;
6150 	caddr_t va;
6151 	off_t tsize;
6152 
6153 	uvm_vnp_setsize(vp, size);
6154 	if (!vn_has_cached_data(vp))
6155 		return;
6156 
6157 	tsize = trunc_page(size);
6158 	if (tsize == size)
6159 		return;
6160 
6161 	/*
6162 	 * If there's a partial page, we need to zero the tail.
6163 	 */
6164 
6165 	mutex_enter(mtx);
6166 	count = 1;
6167 	pg = NULL;
6168 	if (uvn_findpages(uobj, tsize, &count, &pg, UFP_NOALLOC)) {
6169 		va = zfs_map_page(pg, S_WRITE);
6170 		pgoff = size - tsize;
6171 		memset(va + pgoff, 0, PAGESIZE - pgoff);
6172 		zfs_unmap_page(pg, va);
6173 		uvm_page_unbusy(&pg, 1);
6174 	}
6175 
6176 	mutex_exit(mtx);
6177 }
6178 
6179 static int
6180 zfs_netbsd_print(void *v)
6181 {
6182 	struct vop_print_args /* {
6183 		struct vnode	*a_vp;
6184 	} */ *ap = v;
6185 	vnode_t	*vp;
6186 	znode_t	*zp;
6187 
6188 	vp = ap->a_vp;
6189 	zp = VTOZ(vp);
6190 
6191 	printf("\tino %" PRIu64 " size %" PRIu64 "\n",
6192 	       zp->z_id, zp->z_size);
6193 	return 0;
6194 }
6195 
6196 const struct genfs_ops zfs_genfsops = {
6197         .gop_write = zfs_putapage,
6198 	.gop_markupdate = zfs_netbsd_gop_markupdate,
6199 	.gop_putrange = zfs_netbsd_gop_putrange,
6200 };
6201 
6202 #define	zfs_netbsd_lock		genfs_lock
6203 #define	zfs_netbsd_unlock	genfs_unlock
6204 #define	zfs_netbsd_islocked	genfs_islocked
6205 #define zfs_netbsd_seek		genfs_seek
6206 #define zfs_netbsd_mmap		genfs_mmap
6207 #define zfs_netbsd_fcntl	genfs_fcntl
6208 
6209 int (**zfs_vnodeop_p)(void *);
6210 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
6211 	{ &vop_default_desc,		vn_default_error },
6212 	{ &vop_lookup_desc,		zfs_netbsd_lookup },
6213 	{ &vop_create_desc,		zfs_netbsd_create },
6214 	{ &vop_mknod_desc,		zfs_netbsd_mknod },
6215 	{ &vop_open_desc,		zfs_netbsd_open },
6216 	{ &vop_close_desc,		zfs_netbsd_close },
6217 	{ &vop_access_desc,		zfs_netbsd_access },
6218 	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6219 	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6220 	{ &vop_read_desc,		zfs_netbsd_read },
6221 	{ &vop_write_desc,		zfs_netbsd_write },
6222 	{ &vop_ioctl_desc,		zfs_netbsd_ioctl },
6223 	{ &vop_fsync_desc,		zfs_netbsd_fsync },
6224 	{ &vop_remove_desc,		zfs_netbsd_remove },
6225 	{ &vop_link_desc,		zfs_netbsd_link },
6226 	{ &vop_lock_desc,		zfs_netbsd_lock },
6227 	{ &vop_unlock_desc,		zfs_netbsd_unlock },
6228 	{ &vop_rename_desc,		zfs_netbsd_rename },
6229 	{ &vop_mkdir_desc,		zfs_netbsd_mkdir },
6230 	{ &vop_rmdir_desc,		zfs_netbsd_rmdir },
6231 	{ &vop_symlink_desc,		zfs_netbsd_symlink },
6232 	{ &vop_readdir_desc,		zfs_netbsd_readdir },
6233 	{ &vop_readlink_desc,		zfs_netbsd_readlink },
6234 	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6235 	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6236 	{ &vop_pathconf_desc,		zfs_netbsd_pathconf },
6237 	{ &vop_seek_desc,		zfs_netbsd_seek },
6238 	{ &vop_getpages_desc,		zfs_netbsd_getpages },
6239 	{ &vop_putpages_desc,		zfs_netbsd_putpages },
6240 	{ &vop_mmap_desc,		zfs_netbsd_mmap },
6241 	{ &vop_islocked_desc,		zfs_netbsd_islocked },
6242 	{ &vop_advlock_desc,		zfs_netbsd_advlock },
6243 	{ &vop_print_desc,		zfs_netbsd_print },
6244 	{ &vop_fcntl_desc,		zfs_netbsd_fcntl },
6245 	{ NULL, NULL }
6246 };
6247 
6248 const struct vnodeopv_desc zfs_vnodeop_opv_desc =
6249 	{ &zfs_vnodeop_p, zfs_vnodeop_entries };
6250 
6251 int (**zfs_specop_p)(void *);
6252 const struct vnodeopv_entry_desc zfs_specop_entries[] = {
6253 	{ &vop_default_desc,		vn_default_error },
6254 	{ &vop_lookup_desc,		spec_lookup },
6255 	{ &vop_create_desc,		spec_create },
6256 	{ &vop_mknod_desc,		spec_mknod },
6257 	{ &vop_open_desc,		spec_open },
6258 	{ &vop_close_desc,		spec_close },
6259 	{ &vop_access_desc,		zfs_netbsd_access },
6260 	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6261 	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6262 	{ &vop_read_desc,		/**/zfs_netbsd_read },
6263 	{ &vop_write_desc,		/**/zfs_netbsd_write },
6264 	{ &vop_ioctl_desc,		spec_ioctl },
6265 	{ &vop_fsync_desc,		zfs_netbsd_fsync },
6266 	{ &vop_remove_desc,		spec_remove },
6267 	{ &vop_link_desc,		spec_link },
6268 	{ &vop_lock_desc,		zfs_netbsd_lock },
6269 	{ &vop_unlock_desc,		zfs_netbsd_unlock },
6270 	{ &vop_rename_desc,		spec_rename },
6271 	{ &vop_mkdir_desc,		spec_mkdir },
6272 	{ &vop_rmdir_desc,		spec_rmdir },
6273 	{ &vop_symlink_desc,		spec_symlink },
6274 	{ &vop_readdir_desc,		spec_readdir },
6275 	{ &vop_readlink_desc,		spec_readlink },
6276 	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6277 	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6278 	{ &vop_pathconf_desc,		spec_pathconf },
6279 	{ &vop_seek_desc,		spec_seek },
6280 	{ &vop_getpages_desc,		spec_getpages },
6281 	{ &vop_putpages_desc,		spec_putpages },
6282 	{ &vop_mmap_desc,		spec_mmap },
6283 	{ &vop_islocked_desc,		zfs_netbsd_islocked },
6284 	{ &vop_advlock_desc,		spec_advlock },
6285 	{ &vop_print_desc,		zfs_netbsd_print },
6286 	{ &vop_fcntl_desc,		zfs_netbsd_fcntl },
6287 	{ NULL, NULL }
6288 };
6289 
6290 const struct vnodeopv_desc zfs_specop_opv_desc =
6291 	{ &zfs_specop_p, zfs_specop_entries };
6292 
6293 int (**zfs_fifoop_p)(void *);
6294 const struct vnodeopv_entry_desc zfs_fifoop_entries[] = {
6295 	{ &vop_default_desc,		vn_default_error },
6296 	{ &vop_lookup_desc,		vn_fifo_bypass },
6297 	{ &vop_create_desc,		vn_fifo_bypass },
6298 	{ &vop_mknod_desc,		vn_fifo_bypass },
6299 	{ &vop_open_desc,		vn_fifo_bypass },
6300 	{ &vop_close_desc,		vn_fifo_bypass },
6301 	{ &vop_access_desc,		zfs_netbsd_access },
6302 	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6303 	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6304 	{ &vop_read_desc,		/**/zfs_netbsd_read },
6305 	{ &vop_write_desc,		/**/zfs_netbsd_write },
6306 	{ &vop_ioctl_desc,		vn_fifo_bypass },
6307 	{ &vop_fsync_desc,		zfs_netbsd_fsync },
6308 	{ &vop_remove_desc,		vn_fifo_bypass },
6309 	{ &vop_link_desc,		vn_fifo_bypass },
6310 	{ &vop_lock_desc,		zfs_netbsd_lock },
6311 	{ &vop_unlock_desc,		zfs_netbsd_unlock },
6312 	{ &vop_rename_desc,		vn_fifo_bypass },
6313 	{ &vop_mkdir_desc,		vn_fifo_bypass },
6314 	{ &vop_rmdir_desc,		vn_fifo_bypass },
6315 	{ &vop_symlink_desc,		vn_fifo_bypass },
6316 	{ &vop_readdir_desc,		vn_fifo_bypass },
6317 	{ &vop_readlink_desc,		vn_fifo_bypass },
6318 	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6319 	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6320 	{ &vop_pathconf_desc,		vn_fifo_bypass },
6321 	{ &vop_seek_desc,		vn_fifo_bypass },
6322 	{ &vop_putpages_desc,		vn_fifo_bypass },
6323 	{ &vop_mmap_desc,		vn_fifo_bypass },
6324 	{ &vop_islocked_desc,		zfs_netbsd_islocked },
6325 	{ &vop_advlock_desc,		vn_fifo_bypass },
6326 	{ &vop_print_desc,		zfs_netbsd_print },
6327 	{ &vop_fcntl_desc,		zfs_netbsd_fcntl },
6328 	{ NULL, NULL }
6329 };
6330 
6331 const struct vnodeopv_desc zfs_fifoop_opv_desc =
6332 	{ &zfs_fifoop_p, zfs_fifoop_entries };
6333 
6334 #endif /* __NetBSD__ */
6335