xref: /onnv-gate/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 13096:b02331b7b26d)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51705Sjwahlig  * Common Development and Distribution License (the "License").
61705Sjwahlig  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*13096SJordan.Vaughan@Sun.com  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
230Sstevel@tonic-gate  */
240Sstevel@tonic-gate 
250Sstevel@tonic-gate /*
260Sstevel@tonic-gate  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
270Sstevel@tonic-gate  *	All Rights Reserved
280Sstevel@tonic-gate  */
290Sstevel@tonic-gate 
300Sstevel@tonic-gate #include <sys/param.h>
310Sstevel@tonic-gate #include <sys/types.h>
320Sstevel@tonic-gate #include <sys/systm.h>
330Sstevel@tonic-gate #include <sys/thread.h>
340Sstevel@tonic-gate #include <sys/t_lock.h>
350Sstevel@tonic-gate #include <sys/time.h>
360Sstevel@tonic-gate #include <sys/vnode.h>
370Sstevel@tonic-gate #include <sys/vfs.h>
380Sstevel@tonic-gate #include <sys/errno.h>
390Sstevel@tonic-gate #include <sys/buf.h>
400Sstevel@tonic-gate #include <sys/stat.h>
410Sstevel@tonic-gate #include <sys/cred.h>
420Sstevel@tonic-gate #include <sys/kmem.h>
430Sstevel@tonic-gate #include <sys/debug.h>
440Sstevel@tonic-gate #include <sys/dnlc.h>
450Sstevel@tonic-gate #include <sys/vmsystm.h>
460Sstevel@tonic-gate #include <sys/flock.h>
470Sstevel@tonic-gate #include <sys/share.h>
480Sstevel@tonic-gate #include <sys/cmn_err.h>
490Sstevel@tonic-gate #include <sys/tiuser.h>
500Sstevel@tonic-gate #include <sys/sysmacros.h>
510Sstevel@tonic-gate #include <sys/callb.h>
520Sstevel@tonic-gate #include <sys/acl.h>
530Sstevel@tonic-gate #include <sys/kstat.h>
540Sstevel@tonic-gate #include <sys/signal.h>
550Sstevel@tonic-gate #include <sys/disp.h>
560Sstevel@tonic-gate #include <sys/atomic.h>
570Sstevel@tonic-gate #include <sys/list.h>
580Sstevel@tonic-gate #include <sys/sdt.h>
590Sstevel@tonic-gate 
600Sstevel@tonic-gate #include <rpc/types.h>
610Sstevel@tonic-gate #include <rpc/xdr.h>
620Sstevel@tonic-gate #include <rpc/auth.h>
630Sstevel@tonic-gate #include <rpc/clnt.h>
640Sstevel@tonic-gate 
650Sstevel@tonic-gate #include <nfs/nfs.h>
660Sstevel@tonic-gate #include <nfs/nfs_clnt.h>
670Sstevel@tonic-gate #include <nfs/nfs_acl.h>
680Sstevel@tonic-gate 
690Sstevel@tonic-gate #include <nfs/nfs4.h>
700Sstevel@tonic-gate #include <nfs/rnode4.h>
710Sstevel@tonic-gate #include <nfs/nfs4_clnt.h>
720Sstevel@tonic-gate 
730Sstevel@tonic-gate #include <vm/hat.h>
740Sstevel@tonic-gate #include <vm/as.h>
750Sstevel@tonic-gate #include <vm/page.h>
760Sstevel@tonic-gate #include <vm/pvn.h>
770Sstevel@tonic-gate #include <vm/seg.h>
780Sstevel@tonic-gate #include <vm/seg_map.h>
790Sstevel@tonic-gate #include <vm/seg_vn.h>
800Sstevel@tonic-gate 
810Sstevel@tonic-gate #include <sys/ddi.h>
820Sstevel@tonic-gate 
/*
 * Arguments to page-flush thread.
 *
 * nfs4_purge_caches() allocates one of these, takes a hold on both the
 * vnode and the credential, and passes it to nfs4_pgflush_thread().  The
 * thread drops both holds and frees the structure once the flush is done.
 */
typedef struct {
	vnode_t *vp;		/* file whose pages are flushed (held) */
	cred_t *cr;		/* credentials used for the flush (held) */
} pgflush_t;
900Sstevel@tonic-gate 
#ifdef DEBUG
/*
 * Debug-only tunables.
 * NOTE(review): the consumers of the three flags below are not in this
 * part of the file; presumably they enable extra diagnostics for lease
 * handling, shared filehandles and file names -- confirm against users.
 */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * calling thread's value for this key is NULL, i.e. that the
 * ..._end_op() call has already been done.
 */
uint_t nfs4_tsd_key;
#endif
1010Sstevel@tonic-gate 
/*
 * NOTE(review): presumably the time of the last CPR resume and the id of
 * the registered CPR callback (see nfs4_client_cpr_callb); both are used
 * outside this part of the file -- confirm.
 */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations of file-local routines. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);
1100Sstevel@tonic-gate 
/*
 * Bookkeeping for the NFS v4 mounts of one zone.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	/*
	 * NOTE(review): presumably set once the zone destructor has run;
	 * the code that sets it is not in this part of the file -- confirm.
	 */
	boolean_t	mig_destructor_called;
};

/*
 * NOTE(review): presumably the zone key under which each zone's
 * struct mi4_globals is stored -- confirm against the zone_key_create()
 * caller.
 */
static zone_key_t mi4_list_key;
1180Sstevel@tonic-gate 
1190Sstevel@tonic-gate /*
1200Sstevel@tonic-gate  * Attributes caching:
1210Sstevel@tonic-gate  *
1220Sstevel@tonic-gate  * Attributes are cached in the rnode in struct vattr form.
1230Sstevel@tonic-gate  * There is a time associated with the cached attributes (r_time_attr_inval)
1240Sstevel@tonic-gate  * which tells whether the attributes are valid. The time is initialized
1250Sstevel@tonic-gate  * to the difference between current time and the modify time of the vnode
1260Sstevel@tonic-gate  * when new attributes are cached. This allows the attributes for
1270Sstevel@tonic-gate  * files that have changed recently to be timed out sooner than for files
1280Sstevel@tonic-gate  * that have not changed for a long time. There are minimum and maximum
1290Sstevel@tonic-gate  * timeout values that can be set per mount point.
1300Sstevel@tonic-gate  */
1310Sstevel@tonic-gate 
1320Sstevel@tonic-gate /*
1330Sstevel@tonic-gate  * If a cache purge is in progress, wait for it to finish.
1340Sstevel@tonic-gate  *
1350Sstevel@tonic-gate  * The current thread must not be in the middle of an
1360Sstevel@tonic-gate  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
1370Sstevel@tonic-gate  * between this thread, a recovery thread, and the page flush thread.
1380Sstevel@tonic-gate  */
1390Sstevel@tonic-gate int
nfs4_waitfor_purge_complete(vnode_t * vp)1400Sstevel@tonic-gate nfs4_waitfor_purge_complete(vnode_t *vp)
1410Sstevel@tonic-gate {
1420Sstevel@tonic-gate 	rnode4_t *rp;
1430Sstevel@tonic-gate 	k_sigset_t smask;
1440Sstevel@tonic-gate 
1450Sstevel@tonic-gate 	rp = VTOR4(vp);
1460Sstevel@tonic-gate 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
1470Sstevel@tonic-gate 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
1480Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
1490Sstevel@tonic-gate 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
1500Sstevel@tonic-gate 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
1510Sstevel@tonic-gate 		    ((rp->r_flags & R4PGFLUSH) &&
1520Sstevel@tonic-gate 		    rp->r_pgflush != curthread)) {
1530Sstevel@tonic-gate 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
1540Sstevel@tonic-gate 				sigunintr(&smask);
1550Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
1560Sstevel@tonic-gate 				return (EINTR);
1570Sstevel@tonic-gate 			}
1580Sstevel@tonic-gate 		}
1590Sstevel@tonic-gate 		sigunintr(&smask);
1600Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
1610Sstevel@tonic-gate 	}
1620Sstevel@tonic-gate 	return (0);
1630Sstevel@tonic-gate }
1640Sstevel@tonic-gate 
1650Sstevel@tonic-gate /*
1660Sstevel@tonic-gate  * Validate caches by checking cached attributes. If they have timed out,
1670Sstevel@tonic-gate  * then get new attributes from the server.  As a side effect, cache
1680Sstevel@tonic-gate  * invalidation is done if the attributes have changed.
1690Sstevel@tonic-gate  *
1700Sstevel@tonic-gate  * If the attributes have not timed out and if there is a cache
1710Sstevel@tonic-gate  * invalidation being done by some other thread, then wait until that
1720Sstevel@tonic-gate  * thread has completed the cache invalidation.
1730Sstevel@tonic-gate  */
1740Sstevel@tonic-gate int
nfs4_validate_caches(vnode_t * vp,cred_t * cr)1750Sstevel@tonic-gate nfs4_validate_caches(vnode_t *vp, cred_t *cr)
1760Sstevel@tonic-gate {
1770Sstevel@tonic-gate 	int error;
1780Sstevel@tonic-gate 	nfs4_ga_res_t gar;
1790Sstevel@tonic-gate 
1800Sstevel@tonic-gate 	if (ATTRCACHE4_VALID(vp)) {
1810Sstevel@tonic-gate 		error = nfs4_waitfor_purge_complete(vp);
1820Sstevel@tonic-gate 		if (error)
1830Sstevel@tonic-gate 			return (error);
1840Sstevel@tonic-gate 		return (0);
1850Sstevel@tonic-gate 	}
1860Sstevel@tonic-gate 
1870Sstevel@tonic-gate 	gar.n4g_va.va_mask = AT_ALL;
1880Sstevel@tonic-gate 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
1890Sstevel@tonic-gate }
1900Sstevel@tonic-gate 
1910Sstevel@tonic-gate /*
1920Sstevel@tonic-gate  * Fill in attribute from the cache.
1930Sstevel@tonic-gate  * If valid, then return 0 to indicate that no error occurred,
1940Sstevel@tonic-gate  * otherwise return 1 to indicate that an error occurred.
1950Sstevel@tonic-gate  */
1960Sstevel@tonic-gate static int
nfs4_getattr_cache(vnode_t * vp,struct vattr * vap)1970Sstevel@tonic-gate nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
1980Sstevel@tonic-gate {
1990Sstevel@tonic-gate 	rnode4_t *rp;
2000Sstevel@tonic-gate 
2010Sstevel@tonic-gate 	rp = VTOR4(vp);
2020Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
2030Sstevel@tonic-gate 	mutex_enter(&rp->r_statev4_lock);
2040Sstevel@tonic-gate 	if (ATTRCACHE4_VALID(vp)) {
2050Sstevel@tonic-gate 		mutex_exit(&rp->r_statev4_lock);
2060Sstevel@tonic-gate 		/*
2070Sstevel@tonic-gate 		 * Cached attributes are valid
2080Sstevel@tonic-gate 		 */
2090Sstevel@tonic-gate 		*vap = rp->r_attr;
2100Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
2110Sstevel@tonic-gate 		return (0);
2120Sstevel@tonic-gate 	}
2130Sstevel@tonic-gate 	mutex_exit(&rp->r_statev4_lock);
2140Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
2150Sstevel@tonic-gate 	return (1);
2160Sstevel@tonic-gate }
2170Sstevel@tonic-gate 
2180Sstevel@tonic-gate 
2190Sstevel@tonic-gate /*
2200Sstevel@tonic-gate  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
2210Sstevel@tonic-gate  * call is synchronous because all the pages were invalidated by the
2220Sstevel@tonic-gate  * nfs4_invalidate_pages() call.
2230Sstevel@tonic-gate  */
2240Sstevel@tonic-gate void
nfs4_purge_stale_fh(int errno,vnode_t * vp,cred_t * cr)2250Sstevel@tonic-gate nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
2260Sstevel@tonic-gate {
2270Sstevel@tonic-gate 	struct rnode4 *rp = VTOR4(vp);
2280Sstevel@tonic-gate 
2290Sstevel@tonic-gate 	/* Ensure that the ..._end_op() call has been done */
2300Sstevel@tonic-gate 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate 	if (errno != ESTALE)
2330Sstevel@tonic-gate 		return;
2340Sstevel@tonic-gate 
2350Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
2360Sstevel@tonic-gate 	rp->r_flags |= R4STALE;
2370Sstevel@tonic-gate 	if (!rp->r_error)
2380Sstevel@tonic-gate 		rp->r_error = errno;
2390Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
2400Sstevel@tonic-gate 	if (nfs4_has_pages(vp))
2410Sstevel@tonic-gate 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
2420Sstevel@tonic-gate 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
2430Sstevel@tonic-gate }
2440Sstevel@tonic-gate 
2450Sstevel@tonic-gate /*
2460Sstevel@tonic-gate  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
2470Sstevel@tonic-gate  * page purge is done asynchronously.
2480Sstevel@tonic-gate  */
2490Sstevel@tonic-gate void
nfs4_purge_caches(vnode_t * vp,int purge_dnlc,cred_t * cr,int asyncpg)2500Sstevel@tonic-gate nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
2510Sstevel@tonic-gate {
2520Sstevel@tonic-gate 	rnode4_t *rp;
2530Sstevel@tonic-gate 	char *contents;
2540Sstevel@tonic-gate 	vnode_t *xattr;
2550Sstevel@tonic-gate 	int size;
2560Sstevel@tonic-gate 	int pgflush;			/* are we the page flush thread? */
2570Sstevel@tonic-gate 
2580Sstevel@tonic-gate 	/*
2590Sstevel@tonic-gate 	 * Purge the DNLC for any entries which refer to this file.
2600Sstevel@tonic-gate 	 */
2610Sstevel@tonic-gate 	if (vp->v_count > 1 &&
2620Sstevel@tonic-gate 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
2630Sstevel@tonic-gate 		dnlc_purge_vp(vp);
2640Sstevel@tonic-gate 
2650Sstevel@tonic-gate 	/*
2660Sstevel@tonic-gate 	 * Clear any readdir state bits and purge the readlink response cache.
2670Sstevel@tonic-gate 	 */
2680Sstevel@tonic-gate 	rp = VTOR4(vp);
2690Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
2700Sstevel@tonic-gate 	rp->r_flags &= ~R4LOOKUP;
2710Sstevel@tonic-gate 	contents = rp->r_symlink.contents;
2720Sstevel@tonic-gate 	size = rp->r_symlink.size;
2730Sstevel@tonic-gate 	rp->r_symlink.contents = NULL;
2740Sstevel@tonic-gate 
2750Sstevel@tonic-gate 	xattr = rp->r_xattr_dir;
2760Sstevel@tonic-gate 	rp->r_xattr_dir = NULL;
2770Sstevel@tonic-gate 
2780Sstevel@tonic-gate 	/*
2790Sstevel@tonic-gate 	 * Purge pathconf cache too.
2800Sstevel@tonic-gate 	 */
2810Sstevel@tonic-gate 	rp->r_pathconf.pc4_xattr_valid = 0;
2820Sstevel@tonic-gate 	rp->r_pathconf.pc4_cache_valid = 0;
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	pgflush = (curthread == rp->r_pgflush);
2850Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
2860Sstevel@tonic-gate 
2870Sstevel@tonic-gate 	if (contents != NULL) {
2880Sstevel@tonic-gate 
2890Sstevel@tonic-gate 		kmem_free((void *)contents, size);
2900Sstevel@tonic-gate 	}
2910Sstevel@tonic-gate 
2920Sstevel@tonic-gate 	if (xattr != NULL)
2930Sstevel@tonic-gate 		VN_RELE(xattr);
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate 	/*
2960Sstevel@tonic-gate 	 * Flush the page cache.  If the current thread is the page flush
2970Sstevel@tonic-gate 	 * thread, don't initiate a new page flush.  There's no need for
2980Sstevel@tonic-gate 	 * it, and doing it correctly is hard.
2990Sstevel@tonic-gate 	 */
3000Sstevel@tonic-gate 	if (nfs4_has_pages(vp) && !pgflush) {
3010Sstevel@tonic-gate 		if (!asyncpg) {
3020Sstevel@tonic-gate 			(void) nfs4_waitfor_purge_complete(vp);
30310276SPavel.Filipensky@Sun.COM 			nfs4_flush_pages(vp, cr);
3040Sstevel@tonic-gate 		} else {
3050Sstevel@tonic-gate 			pgflush_t *args;
3060Sstevel@tonic-gate 
3070Sstevel@tonic-gate 			/*
3080Sstevel@tonic-gate 			 * We don't hold r_statelock while creating the
3090Sstevel@tonic-gate 			 * thread, in case the call blocks.  So we use a
3100Sstevel@tonic-gate 			 * flag to indicate that a page flush thread is
3110Sstevel@tonic-gate 			 * active.
3120Sstevel@tonic-gate 			 */
3130Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
3140Sstevel@tonic-gate 			if (rp->r_flags & R4PGFLUSH) {
3150Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
3160Sstevel@tonic-gate 			} else {
3170Sstevel@tonic-gate 				rp->r_flags |= R4PGFLUSH;
3180Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
3190Sstevel@tonic-gate 
3200Sstevel@tonic-gate 				args = kmem_alloc(sizeof (pgflush_t),
3215302Sth199096 				    KM_SLEEP);
3220Sstevel@tonic-gate 				args->vp = vp;
3230Sstevel@tonic-gate 				VN_HOLD(args->vp);
3240Sstevel@tonic-gate 				args->cr = cr;
3250Sstevel@tonic-gate 				crhold(args->cr);
3260Sstevel@tonic-gate 				(void) zthread_create(NULL, 0,
3275302Sth199096 				    nfs4_pgflush_thread, args, 0,
3285302Sth199096 				    minclsyspri);
3290Sstevel@tonic-gate 			}
3300Sstevel@tonic-gate 		}
3310Sstevel@tonic-gate 	}
3320Sstevel@tonic-gate 
3330Sstevel@tonic-gate 	/*
3340Sstevel@tonic-gate 	 * Flush the readdir response cache.
3350Sstevel@tonic-gate 	 */
3360Sstevel@tonic-gate 	nfs4_purge_rddir_cache(vp);
3370Sstevel@tonic-gate }
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate /*
3400Sstevel@tonic-gate  * Invalidate all pages for the given file, after writing back the dirty
3410Sstevel@tonic-gate  * ones.
3420Sstevel@tonic-gate  */
3430Sstevel@tonic-gate 
34410276SPavel.Filipensky@Sun.COM void
nfs4_flush_pages(vnode_t * vp,cred_t * cr)34510276SPavel.Filipensky@Sun.COM nfs4_flush_pages(vnode_t *vp, cred_t *cr)
3460Sstevel@tonic-gate {
3470Sstevel@tonic-gate 	int error;
3480Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
3490Sstevel@tonic-gate 
3505331Samw 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
3510Sstevel@tonic-gate 	if (error == ENOSPC || error == EDQUOT) {
3520Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
3530Sstevel@tonic-gate 		if (!rp->r_error)
3540Sstevel@tonic-gate 			rp->r_error = error;
3550Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
3560Sstevel@tonic-gate 	}
3570Sstevel@tonic-gate }
3580Sstevel@tonic-gate 
3590Sstevel@tonic-gate /*
3600Sstevel@tonic-gate  * Page flush thread.
3610Sstevel@tonic-gate  */
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate static void
nfs4_pgflush_thread(pgflush_t * args)3640Sstevel@tonic-gate nfs4_pgflush_thread(pgflush_t *args)
3650Sstevel@tonic-gate {
3660Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(args->vp);
3670Sstevel@tonic-gate 
3680Sstevel@tonic-gate 	/* remember which thread we are, so we don't deadlock ourselves */
3690Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
3700Sstevel@tonic-gate 	ASSERT(rp->r_pgflush == NULL);
3710Sstevel@tonic-gate 	rp->r_pgflush = curthread;
3720Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
3730Sstevel@tonic-gate 
37410276SPavel.Filipensky@Sun.COM 	nfs4_flush_pages(args->vp, args->cr);
3750Sstevel@tonic-gate 
3760Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
3770Sstevel@tonic-gate 	rp->r_pgflush = NULL;
3780Sstevel@tonic-gate 	rp->r_flags &= ~R4PGFLUSH;
3790Sstevel@tonic-gate 	cv_broadcast(&rp->r_cv);
3800Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 	VN_RELE(args->vp);
3830Sstevel@tonic-gate 	crfree(args->cr);
3840Sstevel@tonic-gate 	kmem_free(args, sizeof (pgflush_t));
3850Sstevel@tonic-gate 	zthread_exit();
3860Sstevel@tonic-gate }
3870Sstevel@tonic-gate 
3880Sstevel@tonic-gate /*
3890Sstevel@tonic-gate  * Purge the readdir cache of all entries which are not currently
3900Sstevel@tonic-gate  * being filled.
3910Sstevel@tonic-gate  */
3920Sstevel@tonic-gate void
nfs4_purge_rddir_cache(vnode_t * vp)3930Sstevel@tonic-gate nfs4_purge_rddir_cache(vnode_t *vp)
3940Sstevel@tonic-gate {
3950Sstevel@tonic-gate 	rnode4_t *rp;
3960Sstevel@tonic-gate 
3970Sstevel@tonic-gate 	rp = VTOR4(vp);
3980Sstevel@tonic-gate 
3990Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
4000Sstevel@tonic-gate 	rp->r_direof = NULL;
4010Sstevel@tonic-gate 	rp->r_flags &= ~R4LOOKUP;
4020Sstevel@tonic-gate 	rp->r_flags |= R4READDIRWATTR;
4030Sstevel@tonic-gate 	rddir4_cache_purge(rp);
4040Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
4050Sstevel@tonic-gate }
4060Sstevel@tonic-gate 
4070Sstevel@tonic-gate /*
4080Sstevel@tonic-gate  * Set attributes cache for given vnode using virtual attributes.  There is
4090Sstevel@tonic-gate  * no cache validation, but if the attributes are deemed to be stale, they
4100Sstevel@tonic-gate  * are ignored.  This corresponds to nfs3_attrcache().
4110Sstevel@tonic-gate  *
4120Sstevel@tonic-gate  * Set the timeout value on the attribute cache and fill it
4130Sstevel@tonic-gate  * with the passed in attributes.
4140Sstevel@tonic-gate  */
4150Sstevel@tonic-gate void
nfs4_attrcache_noinval(vnode_t * vp,nfs4_ga_res_t * garp,hrtime_t t)4160Sstevel@tonic-gate nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
4170Sstevel@tonic-gate {
4180Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
4190Sstevel@tonic-gate 
4200Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
4210Sstevel@tonic-gate 	if (rp->r_time_attr_saved <= t)
4220Sstevel@tonic-gate 		nfs4_attrcache_va(vp, garp, FALSE);
4230Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
4240Sstevel@tonic-gate }
4250Sstevel@tonic-gate 
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	vnode whose caches are validated
 *	garp	the new attributes; garp->n4g_va is what gets cached
 *	t	time stamp compared against r_time_attr_saved; if
 *		attributes were saved after "t" the new ones are not
 *		cached
 *	cr	credentials passed through to nfs4_purge_caches()
 *	async	passed to nfs4_purge_caches() as its "asyncpg" argument
 *		(forced to 1 when the caller is the recovery thread)
 *	cinfo	change info of a directory modifying operation, or NULL
 *
 * There is no return value.  The routine returns early, without caching
 * the new attributes, when:
 *	- the caller is the recovery thread and another thread holds
 *	  r_serial (the attribute cache is purged instead),
 *	- the wait for r_serial is interrupted by a signal,
 *	- a page flush thread other than the caller is active (the
 *	  attribute cache is purged instead),
 *	- attributes were saved after "t" (the attribute cache is purged
 *	  if the new attributes disagree with what is cached).
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/* Remember whether this thread already held r_serial on entry. */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/*
		 * Wait for the current r_serial holder to finish.
		 * NOTE(review): lwp_nostop is presumably raised so the lwp
		 * is not stopped while it sleeps here -- confirm.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				/*
				 * Interrupted by a signal: give up without
				 * caching the new attributes.
				 */
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the dir didn't change on the server before the dirmod
		 * op, and the dir didn't change after the dirmod op but
		 * before the getattr, then there's a chance that the client's
		 * cached data for this object is current (not stale).  No
		 * immediate cache flush is required.
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  Flush all caches for dir.  Remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/* Remember the file size from before the new attributes go in. */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * Stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		/*
		 * cachepurge_set records that this call set R4INCACHEPURGE,
		 * so that only this call clears it again below.
		 */
		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* Nothing changed: the attributes are cached and we are done. */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Take the r_serial pseudo lock so that other threads wait (in the
	 * loop above, or in nfs4_waitfor_purge_complete()) until the
	 * invalidation below has finished.
	 */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	/* The purge has been issued: release the threads stalled on it. */
	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	/* A ctime change drops the cached access info and the cached ACL. */
	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/* Detach under the lock, free after dropping it. */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	/*
	 * Release r_serial only if this call acquired it; a caller that
	 * already held it on entry keeps holding it.
	 */
	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}
6600Sstevel@tonic-gate 
6610Sstevel@tonic-gate /*
6620Sstevel@tonic-gate  * Set attributes cache for given vnode using virtual attributes.
6630Sstevel@tonic-gate  *
6640Sstevel@tonic-gate  * Set the timeout value on the attribute cache and fill it
6650Sstevel@tonic-gate  * with the passed in attributes.
6660Sstevel@tonic-gate  *
6670Sstevel@tonic-gate  * The caller must be holding r_statelock.
6680Sstevel@tonic-gate  */
6690Sstevel@tonic-gate static void
nfs4_attrcache_va(vnode_t * vp,nfs4_ga_res_t * garp,int set_cache_timeout)6700Sstevel@tonic-gate nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
6710Sstevel@tonic-gate {
6720Sstevel@tonic-gate 	rnode4_t *rp;
6730Sstevel@tonic-gate 	mntinfo4_t *mi;
6740Sstevel@tonic-gate 	hrtime_t delta;
6750Sstevel@tonic-gate 	hrtime_t now;
6760Sstevel@tonic-gate 	vattr_t *vap = &garp->n4g_va;
6770Sstevel@tonic-gate 
6780Sstevel@tonic-gate 	rp = VTOR4(vp);
6790Sstevel@tonic-gate 
6800Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&rp->r_statelock));
6810Sstevel@tonic-gate 	ASSERT(vap->va_mask == AT_ALL);
6820Sstevel@tonic-gate 
6830Sstevel@tonic-gate 	/* Switch to master before checking v_flag */
6840Sstevel@tonic-gate 	if (IS_SHADOW(vp, rp))
6850Sstevel@tonic-gate 		vp = RTOV4(rp);
6860Sstevel@tonic-gate 
6870Sstevel@tonic-gate 	now = gethrtime();
6880Sstevel@tonic-gate 
6890Sstevel@tonic-gate 	mi = VTOMI4(vp);
6900Sstevel@tonic-gate 
6910Sstevel@tonic-gate 	/*
6920Sstevel@tonic-gate 	 * Only establish a new cache timeout (if requested).  Never
6930Sstevel@tonic-gate 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
6940Sstevel@tonic-gate 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
6950Sstevel@tonic-gate 	 */
6960Sstevel@tonic-gate 	if (set_cache_timeout && ! rp->r_time_cache_inval)
6970Sstevel@tonic-gate 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
6980Sstevel@tonic-gate 
6990Sstevel@tonic-gate 	/*
7000Sstevel@tonic-gate 	 * Delta is the number of nanoseconds that we will
7010Sstevel@tonic-gate 	 * cache the attributes of the file.  It is based on
7020Sstevel@tonic-gate 	 * the number of nanoseconds since the last time that
7030Sstevel@tonic-gate 	 * we detected a change.  The assumption is that files
7040Sstevel@tonic-gate 	 * that changed recently are likely to change again.
7050Sstevel@tonic-gate 	 * There is a minimum and a maximum for regular files
7060Sstevel@tonic-gate 	 * and for directories which is enforced though.
7070Sstevel@tonic-gate 	 *
7080Sstevel@tonic-gate 	 * Using the time since last change was detected
7090Sstevel@tonic-gate 	 * eliminates direct comparison or calculation
7100Sstevel@tonic-gate 	 * using mixed client and server times.  NFS does
7110Sstevel@tonic-gate 	 * not make any assumptions regarding the client
7120Sstevel@tonic-gate 	 * and server clocks being synchronized.
7130Sstevel@tonic-gate 	 */
7140Sstevel@tonic-gate 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
7150Sstevel@tonic-gate 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
7160Sstevel@tonic-gate 	    vap->va_size != rp->r_attr.va_size) {
7170Sstevel@tonic-gate 		rp->r_time_attr_saved = now;
7180Sstevel@tonic-gate 	}
7190Sstevel@tonic-gate 
7200Sstevel@tonic-gate 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
7210Sstevel@tonic-gate 		delta = 0;
7220Sstevel@tonic-gate 	else {
7230Sstevel@tonic-gate 		delta = now - rp->r_time_attr_saved;
7240Sstevel@tonic-gate 		if (vp->v_type == VDIR) {
7250Sstevel@tonic-gate 			if (delta < mi->mi_acdirmin)
7260Sstevel@tonic-gate 				delta = mi->mi_acdirmin;
7270Sstevel@tonic-gate 			else if (delta > mi->mi_acdirmax)
7280Sstevel@tonic-gate 				delta = mi->mi_acdirmax;
7290Sstevel@tonic-gate 		} else {
7300Sstevel@tonic-gate 			if (delta < mi->mi_acregmin)
7310Sstevel@tonic-gate 				delta = mi->mi_acregmin;
7320Sstevel@tonic-gate 			else if (delta > mi->mi_acregmax)
7330Sstevel@tonic-gate 				delta = mi->mi_acregmax;
7340Sstevel@tonic-gate 		}
7350Sstevel@tonic-gate 	}
7360Sstevel@tonic-gate 	rp->r_time_attr_inval = now + delta;
7370Sstevel@tonic-gate 
7380Sstevel@tonic-gate 	rp->r_attr = *vap;
7390Sstevel@tonic-gate 	if (garp->n4g_change_valid)
7400Sstevel@tonic-gate 		rp->r_change = garp->n4g_change;
7410Sstevel@tonic-gate 
7420Sstevel@tonic-gate 	/*
7430Sstevel@tonic-gate 	 * The attributes that were returned may be valid and can
7440Sstevel@tonic-gate 	 * be used, but they may not be allowed to be cached.
7450Sstevel@tonic-gate 	 * Reset the timers to cause immediate invalidation and
7460Sstevel@tonic-gate 	 * clear r_change so no VERIFY operations will suceed
7470Sstevel@tonic-gate 	 */
7480Sstevel@tonic-gate 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
7490Sstevel@tonic-gate 		rp->r_time_attr_inval = now;
7500Sstevel@tonic-gate 		rp->r_time_attr_saved = now;
7510Sstevel@tonic-gate 		rp->r_change = 0;
7520Sstevel@tonic-gate 	}
7530Sstevel@tonic-gate 
7540Sstevel@tonic-gate 	/*
7550Sstevel@tonic-gate 	 * If mounted_on_fileid returned AND the object is a stub,
7560Sstevel@tonic-gate 	 * then set object's va_nodeid to the mounted over fid
7570Sstevel@tonic-gate 	 * returned by server.
7580Sstevel@tonic-gate 	 *
7590Sstevel@tonic-gate 	 * If mounted_on_fileid not provided/supported, then
7600Sstevel@tonic-gate 	 * just set it to 0 for now.  Eventually it would be
7610Sstevel@tonic-gate 	 * better to set it to a hashed version of FH.  This
7620Sstevel@tonic-gate 	 * would probably be good enough to provide a unique
7630Sstevel@tonic-gate 	 * fid/d_ino within a dir.
7640Sstevel@tonic-gate 	 *
7650Sstevel@tonic-gate 	 * We don't need to carry mounted_on_fileid in the
7660Sstevel@tonic-gate 	 * rnode as long as the client never requests fileid
7670Sstevel@tonic-gate 	 * without also requesting mounted_on_fileid.  For
7680Sstevel@tonic-gate 	 * now, it stays.
7690Sstevel@tonic-gate 	 */
7700Sstevel@tonic-gate 	if (garp->n4g_mon_fid_valid) {
7710Sstevel@tonic-gate 		rp->r_mntd_fid = garp->n4g_mon_fid;
7720Sstevel@tonic-gate 
7735302Sth199096 		if (RP_ISSTUB(rp))
7740Sstevel@tonic-gate 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
7750Sstevel@tonic-gate 	}
7760Sstevel@tonic-gate 
7770Sstevel@tonic-gate 	/*
7780Sstevel@tonic-gate 	 * Check to see if there are valid pathconf bits to
7790Sstevel@tonic-gate 	 * cache in the rnode.
7800Sstevel@tonic-gate 	 */
7810Sstevel@tonic-gate 	if (garp->n4g_ext_res) {
7820Sstevel@tonic-gate 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
7830Sstevel@tonic-gate 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
7840Sstevel@tonic-gate 		} else {
7850Sstevel@tonic-gate 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
7860Sstevel@tonic-gate 				rp->r_pathconf.pc4_xattr_valid = TRUE;
7870Sstevel@tonic-gate 				rp->r_pathconf.pc4_xattr_exists =
7880Sstevel@tonic-gate 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
7890Sstevel@tonic-gate 			}
7900Sstevel@tonic-gate 		}
7910Sstevel@tonic-gate 	}
7920Sstevel@tonic-gate 	/*
7930Sstevel@tonic-gate 	 * Update the size of the file if there is no cached data or if
7940Sstevel@tonic-gate 	 * the cached data is clean and there is no data being written
7950Sstevel@tonic-gate 	 * out.
7960Sstevel@tonic-gate 	 */
7970Sstevel@tonic-gate 	if (rp->r_size != vap->va_size &&
7980Sstevel@tonic-gate 	    (!vn_has_cached_data(vp) ||
7990Sstevel@tonic-gate 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
8000Sstevel@tonic-gate 		rp->r_size = vap->va_size;
8010Sstevel@tonic-gate 	}
8020Sstevel@tonic-gate 	nfs_setswaplike(vp, vap);
8030Sstevel@tonic-gate 	rp->r_flags &= ~R4WRITEMODIFIED;
8040Sstevel@tonic-gate }
8050Sstevel@tonic-gate 
8060Sstevel@tonic-gate /*
8070Sstevel@tonic-gate  * Get attributes over-the-wire and update attributes cache
8080Sstevel@tonic-gate  * if no error occurred in the over-the-wire operation.
8090Sstevel@tonic-gate  * Return 0 if successful, otherwise error.
8100Sstevel@tonic-gate  */
8110Sstevel@tonic-gate int
nfs4_getattr_otw(vnode_t * vp,nfs4_ga_res_t * garp,cred_t * cr,int get_acl)8120Sstevel@tonic-gate nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
8130Sstevel@tonic-gate {
8140Sstevel@tonic-gate 	mntinfo4_t *mi = VTOMI4(vp);
8150Sstevel@tonic-gate 	hrtime_t t;
8160Sstevel@tonic-gate 	nfs4_recov_state_t recov_state;
8170Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8180Sstevel@tonic-gate 
8190Sstevel@tonic-gate 	recov_state.rs_flags = 0;
8200Sstevel@tonic-gate 	recov_state.rs_num_retry_despite_err = 0;
8210Sstevel@tonic-gate 
8220Sstevel@tonic-gate 	/* Save the original mount point security flavor */
8230Sstevel@tonic-gate 	(void) save_mnt_secinfo(mi->mi_curr_serv);
8240Sstevel@tonic-gate 
8250Sstevel@tonic-gate recov_retry:
8265302Sth199096 
8270Sstevel@tonic-gate 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
8285302Sth199096 	    &recov_state, NULL))) {
8290Sstevel@tonic-gate 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
8300Sstevel@tonic-gate 		return (e.error);
8310Sstevel@tonic-gate 	}
8320Sstevel@tonic-gate 
8330Sstevel@tonic-gate 	t = gethrtime();
8340Sstevel@tonic-gate 
8350Sstevel@tonic-gate 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
8360Sstevel@tonic-gate 
8370Sstevel@tonic-gate 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
8380Sstevel@tonic-gate 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
83911291SRobert.Thurlow@Sun.COM 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
8400Sstevel@tonic-gate 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
8415302Sth199096 			    &recov_state, 1);
8420Sstevel@tonic-gate 			goto recov_retry;
8430Sstevel@tonic-gate 		}
8440Sstevel@tonic-gate 	}
8450Sstevel@tonic-gate 
8460Sstevel@tonic-gate 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
8470Sstevel@tonic-gate 
8480Sstevel@tonic-gate 	if (!e.error) {
8490Sstevel@tonic-gate 		if (e.stat == NFS4_OK) {
8500Sstevel@tonic-gate 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
8510Sstevel@tonic-gate 		} else {
8520Sstevel@tonic-gate 			e.error = geterrno4(e.stat);
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate 			nfs4_purge_stale_fh(e.error, vp, cr);
8550Sstevel@tonic-gate 		}
8560Sstevel@tonic-gate 	}
8570Sstevel@tonic-gate 
8580Sstevel@tonic-gate 	/*
8590Sstevel@tonic-gate 	 * If getattr a node that is a stub for a crossed
8600Sstevel@tonic-gate 	 * mount point, keep the original secinfo flavor for
8610Sstevel@tonic-gate 	 * the current file system, not the crossed one.
8620Sstevel@tonic-gate 	 */
8630Sstevel@tonic-gate 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
8640Sstevel@tonic-gate 
8650Sstevel@tonic-gate 	return (e.error);
8660Sstevel@tonic-gate }
8670Sstevel@tonic-gate 
8680Sstevel@tonic-gate /*
8690Sstevel@tonic-gate  * Generate a compound to get attributes over-the-wire.
8700Sstevel@tonic-gate  */
8710Sstevel@tonic-gate void
nfs4_getattr_otw_norecovery(vnode_t * vp,nfs4_ga_res_t * garp,nfs4_error_t * ep,cred_t * cr,int get_acl)8720Sstevel@tonic-gate nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
8735302Sth199096     nfs4_error_t *ep, cred_t *cr, int get_acl)
8740Sstevel@tonic-gate {
8750Sstevel@tonic-gate 	COMPOUND4args_clnt args;
8760Sstevel@tonic-gate 	COMPOUND4res_clnt res;
8770Sstevel@tonic-gate 	int doqueue;
8780Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
8790Sstevel@tonic-gate 	nfs_argop4 argop[2];
8800Sstevel@tonic-gate 
8810Sstevel@tonic-gate 	args.ctag = TAG_GETATTR;
8820Sstevel@tonic-gate 
8830Sstevel@tonic-gate 	args.array_len = 2;
8840Sstevel@tonic-gate 	args.array = argop;
8850Sstevel@tonic-gate 
8860Sstevel@tonic-gate 	/* putfh */
8870Sstevel@tonic-gate 	argop[0].argop = OP_CPUTFH;
8880Sstevel@tonic-gate 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
8890Sstevel@tonic-gate 
8900Sstevel@tonic-gate 	/* getattr */
8910Sstevel@tonic-gate 	/*
8920Sstevel@tonic-gate 	 * Unlike nfs version 2 and 3, where getattr returns all the
8935331Samw 	 * attributes, nfs version 4 returns only the ones explicitly
8940Sstevel@tonic-gate 	 * asked for. This creates problems, as some system functions
8950Sstevel@tonic-gate 	 * (e.g. cache check) require certain attributes and if the
8960Sstevel@tonic-gate 	 * cached node lacks some attributes such as uid/gid, it can
8970Sstevel@tonic-gate 	 * affect system utilities (e.g. "ls") that rely on the information
8980Sstevel@tonic-gate 	 * to be there. This can lead to anything from system crashes to
8990Sstevel@tonic-gate 	 * corrupted information processed by user apps.
9000Sstevel@tonic-gate 	 * So to ensure that all bases are covered, request at least
9010Sstevel@tonic-gate 	 * the AT_ALL attribute mask.
9020Sstevel@tonic-gate 	 */
9030Sstevel@tonic-gate 	argop[1].argop = OP_GETATTR;
9040Sstevel@tonic-gate 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9050Sstevel@tonic-gate 	if (get_acl)
9060Sstevel@tonic-gate 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
9070Sstevel@tonic-gate 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
9080Sstevel@tonic-gate 
9090Sstevel@tonic-gate 	doqueue = 1;
9100Sstevel@tonic-gate 
9110Sstevel@tonic-gate 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
9120Sstevel@tonic-gate 
9130Sstevel@tonic-gate 	if (ep->error)
9140Sstevel@tonic-gate 		return;
9150Sstevel@tonic-gate 
9160Sstevel@tonic-gate 	if (res.status != NFS4_OK) {
9170Sstevel@tonic-gate 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9180Sstevel@tonic-gate 		return;
9190Sstevel@tonic-gate 	}
9200Sstevel@tonic-gate 
9210Sstevel@tonic-gate 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
9220Sstevel@tonic-gate 
9230Sstevel@tonic-gate 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9240Sstevel@tonic-gate }
9250Sstevel@tonic-gate 
9260Sstevel@tonic-gate /*
9270Sstevel@tonic-gate  * Return either cached or remote attributes. If get remote attr
9280Sstevel@tonic-gate  * use them to check and invalidate caches, then cache the new attributes.
9290Sstevel@tonic-gate  */
9300Sstevel@tonic-gate int
nfs4getattr(vnode_t * vp,vattr_t * vap,cred_t * cr)9310Sstevel@tonic-gate nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
9320Sstevel@tonic-gate {
9330Sstevel@tonic-gate 	int error;
9340Sstevel@tonic-gate 	rnode4_t *rp;
9350Sstevel@tonic-gate 	nfs4_ga_res_t gar;
9360Sstevel@tonic-gate 
9370Sstevel@tonic-gate 	ASSERT(nfs4_consistent_type(vp));
9380Sstevel@tonic-gate 
9390Sstevel@tonic-gate 	/*
9400Sstevel@tonic-gate 	 * If we've got cached attributes, we're done, otherwise go
9410Sstevel@tonic-gate 	 * to the server to get attributes, which will update the cache
9425302Sth199096 	 * in the process. Either way, use the cached attributes for
9435302Sth199096 	 * the caller's vattr_t.
9445302Sth199096 	 *
9455302Sth199096 	 * Note that we ignore the gar set by the OTW call: the attr caching
9465302Sth199096 	 * code may make adjustments when storing to the rnode, and we want
9475302Sth199096 	 * to see those changes here.
9480Sstevel@tonic-gate 	 */
9490Sstevel@tonic-gate 	rp = VTOR4(vp);
9505302Sth199096 	error = 0;
9510Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
9525302Sth199096 	if (!ATTRCACHE4_VALID(vp)) {
9530Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
9545302Sth199096 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
9555302Sth199096 		mutex_enter(&rp->r_statelock);
9560Sstevel@tonic-gate 	}
9575302Sth199096 
9580Sstevel@tonic-gate 	if (!error)
9595302Sth199096 		*vap = rp->r_attr;
9600Sstevel@tonic-gate 
9610Sstevel@tonic-gate 	/* Return the client's view of file size */
9620Sstevel@tonic-gate 	vap->va_size = rp->r_size;
9635302Sth199096 
9640Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
9650Sstevel@tonic-gate 
9660Sstevel@tonic-gate 	ASSERT(nfs4_consistent_type(vp));
9670Sstevel@tonic-gate 
9680Sstevel@tonic-gate 	return (error);
9690Sstevel@tonic-gate }
9700Sstevel@tonic-gate 
9710Sstevel@tonic-gate int
nfs4_attr_otw(vnode_t * vp,nfs4_tag_type_t tag_type,nfs4_ga_res_t * garp,bitmap4 reqbitmap,cred_t * cr)9720Sstevel@tonic-gate nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
9735302Sth199096     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
9740Sstevel@tonic-gate {
9750Sstevel@tonic-gate 	COMPOUND4args_clnt args;
9760Sstevel@tonic-gate 	COMPOUND4res_clnt res;
9770Sstevel@tonic-gate 	int doqueue;
9780Sstevel@tonic-gate 	nfs_argop4 argop[2];
9790Sstevel@tonic-gate 	mntinfo4_t *mi = VTOMI4(vp);
9800Sstevel@tonic-gate 	bool_t needrecov = FALSE;
9810Sstevel@tonic-gate 	nfs4_recov_state_t recov_state;
9820Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9830Sstevel@tonic-gate 	nfs4_ga_ext_res_t *gerp;
9840Sstevel@tonic-gate 
9850Sstevel@tonic-gate 	recov_state.rs_flags = 0;
9860Sstevel@tonic-gate 	recov_state.rs_num_retry_despite_err = 0;
9870Sstevel@tonic-gate 
9880Sstevel@tonic-gate recov_retry:
9890Sstevel@tonic-gate 	args.ctag = tag_type;
9900Sstevel@tonic-gate 
9910Sstevel@tonic-gate 	args.array_len = 2;
9920Sstevel@tonic-gate 	args.array = argop;
9930Sstevel@tonic-gate 
9940Sstevel@tonic-gate 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
9950Sstevel@tonic-gate 	if (e.error)
9960Sstevel@tonic-gate 		return (e.error);
9970Sstevel@tonic-gate 
9980Sstevel@tonic-gate 	/* putfh */
9990Sstevel@tonic-gate 	argop[0].argop = OP_CPUTFH;
10000Sstevel@tonic-gate 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
10010Sstevel@tonic-gate 
10020Sstevel@tonic-gate 	/* getattr */
10030Sstevel@tonic-gate 	argop[1].argop = OP_GETATTR;
10040Sstevel@tonic-gate 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
10050Sstevel@tonic-gate 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
10060Sstevel@tonic-gate 
10070Sstevel@tonic-gate 	doqueue = 1;
10080Sstevel@tonic-gate 
10090Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
10100Sstevel@tonic-gate 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
10110Sstevel@tonic-gate 	    rnode4info(VTOR4(vp))));
10120Sstevel@tonic-gate 
10130Sstevel@tonic-gate 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
10140Sstevel@tonic-gate 
10150Sstevel@tonic-gate 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
10160Sstevel@tonic-gate 	if (!needrecov && e.error) {
10170Sstevel@tonic-gate 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
10185302Sth199096 		    needrecov);
10190Sstevel@tonic-gate 		return (e.error);
10200Sstevel@tonic-gate 	}
10210Sstevel@tonic-gate 
10220Sstevel@tonic-gate 	if (needrecov) {
10230Sstevel@tonic-gate 		bool_t abort;
10240Sstevel@tonic-gate 
10250Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
10260Sstevel@tonic-gate 		    "nfs4_attr_otw: initiating recovery\n"));
10270Sstevel@tonic-gate 
10280Sstevel@tonic-gate 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
102911291SRobert.Thurlow@Sun.COM 		    NULL, OP_GETATTR, NULL, NULL, NULL);
10300Sstevel@tonic-gate 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
10315302Sth199096 		    needrecov);
10320Sstevel@tonic-gate 		if (!e.error) {
10330Sstevel@tonic-gate 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
10340Sstevel@tonic-gate 			e.error = geterrno4(res.status);
10350Sstevel@tonic-gate 		}
10360Sstevel@tonic-gate 		if (abort == FALSE)
10370Sstevel@tonic-gate 			goto recov_retry;
10380Sstevel@tonic-gate 		return (e.error);
10390Sstevel@tonic-gate 	}
10400Sstevel@tonic-gate 
10410Sstevel@tonic-gate 	if (res.status) {
10420Sstevel@tonic-gate 		e.error = geterrno4(res.status);
10430Sstevel@tonic-gate 	} else {
10440Sstevel@tonic-gate 		gerp = garp->n4g_ext_res;
10450Sstevel@tonic-gate 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
10465302Sth199096 		    garp, sizeof (nfs4_ga_res_t));
10470Sstevel@tonic-gate 		garp->n4g_ext_res = gerp;
10480Sstevel@tonic-gate 		if (garp->n4g_ext_res &&
10490Sstevel@tonic-gate 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
10500Sstevel@tonic-gate 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
10515302Sth199096 			    ga_res.n4g_ext_res,
10525302Sth199096 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
10530Sstevel@tonic-gate 	}
10540Sstevel@tonic-gate 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
10550Sstevel@tonic-gate 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
10565302Sth199096 	    needrecov);
10570Sstevel@tonic-gate 	return (e.error);
10580Sstevel@tonic-gate }
10590Sstevel@tonic-gate 
/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */

/* Forward declarations for the async worker thread entry points. */
static void	nfs4_async_start(struct vfs *);
static void	nfs4_async_pgops_start(struct vfs *);
static void	nfs4_async_common_start(struct vfs *, int);
10710Sstevel@tonic-gate 
10720Sstevel@tonic-gate static void
free_async_args4(struct nfs4_async_reqs * args)10730Sstevel@tonic-gate free_async_args4(struct nfs4_async_reqs *args)
10740Sstevel@tonic-gate {
10750Sstevel@tonic-gate 	rnode4_t *rp;
10760Sstevel@tonic-gate 
10770Sstevel@tonic-gate 	if (args->a_io != NFS4_INACTIVE) {
10780Sstevel@tonic-gate 		rp = VTOR4(args->a_vp);
10790Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
10800Sstevel@tonic-gate 		rp->r_count--;
10810Sstevel@tonic-gate 		if (args->a_io == NFS4_PUTAPAGE ||
10820Sstevel@tonic-gate 		    args->a_io == NFS4_PAGEIO)
10830Sstevel@tonic-gate 			rp->r_awcount--;
10840Sstevel@tonic-gate 		cv_broadcast(&rp->r_cv);
10850Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
10860Sstevel@tonic-gate 		VN_RELE(args->a_vp);
10870Sstevel@tonic-gate 	}
10880Sstevel@tonic-gate 	crfree(args->a_cred);
10890Sstevel@tonic-gate 	kmem_free(args, sizeof (*args));
10900Sstevel@tonic-gate }
10910Sstevel@tonic-gate 
10920Sstevel@tonic-gate /*
10930Sstevel@tonic-gate  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
10940Sstevel@tonic-gate  * pageout(), running in the global zone, have legitimate reasons to do
10950Sstevel@tonic-gate  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
10960Sstevel@tonic-gate  * use of a a per-mount "asynchronous requests manager thread" which is
10970Sstevel@tonic-gate  * signaled by the various asynchronous work routines when there is
10980Sstevel@tonic-gate  * asynchronous work to be done.  It is responsible for creating new
10990Sstevel@tonic-gate  * worker threads if necessary, and notifying existing worker threads
11000Sstevel@tonic-gate  * that there is work to be done.
11010Sstevel@tonic-gate  *
11020Sstevel@tonic-gate  * In other words, it will "take the specifications from the customers and
11030Sstevel@tonic-gate  * give them to the engineers."
11040Sstevel@tonic-gate  *
11050Sstevel@tonic-gate  * Worker threads die off of their own accord if they are no longer
11060Sstevel@tonic-gate  * needed.
11070Sstevel@tonic-gate  *
11080Sstevel@tonic-gate  * This thread is killed when the zone is going away or the filesystem
11090Sstevel@tonic-gate  * is being unmounted.
11100Sstevel@tonic-gate  */
11110Sstevel@tonic-gate void
nfs4_async_manager(vfs_t * vfsp)11120Sstevel@tonic-gate nfs4_async_manager(vfs_t *vfsp)
11130Sstevel@tonic-gate {
11140Sstevel@tonic-gate 	callb_cpr_t cprinfo;
11150Sstevel@tonic-gate 	mntinfo4_t *mi;
11160Sstevel@tonic-gate 	uint_t max_threads;
11170Sstevel@tonic-gate 
11180Sstevel@tonic-gate 	mi = VFTOMI4(vfsp);
11190Sstevel@tonic-gate 
11200Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
11215302Sth199096 	    "nfs4_async_manager");
11220Sstevel@tonic-gate 
11230Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
11240Sstevel@tonic-gate 	/*
11250Sstevel@tonic-gate 	 * We want to stash the max number of threads that this mount was
11260Sstevel@tonic-gate 	 * allowed so we can use it later when the variable is set to zero as
11270Sstevel@tonic-gate 	 * part of the zone/mount going away.
11280Sstevel@tonic-gate 	 *
11290Sstevel@tonic-gate 	 * We want to be able to create at least one thread to handle
113011286SMarcel.Telka@Sun.COM 	 * asynchronous inactive calls.
11310Sstevel@tonic-gate 	 */
11320Sstevel@tonic-gate 	max_threads = MAX(mi->mi_max_threads, 1);
11330Sstevel@tonic-gate 	/*
11340Sstevel@tonic-gate 	 * We don't want to wait for mi_max_threads to go to zero, since that
11350Sstevel@tonic-gate 	 * happens as part of a failed unmount, but this thread should only
11360Sstevel@tonic-gate 	 * exit when the mount is really going away.
11370Sstevel@tonic-gate 	 *
11380Sstevel@tonic-gate 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
11390Sstevel@tonic-gate 	 * attempted: the various _async_*() functions know to do things
11400Sstevel@tonic-gate 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
11410Sstevel@tonic-gate 	 * outstanding requests.
11420Sstevel@tonic-gate 	 *
11430Sstevel@tonic-gate 	 * Note that we still create zthreads even if we notice the zone is
11440Sstevel@tonic-gate 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
11450Sstevel@tonic-gate 	 * shutdown sequence to take slightly longer in some cases, but
11460Sstevel@tonic-gate 	 * doesn't violate the protocol, as all threads will exit as soon as
11470Sstevel@tonic-gate 	 * they're done processing the remaining requests.
11480Sstevel@tonic-gate 	 */
114911286SMarcel.Telka@Sun.COM 	for (;;) {
11500Sstevel@tonic-gate 		while (mi->mi_async_req_count > 0) {
11510Sstevel@tonic-gate 			/*
11520Sstevel@tonic-gate 			 * Paranoia: If the mount started out having
11530Sstevel@tonic-gate 			 * (mi->mi_max_threads == 0), and the value was
11540Sstevel@tonic-gate 			 * later changed (via a debugger or somesuch),
11550Sstevel@tonic-gate 			 * we could be confused since we will think we
11560Sstevel@tonic-gate 			 * can't create any threads, and the calling
11570Sstevel@tonic-gate 			 * code (which looks at the current value of
11580Sstevel@tonic-gate 			 * mi->mi_max_threads, now non-zero) thinks we
11590Sstevel@tonic-gate 			 * can.
11600Sstevel@tonic-gate 			 *
11610Sstevel@tonic-gate 			 * So, because we're paranoid, we create threads
11620Sstevel@tonic-gate 			 * up to the maximum of the original and the
11630Sstevel@tonic-gate 			 * current value. This means that future
11640Sstevel@tonic-gate 			 * (debugger-induced) alterations of
11650Sstevel@tonic-gate 			 * mi->mi_max_threads are ignored for our
11660Sstevel@tonic-gate 			 * purposes, but who told them they could change
11670Sstevel@tonic-gate 			 * random values on a live kernel anyhow?
11680Sstevel@tonic-gate 			 */
116911507SVallish.Vaidyeshwara@Sun.COM 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
11700Sstevel@tonic-gate 			    MAX(mi->mi_max_threads, max_threads)) {
117111507SVallish.Vaidyeshwara@Sun.COM 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
11720Sstevel@tonic-gate 				mutex_exit(&mi->mi_async_lock);
11731705Sjwahlig 				MI4_HOLD(mi);
11740Sstevel@tonic-gate 				VFS_HOLD(vfsp);	/* hold for new thread */
11750Sstevel@tonic-gate 				(void) zthread_create(NULL, 0, nfs4_async_start,
11760Sstevel@tonic-gate 				    vfsp, 0, minclsyspri);
11770Sstevel@tonic-gate 				mutex_enter(&mi->mi_async_lock);
117811507SVallish.Vaidyeshwara@Sun.COM 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
117911507SVallish.Vaidyeshwara@Sun.COM 			    NUM_ASYNC_PGOPS_THREADS) {
118011507SVallish.Vaidyeshwara@Sun.COM 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
118111507SVallish.Vaidyeshwara@Sun.COM 				mutex_exit(&mi->mi_async_lock);
118211507SVallish.Vaidyeshwara@Sun.COM 				MI4_HOLD(mi);
118311507SVallish.Vaidyeshwara@Sun.COM 				VFS_HOLD(vfsp); /* hold for new thread */
118411507SVallish.Vaidyeshwara@Sun.COM 				(void) zthread_create(NULL, 0,
118511507SVallish.Vaidyeshwara@Sun.COM 				    nfs4_async_pgops_start, vfsp, 0,
118611507SVallish.Vaidyeshwara@Sun.COM 				    minclsyspri);
118711507SVallish.Vaidyeshwara@Sun.COM 				mutex_enter(&mi->mi_async_lock);
11880Sstevel@tonic-gate 			}
118911507SVallish.Vaidyeshwara@Sun.COM 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
11900Sstevel@tonic-gate 			ASSERT(mi->mi_async_req_count != 0);
11910Sstevel@tonic-gate 			mi->mi_async_req_count--;
11920Sstevel@tonic-gate 		}
119311286SMarcel.Telka@Sun.COM 
11940Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
119511286SMarcel.Telka@Sun.COM 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
119611286SMarcel.Telka@Sun.COM 			mutex_exit(&mi->mi_lock);
119711286SMarcel.Telka@Sun.COM 			break;
119811286SMarcel.Telka@Sun.COM 		}
119911286SMarcel.Telka@Sun.COM 		mutex_exit(&mi->mi_lock);
120011286SMarcel.Telka@Sun.COM 
120111286SMarcel.Telka@Sun.COM 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
120211286SMarcel.Telka@Sun.COM 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
120311286SMarcel.Telka@Sun.COM 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
12040Sstevel@tonic-gate 	}
12050Sstevel@tonic-gate 
12060Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
12070Sstevel@tonic-gate 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
12080Sstevel@tonic-gate 	/*
12090Sstevel@tonic-gate 	 * Let everyone know we're done.
12100Sstevel@tonic-gate 	 */
12110Sstevel@tonic-gate 	mi->mi_manager_thread = NULL;
12120Sstevel@tonic-gate 	/*
12130Sstevel@tonic-gate 	 * Wake up the inactive thread.
12140Sstevel@tonic-gate 	 */
12150Sstevel@tonic-gate 	cv_broadcast(&mi->mi_inact_req_cv);
12160Sstevel@tonic-gate 	/*
12170Sstevel@tonic-gate 	 * Wake up anyone sitting in nfs4_async_manager_stop()
12180Sstevel@tonic-gate 	 */
12190Sstevel@tonic-gate 	cv_broadcast(&mi->mi_async_cv);
12200Sstevel@tonic-gate 	/*
12210Sstevel@tonic-gate 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
12220Sstevel@tonic-gate 	 * since CALLB_CPR_EXIT is actually responsible for releasing
12230Sstevel@tonic-gate 	 * 'mi_async_lock'.
12240Sstevel@tonic-gate 	 */
12250Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cprinfo);
12260Sstevel@tonic-gate 	VFS_RELE(vfsp);	/* release thread's hold */
12271705Sjwahlig 	MI4_RELE(mi);
12280Sstevel@tonic-gate 	zthread_exit();
12290Sstevel@tonic-gate }
12300Sstevel@tonic-gate 
12310Sstevel@tonic-gate /*
12320Sstevel@tonic-gate  * Signal (and wait for) the async manager thread to clean up and go away.
12330Sstevel@tonic-gate  */
12340Sstevel@tonic-gate void
nfs4_async_manager_stop(vfs_t * vfsp)12350Sstevel@tonic-gate nfs4_async_manager_stop(vfs_t *vfsp)
12360Sstevel@tonic-gate {
12370Sstevel@tonic-gate 	mntinfo4_t *mi = VFTOMI4(vfsp);
12380Sstevel@tonic-gate 
12390Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
12400Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
12410Sstevel@tonic-gate 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
12420Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
12430Sstevel@tonic-gate 	cv_broadcast(&mi->mi_async_reqs_cv);
12440Sstevel@tonic-gate 	/*
12450Sstevel@tonic-gate 	 * Wait for the async manager thread to die.
12460Sstevel@tonic-gate 	 */
12470Sstevel@tonic-gate 	while (mi->mi_manager_thread != NULL)
12480Sstevel@tonic-gate 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
12490Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
12500Sstevel@tonic-gate }
12510Sstevel@tonic-gate 
12520Sstevel@tonic-gate int
nfs4_async_readahead(vnode_t * vp,u_offset_t blkoff,caddr_t addr,struct seg * seg,cred_t * cr,void (* readahead)(vnode_t *,u_offset_t,caddr_t,struct seg *,cred_t *))12530Sstevel@tonic-gate nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
12545302Sth199096     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
12555302Sth199096     u_offset_t, caddr_t, struct seg *, cred_t *))
12560Sstevel@tonic-gate {
12570Sstevel@tonic-gate 	rnode4_t *rp;
12580Sstevel@tonic-gate 	mntinfo4_t *mi;
12590Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
12600Sstevel@tonic-gate 
12610Sstevel@tonic-gate 	rp = VTOR4(vp);
12620Sstevel@tonic-gate 	ASSERT(rp->r_freef == NULL);
12630Sstevel@tonic-gate 
12640Sstevel@tonic-gate 	mi = VTOMI4(vp);
12650Sstevel@tonic-gate 
12660Sstevel@tonic-gate 	/*
12670Sstevel@tonic-gate 	 * If addr falls in a different segment, don't bother doing readahead.
12680Sstevel@tonic-gate 	 */
12690Sstevel@tonic-gate 	if (addr >= seg->s_base + seg->s_size)
12700Sstevel@tonic-gate 		return (-1);
12710Sstevel@tonic-gate 
12720Sstevel@tonic-gate 	/*
12730Sstevel@tonic-gate 	 * If we can't allocate a request structure, punt on the readahead.
12740Sstevel@tonic-gate 	 */
12750Sstevel@tonic-gate 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
12760Sstevel@tonic-gate 		return (-1);
12770Sstevel@tonic-gate 
12780Sstevel@tonic-gate 	/*
12790Sstevel@tonic-gate 	 * If a lock operation is pending, don't initiate any new
12800Sstevel@tonic-gate 	 * readaheads.  Otherwise, bump r_count to indicate the new
12810Sstevel@tonic-gate 	 * asynchronous I/O.
12820Sstevel@tonic-gate 	 */
12830Sstevel@tonic-gate 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
12840Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
12850Sstevel@tonic-gate 		return (-1);
12860Sstevel@tonic-gate 	}
12870Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
12880Sstevel@tonic-gate 	rp->r_count++;
12890Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
12900Sstevel@tonic-gate 	nfs_rw_exit(&rp->r_lkserlock);
12910Sstevel@tonic-gate 
12920Sstevel@tonic-gate 	args->a_next = NULL;
12930Sstevel@tonic-gate #ifdef DEBUG
12940Sstevel@tonic-gate 	args->a_queuer = curthread;
12950Sstevel@tonic-gate #endif
12960Sstevel@tonic-gate 	VN_HOLD(vp);
12970Sstevel@tonic-gate 	args->a_vp = vp;
12980Sstevel@tonic-gate 	ASSERT(cr != NULL);
12990Sstevel@tonic-gate 	crhold(cr);
13000Sstevel@tonic-gate 	args->a_cred = cr;
13010Sstevel@tonic-gate 	args->a_io = NFS4_READ_AHEAD;
13020Sstevel@tonic-gate 	args->a_nfs4_readahead = readahead;
13030Sstevel@tonic-gate 	args->a_nfs4_blkoff = blkoff;
13040Sstevel@tonic-gate 	args->a_nfs4_seg = seg;
13050Sstevel@tonic-gate 	args->a_nfs4_addr = addr;
13060Sstevel@tonic-gate 
13070Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
13080Sstevel@tonic-gate 
13090Sstevel@tonic-gate 	/*
13100Sstevel@tonic-gate 	 * If asyncio has been disabled, don't bother readahead.
13110Sstevel@tonic-gate 	 */
13120Sstevel@tonic-gate 	if (mi->mi_max_threads == 0) {
13130Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
13140Sstevel@tonic-gate 		goto noasync;
13150Sstevel@tonic-gate 	}
13160Sstevel@tonic-gate 
13170Sstevel@tonic-gate 	/*
13180Sstevel@tonic-gate 	 * Link request structure into the async list and
13190Sstevel@tonic-gate 	 * wakeup async thread to do the i/o.
13200Sstevel@tonic-gate 	 */
13210Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
13220Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
13230Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
13240Sstevel@tonic-gate 	} else {
13250Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
13260Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
13270Sstevel@tonic-gate 	}
13280Sstevel@tonic-gate 
13290Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
13300Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
13310Sstevel@tonic-gate 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
13320Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
13330Sstevel@tonic-gate 	}
13340Sstevel@tonic-gate 
13350Sstevel@tonic-gate 	mi->mi_async_req_count++;
13360Sstevel@tonic-gate 	ASSERT(mi->mi_async_req_count != 0);
13370Sstevel@tonic-gate 	cv_signal(&mi->mi_async_reqs_cv);
13380Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
13390Sstevel@tonic-gate 	return (0);
13400Sstevel@tonic-gate 
13410Sstevel@tonic-gate noasync:
13420Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
13430Sstevel@tonic-gate 	rp->r_count--;
13440Sstevel@tonic-gate 	cv_broadcast(&rp->r_cv);
13450Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
13460Sstevel@tonic-gate 	VN_RELE(vp);
13470Sstevel@tonic-gate 	crfree(cr);
13480Sstevel@tonic-gate 	kmem_free(args, sizeof (*args));
13490Sstevel@tonic-gate 	return (-1);
13500Sstevel@tonic-gate }
13510Sstevel@tonic-gate 
135211507SVallish.Vaidyeshwara@Sun.COM static void
nfs4_async_start(struct vfs * vfsp)135311507SVallish.Vaidyeshwara@Sun.COM nfs4_async_start(struct vfs *vfsp)
135411507SVallish.Vaidyeshwara@Sun.COM {
135511507SVallish.Vaidyeshwara@Sun.COM 	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
135611507SVallish.Vaidyeshwara@Sun.COM }
135711507SVallish.Vaidyeshwara@Sun.COM 
135811507SVallish.Vaidyeshwara@Sun.COM static void
nfs4_async_pgops_start(struct vfs * vfsp)135911507SVallish.Vaidyeshwara@Sun.COM nfs4_async_pgops_start(struct vfs *vfsp)
136011507SVallish.Vaidyeshwara@Sun.COM {
136111507SVallish.Vaidyeshwara@Sun.COM 	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
136211507SVallish.Vaidyeshwara@Sun.COM }
136311507SVallish.Vaidyeshwara@Sun.COM 
13640Sstevel@tonic-gate /*
13650Sstevel@tonic-gate  * The async queues for each mounted file system are arranged as a
13660Sstevel@tonic-gate  * set of queues, one for each async i/o type.  Requests are taken
13670Sstevel@tonic-gate  * from the queues in a round-robin fashion.  A number of consecutive
13680Sstevel@tonic-gate  * requests are taken from each queue before moving on to the next
13690Sstevel@tonic-gate  * queue.  This functionality may allow the NFS Version 2 server to do
13700Sstevel@tonic-gate  * write clustering, even if the client is mixing writes and reads
13710Sstevel@tonic-gate  * because it will take multiple write requests from the queue
13720Sstevel@tonic-gate  * before processing any of the other async i/o types.
13730Sstevel@tonic-gate  *
137411507SVallish.Vaidyeshwara@Sun.COM  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
13750Sstevel@tonic-gate  * model defined by cpr to suspend the system. Specifically over the
13760Sstevel@tonic-gate  * wire calls are cpr-unsafe. The thread should be reevaluated in
13770Sstevel@tonic-gate  * case of future updates to the cpr model.
13780Sstevel@tonic-gate  */
13790Sstevel@tonic-gate static void
nfs4_async_common_start(struct vfs * vfsp,int async_queue)138011507SVallish.Vaidyeshwara@Sun.COM nfs4_async_common_start(struct vfs *vfsp, int async_queue)
13810Sstevel@tonic-gate {
13820Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
13830Sstevel@tonic-gate 	mntinfo4_t *mi = VFTOMI4(vfsp);
13840Sstevel@tonic-gate 	clock_t time_left = 1;
13850Sstevel@tonic-gate 	callb_cpr_t cprinfo;
13860Sstevel@tonic-gate 	int i;
13870Sstevel@tonic-gate 	extern int nfs_async_timeout;
138811507SVallish.Vaidyeshwara@Sun.COM 	int async_types;
138911507SVallish.Vaidyeshwara@Sun.COM 	kcondvar_t *async_work_cv;
139011507SVallish.Vaidyeshwara@Sun.COM 
139111507SVallish.Vaidyeshwara@Sun.COM 	if (async_queue == NFS4_ASYNC_QUEUE) {
139211507SVallish.Vaidyeshwara@Sun.COM 		async_types = NFS4_ASYNC_TYPES;
139311507SVallish.Vaidyeshwara@Sun.COM 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
139411507SVallish.Vaidyeshwara@Sun.COM 	} else {
139511507SVallish.Vaidyeshwara@Sun.COM 		async_types = NFS4_ASYNC_PGOPS_TYPES;
139611507SVallish.Vaidyeshwara@Sun.COM 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
139711507SVallish.Vaidyeshwara@Sun.COM 	}
13980Sstevel@tonic-gate 
13990Sstevel@tonic-gate 	/*
14000Sstevel@tonic-gate 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
14010Sstevel@tonic-gate 	 * built in an implementation independent manner.
14020Sstevel@tonic-gate 	 */
14030Sstevel@tonic-gate 	if (nfs_async_timeout == -1)
14040Sstevel@tonic-gate 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
14050Sstevel@tonic-gate 
14060Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
14070Sstevel@tonic-gate 
14080Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
14090Sstevel@tonic-gate 	for (;;) {
14100Sstevel@tonic-gate 		/*
14110Sstevel@tonic-gate 		 * Find the next queue containing an entry.  We start
14120Sstevel@tonic-gate 		 * at the current queue pointer and then round robin
14130Sstevel@tonic-gate 		 * through all of them until we either find a non-empty
14140Sstevel@tonic-gate 		 * queue or have looked through all of them.
14150Sstevel@tonic-gate 		 */
141611507SVallish.Vaidyeshwara@Sun.COM 		for (i = 0; i < async_types; i++) {
141711507SVallish.Vaidyeshwara@Sun.COM 			args = *mi->mi_async_curr[async_queue];
14180Sstevel@tonic-gate 			if (args != NULL)
14190Sstevel@tonic-gate 				break;
142011507SVallish.Vaidyeshwara@Sun.COM 			mi->mi_async_curr[async_queue]++;
142111507SVallish.Vaidyeshwara@Sun.COM 			if (mi->mi_async_curr[async_queue] ==
142211507SVallish.Vaidyeshwara@Sun.COM 			    &mi->mi_async_reqs[async_types]) {
142311507SVallish.Vaidyeshwara@Sun.COM 				mi->mi_async_curr[async_queue] =
142411507SVallish.Vaidyeshwara@Sun.COM 				    &mi->mi_async_reqs[0];
142511507SVallish.Vaidyeshwara@Sun.COM 			}
14260Sstevel@tonic-gate 		}
14270Sstevel@tonic-gate 		/*
14280Sstevel@tonic-gate 		 * If we didn't find a entry, then block until woken up
14290Sstevel@tonic-gate 		 * again and then look through the queues again.
14300Sstevel@tonic-gate 		 */
14310Sstevel@tonic-gate 		if (args == NULL) {
14320Sstevel@tonic-gate 			/*
14330Sstevel@tonic-gate 			 * Exiting is considered to be safe for CPR as well
14340Sstevel@tonic-gate 			 */
14350Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
14360Sstevel@tonic-gate 
14370Sstevel@tonic-gate 			/*
14380Sstevel@tonic-gate 			 * Wakeup thread waiting to unmount the file
14390Sstevel@tonic-gate 			 * system only if all async threads are inactive.
14400Sstevel@tonic-gate 			 *
14410Sstevel@tonic-gate 			 * If we've timed-out and there's nothing to do,
14420Sstevel@tonic-gate 			 * then get rid of this thread.
14430Sstevel@tonic-gate 			 */
14440Sstevel@tonic-gate 			if (mi->mi_max_threads == 0 || time_left <= 0) {
144511507SVallish.Vaidyeshwara@Sun.COM 				--mi->mi_threads[async_queue];
144611507SVallish.Vaidyeshwara@Sun.COM 
144711507SVallish.Vaidyeshwara@Sun.COM 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
144811507SVallish.Vaidyeshwara@Sun.COM 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
14490Sstevel@tonic-gate 					cv_signal(&mi->mi_async_cv);
14500Sstevel@tonic-gate 				CALLB_CPR_EXIT(&cprinfo);
14510Sstevel@tonic-gate 				VFS_RELE(vfsp);	/* release thread's hold */
14521705Sjwahlig 				MI4_RELE(mi);
14530Sstevel@tonic-gate 				zthread_exit();
14540Sstevel@tonic-gate 				/* NOTREACHED */
14550Sstevel@tonic-gate 			}
145611507SVallish.Vaidyeshwara@Sun.COM 			time_left = cv_reltimedwait(async_work_cv,
145711066Srafael.vanoni@sun.com 			    &mi->mi_async_lock, nfs_async_timeout,
145811066Srafael.vanoni@sun.com 			    TR_CLOCK_TICK);
14590Sstevel@tonic-gate 
14600Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
14610Sstevel@tonic-gate 
14620Sstevel@tonic-gate 			continue;
14630Sstevel@tonic-gate 		} else {
14640Sstevel@tonic-gate 			time_left = 1;
14650Sstevel@tonic-gate 		}
14660Sstevel@tonic-gate 
14670Sstevel@tonic-gate 		/*
14680Sstevel@tonic-gate 		 * Remove the request from the async queue and then
14690Sstevel@tonic-gate 		 * update the current async request queue pointer.  If
14700Sstevel@tonic-gate 		 * the current queue is empty or we have removed enough
14710Sstevel@tonic-gate 		 * consecutive entries from it, then reset the counter
14720Sstevel@tonic-gate 		 * for this queue and then move the current pointer to
14730Sstevel@tonic-gate 		 * the next queue.
14740Sstevel@tonic-gate 		 */
147511507SVallish.Vaidyeshwara@Sun.COM 		*mi->mi_async_curr[async_queue] = args->a_next;
147611507SVallish.Vaidyeshwara@Sun.COM 		if (*mi->mi_async_curr[async_queue] == NULL ||
14770Sstevel@tonic-gate 		    --mi->mi_async_clusters[args->a_io] == 0) {
14780Sstevel@tonic-gate 			mi->mi_async_clusters[args->a_io] =
14795302Sth199096 			    mi->mi_async_init_clusters;
148011507SVallish.Vaidyeshwara@Sun.COM 			mi->mi_async_curr[async_queue]++;
148111507SVallish.Vaidyeshwara@Sun.COM 			if (mi->mi_async_curr[async_queue] ==
148211507SVallish.Vaidyeshwara@Sun.COM 			    &mi->mi_async_reqs[async_types]) {
148311507SVallish.Vaidyeshwara@Sun.COM 				mi->mi_async_curr[async_queue] =
148411507SVallish.Vaidyeshwara@Sun.COM 				    &mi->mi_async_reqs[0];
148511507SVallish.Vaidyeshwara@Sun.COM 			}
14860Sstevel@tonic-gate 		}
14870Sstevel@tonic-gate 
14880Sstevel@tonic-gate 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
14890Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
14900Sstevel@tonic-gate 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
14910Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14920Sstevel@tonic-gate 		}
14930Sstevel@tonic-gate 
14940Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
14950Sstevel@tonic-gate 
14960Sstevel@tonic-gate 		/*
14970Sstevel@tonic-gate 		 * Obtain arguments from the async request structure.
14980Sstevel@tonic-gate 		 */
14990Sstevel@tonic-gate 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
15000Sstevel@tonic-gate 			(*args->a_nfs4_readahead)(args->a_vp,
15015302Sth199096 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
15025302Sth199096 			    args->a_nfs4_seg, args->a_cred);
15030Sstevel@tonic-gate 		} else if (args->a_io == NFS4_PUTAPAGE) {
15040Sstevel@tonic-gate 			(void) (*args->a_nfs4_putapage)(args->a_vp,
15055302Sth199096 			    args->a_nfs4_pp, args->a_nfs4_off,
15065302Sth199096 			    args->a_nfs4_len, args->a_nfs4_flags,
15075302Sth199096 			    args->a_cred);
15080Sstevel@tonic-gate 		} else if (args->a_io == NFS4_PAGEIO) {
15090Sstevel@tonic-gate 			(void) (*args->a_nfs4_pageio)(args->a_vp,
15105302Sth199096 			    args->a_nfs4_pp, args->a_nfs4_off,
15115302Sth199096 			    args->a_nfs4_len, args->a_nfs4_flags,
15125302Sth199096 			    args->a_cred);
15130Sstevel@tonic-gate 		} else if (args->a_io == NFS4_READDIR) {
15140Sstevel@tonic-gate 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
15155302Sth199096 			    args->a_nfs4_rdc, args->a_cred));
15160Sstevel@tonic-gate 		} else if (args->a_io == NFS4_COMMIT) {
15170Sstevel@tonic-gate 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
15185302Sth199096 			    args->a_nfs4_offset, args->a_nfs4_count,
15195302Sth199096 			    args->a_cred);
15200Sstevel@tonic-gate 		} else if (args->a_io == NFS4_INACTIVE) {
15210Sstevel@tonic-gate 			nfs4_inactive_otw(args->a_vp, args->a_cred);
15220Sstevel@tonic-gate 		}
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate 		/*
15250Sstevel@tonic-gate 		 * Now, release the vnode and free the credentials
15260Sstevel@tonic-gate 		 * structure.
15270Sstevel@tonic-gate 		 */
15280Sstevel@tonic-gate 		free_async_args4(args);
15290Sstevel@tonic-gate 		/*
15300Sstevel@tonic-gate 		 * Reacquire the mutex because it will be needed above.
15310Sstevel@tonic-gate 		 */
15320Sstevel@tonic-gate 		mutex_enter(&mi->mi_async_lock);
15330Sstevel@tonic-gate 	}
15340Sstevel@tonic-gate }
15350Sstevel@tonic-gate 
15360Sstevel@tonic-gate /*
15370Sstevel@tonic-gate  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
15380Sstevel@tonic-gate  * part of VOP_INACTIVE.
15390Sstevel@tonic-gate  */
15400Sstevel@tonic-gate 
15410Sstevel@tonic-gate void
nfs4_inactive_thread(mntinfo4_t * mi)15420Sstevel@tonic-gate nfs4_inactive_thread(mntinfo4_t *mi)
15430Sstevel@tonic-gate {
15440Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
15450Sstevel@tonic-gate 	callb_cpr_t cprinfo;
15460Sstevel@tonic-gate 	vfs_t *vfsp = mi->mi_vfsp;
15470Sstevel@tonic-gate 
15480Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
15495302Sth199096 	    "nfs4_inactive_thread");
15500Sstevel@tonic-gate 
15510Sstevel@tonic-gate 	for (;;) {
15520Sstevel@tonic-gate 		mutex_enter(&mi->mi_async_lock);
15530Sstevel@tonic-gate 		args = mi->mi_async_reqs[NFS4_INACTIVE];
15540Sstevel@tonic-gate 		if (args == NULL) {
15550Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
15560Sstevel@tonic-gate 			/*
15571705Sjwahlig 			 * We don't want to exit until the async manager is done
15580Sstevel@tonic-gate 			 * with its work; hence the check for mi_manager_thread
15590Sstevel@tonic-gate 			 * being NULL.
15600Sstevel@tonic-gate 			 *
15610Sstevel@tonic-gate 			 * The async manager thread will cv_broadcast() on
15620Sstevel@tonic-gate 			 * mi_inact_req_cv when it's done, at which point we'll
15630Sstevel@tonic-gate 			 * wake up and exit.
15640Sstevel@tonic-gate 			 */
15651705Sjwahlig 			if (mi->mi_manager_thread == NULL)
15660Sstevel@tonic-gate 				goto die;
15670Sstevel@tonic-gate 			mi->mi_flags |= MI4_INACTIVE_IDLE;
15680Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
15690Sstevel@tonic-gate 			cv_signal(&mi->mi_async_cv);
15700Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
15710Sstevel@tonic-gate 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
15720Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
15730Sstevel@tonic-gate 			mutex_exit(&mi->mi_async_lock);
15740Sstevel@tonic-gate 		} else {
15750Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
15760Sstevel@tonic-gate 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
15770Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
15780Sstevel@tonic-gate 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
15790Sstevel@tonic-gate 			mutex_exit(&mi->mi_async_lock);
15800Sstevel@tonic-gate 			nfs4_inactive_otw(args->a_vp, args->a_cred);
15810Sstevel@tonic-gate 			crfree(args->a_cred);
15820Sstevel@tonic-gate 			kmem_free(args, sizeof (*args));
15830Sstevel@tonic-gate 		}
15840Sstevel@tonic-gate 	}
15850Sstevel@tonic-gate die:
15860Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
15870Sstevel@tonic-gate 	mi->mi_inactive_thread = NULL;
15880Sstevel@tonic-gate 	cv_signal(&mi->mi_async_cv);
15891705Sjwahlig 
15900Sstevel@tonic-gate 	/*
15910Sstevel@tonic-gate 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
15920Sstevel@tonic-gate 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
15930Sstevel@tonic-gate 	 */
15940Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cprinfo);
15951705Sjwahlig 
15960Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
15970Sstevel@tonic-gate 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
15981705Sjwahlig 
15991705Sjwahlig 	MI4_RELE(mi);
16000Sstevel@tonic-gate 	zthread_exit();
16010Sstevel@tonic-gate 	/* NOTREACHED */
16020Sstevel@tonic-gate }
16030Sstevel@tonic-gate 
16040Sstevel@tonic-gate /*
16050Sstevel@tonic-gate  * nfs_async_stop:
16060Sstevel@tonic-gate  * Wait for all outstanding putpage operations and the inactive thread to
16070Sstevel@tonic-gate  * complete; nfs4_async_stop_sig() without interruptibility.
16080Sstevel@tonic-gate  */
16090Sstevel@tonic-gate void
nfs4_async_stop(struct vfs * vfsp)16100Sstevel@tonic-gate nfs4_async_stop(struct vfs *vfsp)
16110Sstevel@tonic-gate {
16120Sstevel@tonic-gate 	mntinfo4_t *mi = VFTOMI4(vfsp);
16130Sstevel@tonic-gate 
16140Sstevel@tonic-gate 	/*
16150Sstevel@tonic-gate 	 * Wait for all outstanding async operations to complete and for
16160Sstevel@tonic-gate 	 * worker threads to exit.
16170Sstevel@tonic-gate 	 */
16180Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
16190Sstevel@tonic-gate 	mi->mi_max_threads = 0;
162011507SVallish.Vaidyeshwara@Sun.COM 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
162111507SVallish.Vaidyeshwara@Sun.COM 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
162211507SVallish.Vaidyeshwara@Sun.COM 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
16230Sstevel@tonic-gate 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
16240Sstevel@tonic-gate 
16250Sstevel@tonic-gate 	/*
16260Sstevel@tonic-gate 	 * Wait for the inactive thread to finish doing what it's doing.  It
16270Sstevel@tonic-gate 	 * won't exit until the last reference to the vfs_t goes away.
16280Sstevel@tonic-gate 	 */
16290Sstevel@tonic-gate 	if (mi->mi_inactive_thread != NULL) {
16300Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
16310Sstevel@tonic-gate 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
16320Sstevel@tonic-gate 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
16330Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
16340Sstevel@tonic-gate 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
16350Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
16360Sstevel@tonic-gate 		}
16370Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
16380Sstevel@tonic-gate 	}
16390Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
16400Sstevel@tonic-gate }
16410Sstevel@tonic-gate 
16420Sstevel@tonic-gate /*
16430Sstevel@tonic-gate  * nfs_async_stop_sig:
16440Sstevel@tonic-gate  * Wait for all outstanding putpage operations and the inactive thread to
16450Sstevel@tonic-gate  * complete. If a signal is delivered we will abort and return non-zero;
16460Sstevel@tonic-gate  * otherwise return 0. Since this routine is called from nfs4_unmount, we
16475331Samw  * need to make it interruptible.
16480Sstevel@tonic-gate  */
16490Sstevel@tonic-gate int
nfs4_async_stop_sig(struct vfs * vfsp)16500Sstevel@tonic-gate nfs4_async_stop_sig(struct vfs *vfsp)
16510Sstevel@tonic-gate {
16520Sstevel@tonic-gate 	mntinfo4_t *mi = VFTOMI4(vfsp);
16530Sstevel@tonic-gate 	ushort_t omax;
16540Sstevel@tonic-gate 	bool_t intr = FALSE;
16550Sstevel@tonic-gate 
16560Sstevel@tonic-gate 	/*
16570Sstevel@tonic-gate 	 * Wait for all outstanding putpage operations to complete and for
16580Sstevel@tonic-gate 	 * worker threads to exit.
16590Sstevel@tonic-gate 	 */
16600Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
16610Sstevel@tonic-gate 	omax = mi->mi_max_threads;
16620Sstevel@tonic-gate 	mi->mi_max_threads = 0;
166311507SVallish.Vaidyeshwara@Sun.COM 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
166411507SVallish.Vaidyeshwara@Sun.COM 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
166511507SVallish.Vaidyeshwara@Sun.COM 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
16660Sstevel@tonic-gate 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
16670Sstevel@tonic-gate 			intr = TRUE;
16680Sstevel@tonic-gate 			goto interrupted;
16690Sstevel@tonic-gate 		}
16700Sstevel@tonic-gate 	}
16710Sstevel@tonic-gate 
16720Sstevel@tonic-gate 	/*
16730Sstevel@tonic-gate 	 * Wait for the inactive thread to finish doing what it's doing.  It
16740Sstevel@tonic-gate 	 * won't exit until the a last reference to the vfs_t goes away.
16750Sstevel@tonic-gate 	 */
16760Sstevel@tonic-gate 	if (mi->mi_inactive_thread != NULL) {
16770Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
16780Sstevel@tonic-gate 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
16790Sstevel@tonic-gate 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
16800Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
16810Sstevel@tonic-gate 			if (!cv_wait_sig(&mi->mi_async_cv,
16820Sstevel@tonic-gate 			    &mi->mi_async_lock)) {
16830Sstevel@tonic-gate 				intr = TRUE;
16840Sstevel@tonic-gate 				goto interrupted;
16850Sstevel@tonic-gate 			}
16860Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
16870Sstevel@tonic-gate 		}
16880Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
16890Sstevel@tonic-gate 	}
16900Sstevel@tonic-gate interrupted:
16910Sstevel@tonic-gate 	if (intr)
16920Sstevel@tonic-gate 		mi->mi_max_threads = omax;
16930Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
16940Sstevel@tonic-gate 
16950Sstevel@tonic-gate 	return (intr);
16960Sstevel@tonic-gate }
16970Sstevel@tonic-gate 
16980Sstevel@tonic-gate int
nfs4_async_putapage(vnode_t * vp,page_t * pp,u_offset_t off,size_t len,int flags,cred_t * cr,int (* putapage)(vnode_t *,page_t *,u_offset_t,size_t,int,cred_t *))16990Sstevel@tonic-gate nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
17005302Sth199096     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
17015302Sth199096     u_offset_t, size_t, int, cred_t *))
17020Sstevel@tonic-gate {
17030Sstevel@tonic-gate 	rnode4_t *rp;
17040Sstevel@tonic-gate 	mntinfo4_t *mi;
17050Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
17060Sstevel@tonic-gate 
17070Sstevel@tonic-gate 	ASSERT(flags & B_ASYNC);
17080Sstevel@tonic-gate 	ASSERT(vp->v_vfsp != NULL);
17090Sstevel@tonic-gate 
17100Sstevel@tonic-gate 	rp = VTOR4(vp);
17110Sstevel@tonic-gate 	ASSERT(rp->r_count > 0);
17120Sstevel@tonic-gate 
17130Sstevel@tonic-gate 	mi = VTOMI4(vp);
17140Sstevel@tonic-gate 
17150Sstevel@tonic-gate 	/*
17160Sstevel@tonic-gate 	 * If we can't allocate a request structure, do the putpage
17170Sstevel@tonic-gate 	 * operation synchronously in this thread's context.
17180Sstevel@tonic-gate 	 */
17190Sstevel@tonic-gate 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
17200Sstevel@tonic-gate 		goto noasync;
17210Sstevel@tonic-gate 
17220Sstevel@tonic-gate 	args->a_next = NULL;
17230Sstevel@tonic-gate #ifdef DEBUG
17240Sstevel@tonic-gate 	args->a_queuer = curthread;
17250Sstevel@tonic-gate #endif
17260Sstevel@tonic-gate 	VN_HOLD(vp);
17270Sstevel@tonic-gate 	args->a_vp = vp;
17280Sstevel@tonic-gate 	ASSERT(cr != NULL);
17290Sstevel@tonic-gate 	crhold(cr);
17300Sstevel@tonic-gate 	args->a_cred = cr;
17310Sstevel@tonic-gate 	args->a_io = NFS4_PUTAPAGE;
17320Sstevel@tonic-gate 	args->a_nfs4_putapage = putapage;
17330Sstevel@tonic-gate 	args->a_nfs4_pp = pp;
17340Sstevel@tonic-gate 	args->a_nfs4_off = off;
17350Sstevel@tonic-gate 	args->a_nfs4_len = (uint_t)len;
17360Sstevel@tonic-gate 	args->a_nfs4_flags = flags;
17370Sstevel@tonic-gate 
17380Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
17390Sstevel@tonic-gate 
17400Sstevel@tonic-gate 	/*
17410Sstevel@tonic-gate 	 * If asyncio has been disabled, then make a synchronous request.
17420Sstevel@tonic-gate 	 * This check is done a second time in case async io was diabled
17430Sstevel@tonic-gate 	 * while this thread was blocked waiting for memory pressure to
17440Sstevel@tonic-gate 	 * reduce or for the queue to drain.
17450Sstevel@tonic-gate 	 */
17460Sstevel@tonic-gate 	if (mi->mi_max_threads == 0) {
17470Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
17480Sstevel@tonic-gate 
17490Sstevel@tonic-gate 		VN_RELE(vp);
17500Sstevel@tonic-gate 		crfree(cr);
17510Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
17520Sstevel@tonic-gate 		goto noasync;
17530Sstevel@tonic-gate 	}
17540Sstevel@tonic-gate 
17550Sstevel@tonic-gate 	/*
17560Sstevel@tonic-gate 	 * Link request structure into the async list and
17570Sstevel@tonic-gate 	 * wakeup async thread to do the i/o.
17580Sstevel@tonic-gate 	 */
17590Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
17600Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
17610Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
17620Sstevel@tonic-gate 	} else {
17630Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
17640Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
17650Sstevel@tonic-gate 	}
17660Sstevel@tonic-gate 
17670Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
17680Sstevel@tonic-gate 	rp->r_count++;
17690Sstevel@tonic-gate 	rp->r_awcount++;
17700Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
17710Sstevel@tonic-gate 
17720Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
17730Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
17740Sstevel@tonic-gate 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
17750Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
17760Sstevel@tonic-gate 	}
17770Sstevel@tonic-gate 
17780Sstevel@tonic-gate 	mi->mi_async_req_count++;
17790Sstevel@tonic-gate 	ASSERT(mi->mi_async_req_count != 0);
17800Sstevel@tonic-gate 	cv_signal(&mi->mi_async_reqs_cv);
17810Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
17820Sstevel@tonic-gate 	return (0);
17830Sstevel@tonic-gate 
17840Sstevel@tonic-gate noasync:
17850Sstevel@tonic-gate 
17860Sstevel@tonic-gate 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1787766Scarlsonj 	    nfs_zone() == mi->mi_zone) {
17880Sstevel@tonic-gate 		/*
17890Sstevel@tonic-gate 		 * If we get here in the context of the pageout/fsflush,
17900Sstevel@tonic-gate 		 * or we have run out of memory or we're attempting to
17910Sstevel@tonic-gate 		 * unmount we refuse to do a sync write, because this may
17920Sstevel@tonic-gate 		 * hang pageout/fsflush and the machine. In this case,
17930Sstevel@tonic-gate 		 * we just re-mark the page as dirty and punt on the page.
17940Sstevel@tonic-gate 		 *
17950Sstevel@tonic-gate 		 * Make sure B_FORCE isn't set.  We can re-mark the
17960Sstevel@tonic-gate 		 * pages as dirty and unlock the pages in one swoop by
17970Sstevel@tonic-gate 		 * passing in B_ERROR to pvn_write_done().  However,
17980Sstevel@tonic-gate 		 * we should make sure B_FORCE isn't set - we don't
17990Sstevel@tonic-gate 		 * want the page tossed before it gets written out.
18000Sstevel@tonic-gate 		 */
18010Sstevel@tonic-gate 		if (flags & B_FORCE)
18020Sstevel@tonic-gate 			flags &= ~(B_INVAL | B_FORCE);
18030Sstevel@tonic-gate 		pvn_write_done(pp, flags | B_ERROR);
18040Sstevel@tonic-gate 		return (0);
18050Sstevel@tonic-gate 	}
18060Sstevel@tonic-gate 
18070Sstevel@tonic-gate 	/*
1808766Scarlsonj 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
18090Sstevel@tonic-gate 	 * which means that this was a cross-zone sync putpage.
18100Sstevel@tonic-gate 	 *
18110Sstevel@tonic-gate 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
18120Sstevel@tonic-gate 	 * as dirty and unlock them.
18130Sstevel@tonic-gate 	 *
18140Sstevel@tonic-gate 	 * We don't want to clear B_FORCE here as the caller presumably
18150Sstevel@tonic-gate 	 * knows what they're doing if they set it.
18160Sstevel@tonic-gate 	 */
18170Sstevel@tonic-gate 	pvn_write_done(pp, flags | B_ERROR);
18180Sstevel@tonic-gate 	return (EPERM);
18190Sstevel@tonic-gate }
18200Sstevel@tonic-gate 
18210Sstevel@tonic-gate int
nfs4_async_pageio(vnode_t * vp,page_t * pp,u_offset_t io_off,size_t io_len,int flags,cred_t * cr,int (* pageio)(vnode_t *,page_t *,u_offset_t,size_t,int,cred_t *))18220Sstevel@tonic-gate nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
18235302Sth199096     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
18245302Sth199096     size_t, int, cred_t *))
18250Sstevel@tonic-gate {
18260Sstevel@tonic-gate 	rnode4_t *rp;
18270Sstevel@tonic-gate 	mntinfo4_t *mi;
18280Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
18290Sstevel@tonic-gate 
18300Sstevel@tonic-gate 	ASSERT(flags & B_ASYNC);
18310Sstevel@tonic-gate 	ASSERT(vp->v_vfsp != NULL);
18320Sstevel@tonic-gate 
18330Sstevel@tonic-gate 	rp = VTOR4(vp);
18340Sstevel@tonic-gate 	ASSERT(rp->r_count > 0);
18350Sstevel@tonic-gate 
18360Sstevel@tonic-gate 	mi = VTOMI4(vp);
18370Sstevel@tonic-gate 
18380Sstevel@tonic-gate 	/*
18390Sstevel@tonic-gate 	 * If we can't allocate a request structure, do the pageio
18400Sstevel@tonic-gate 	 * request synchronously in this thread's context.
18410Sstevel@tonic-gate 	 */
18420Sstevel@tonic-gate 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
18430Sstevel@tonic-gate 		goto noasync;
18440Sstevel@tonic-gate 
18450Sstevel@tonic-gate 	args->a_next = NULL;
18460Sstevel@tonic-gate #ifdef DEBUG
18470Sstevel@tonic-gate 	args->a_queuer = curthread;
18480Sstevel@tonic-gate #endif
18490Sstevel@tonic-gate 	VN_HOLD(vp);
18500Sstevel@tonic-gate 	args->a_vp = vp;
18510Sstevel@tonic-gate 	ASSERT(cr != NULL);
18520Sstevel@tonic-gate 	crhold(cr);
18530Sstevel@tonic-gate 	args->a_cred = cr;
18540Sstevel@tonic-gate 	args->a_io = NFS4_PAGEIO;
18550Sstevel@tonic-gate 	args->a_nfs4_pageio = pageio;
18560Sstevel@tonic-gate 	args->a_nfs4_pp = pp;
18570Sstevel@tonic-gate 	args->a_nfs4_off = io_off;
18580Sstevel@tonic-gate 	args->a_nfs4_len = (uint_t)io_len;
18590Sstevel@tonic-gate 	args->a_nfs4_flags = flags;
18600Sstevel@tonic-gate 
18610Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
18620Sstevel@tonic-gate 
18630Sstevel@tonic-gate 	/*
18640Sstevel@tonic-gate 	 * If asyncio has been disabled, then make a synchronous request.
18650Sstevel@tonic-gate 	 * This check is done a second time in case async io was diabled
18660Sstevel@tonic-gate 	 * while this thread was blocked waiting for memory pressure to
18670Sstevel@tonic-gate 	 * reduce or for the queue to drain.
18680Sstevel@tonic-gate 	 */
18690Sstevel@tonic-gate 	if (mi->mi_max_threads == 0) {
18700Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
18710Sstevel@tonic-gate 
18720Sstevel@tonic-gate 		VN_RELE(vp);
18730Sstevel@tonic-gate 		crfree(cr);
18740Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
18750Sstevel@tonic-gate 		goto noasync;
18760Sstevel@tonic-gate 	}
18770Sstevel@tonic-gate 
18780Sstevel@tonic-gate 	/*
18790Sstevel@tonic-gate 	 * Link request structure into the async list and
18800Sstevel@tonic-gate 	 * wakeup async thread to do the i/o.
18810Sstevel@tonic-gate 	 */
18820Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
18830Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
18840Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PAGEIO] = args;
18850Sstevel@tonic-gate 	} else {
18860Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
18870Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_PAGEIO] = args;
18880Sstevel@tonic-gate 	}
18890Sstevel@tonic-gate 
18900Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
18910Sstevel@tonic-gate 	rp->r_count++;
18920Sstevel@tonic-gate 	rp->r_awcount++;
18930Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
18940Sstevel@tonic-gate 
18950Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
18960Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
18970Sstevel@tonic-gate 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
18980Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
18990Sstevel@tonic-gate 	}
19000Sstevel@tonic-gate 
19010Sstevel@tonic-gate 	mi->mi_async_req_count++;
19020Sstevel@tonic-gate 	ASSERT(mi->mi_async_req_count != 0);
19030Sstevel@tonic-gate 	cv_signal(&mi->mi_async_reqs_cv);
19040Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
19050Sstevel@tonic-gate 	return (0);
19060Sstevel@tonic-gate 
19070Sstevel@tonic-gate noasync:
19080Sstevel@tonic-gate 	/*
19090Sstevel@tonic-gate 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
19100Sstevel@tonic-gate 	 * the page list), for writes we do it synchronously, except for
19110Sstevel@tonic-gate 	 * proc_pageout/proc_fsflush as described below.
19120Sstevel@tonic-gate 	 */
19130Sstevel@tonic-gate 	if (flags & B_READ) {
19140Sstevel@tonic-gate 		pvn_read_done(pp, flags | B_ERROR);
19150Sstevel@tonic-gate 		return (0);
19160Sstevel@tonic-gate 	}
19170Sstevel@tonic-gate 
19180Sstevel@tonic-gate 	if (curproc == proc_pageout || curproc == proc_fsflush) {
19190Sstevel@tonic-gate 		/*
19200Sstevel@tonic-gate 		 * If we get here in the context of the pageout/fsflush,
19210Sstevel@tonic-gate 		 * we refuse to do a sync write, because this may hang
19220Sstevel@tonic-gate 		 * pageout/fsflush (and the machine). In this case, we just
19230Sstevel@tonic-gate 		 * re-mark the page as dirty and punt on the page.
19240Sstevel@tonic-gate 		 *
19250Sstevel@tonic-gate 		 * Make sure B_FORCE isn't set.  We can re-mark the
19260Sstevel@tonic-gate 		 * pages as dirty and unlock the pages in one swoop by
19270Sstevel@tonic-gate 		 * passing in B_ERROR to pvn_write_done().  However,
19280Sstevel@tonic-gate 		 * we should make sure B_FORCE isn't set - we don't
19290Sstevel@tonic-gate 		 * want the page tossed before it gets written out.
19300Sstevel@tonic-gate 		 */
19310Sstevel@tonic-gate 		if (flags & B_FORCE)
19320Sstevel@tonic-gate 			flags &= ~(B_INVAL | B_FORCE);
19330Sstevel@tonic-gate 		pvn_write_done(pp, flags | B_ERROR);
19340Sstevel@tonic-gate 		return (0);
19350Sstevel@tonic-gate 	}
19360Sstevel@tonic-gate 
1937766Scarlsonj 	if (nfs_zone() != mi->mi_zone) {
19380Sstevel@tonic-gate 		/*
19390Sstevel@tonic-gate 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
19400Sstevel@tonic-gate 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
19410Sstevel@tonic-gate 		 * them.
19420Sstevel@tonic-gate 		 *
19430Sstevel@tonic-gate 		 * We don't want to clear B_FORCE here as the caller presumably
19440Sstevel@tonic-gate 		 * knows what they're doing if they set it.
19450Sstevel@tonic-gate 		 */
19460Sstevel@tonic-gate 		pvn_write_done(pp, flags | B_ERROR);
19470Sstevel@tonic-gate 		return (EPERM);
19480Sstevel@tonic-gate 	}
19490Sstevel@tonic-gate 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
19500Sstevel@tonic-gate }
19510Sstevel@tonic-gate 
19520Sstevel@tonic-gate void
nfs4_async_readdir(vnode_t * vp,rddir4_cache * rdc,cred_t * cr,int (* readdir)(vnode_t *,rddir4_cache *,cred_t *))19530Sstevel@tonic-gate nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
19545302Sth199096     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
19550Sstevel@tonic-gate {
19560Sstevel@tonic-gate 	rnode4_t *rp;
19570Sstevel@tonic-gate 	mntinfo4_t *mi;
19580Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
19590Sstevel@tonic-gate 
19600Sstevel@tonic-gate 	rp = VTOR4(vp);
19610Sstevel@tonic-gate 	ASSERT(rp->r_freef == NULL);
19620Sstevel@tonic-gate 
19630Sstevel@tonic-gate 	mi = VTOMI4(vp);
19640Sstevel@tonic-gate 
19650Sstevel@tonic-gate 	/*
19660Sstevel@tonic-gate 	 * If we can't allocate a request structure, skip the readdir.
19670Sstevel@tonic-gate 	 */
19680Sstevel@tonic-gate 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
19690Sstevel@tonic-gate 		goto noasync;
19700Sstevel@tonic-gate 
19710Sstevel@tonic-gate 	args->a_next = NULL;
19720Sstevel@tonic-gate #ifdef DEBUG
19730Sstevel@tonic-gate 	args->a_queuer = curthread;
19740Sstevel@tonic-gate #endif
19750Sstevel@tonic-gate 	VN_HOLD(vp);
19760Sstevel@tonic-gate 	args->a_vp = vp;
19770Sstevel@tonic-gate 	ASSERT(cr != NULL);
19780Sstevel@tonic-gate 	crhold(cr);
19790Sstevel@tonic-gate 	args->a_cred = cr;
19800Sstevel@tonic-gate 	args->a_io = NFS4_READDIR;
19810Sstevel@tonic-gate 	args->a_nfs4_readdir = readdir;
19820Sstevel@tonic-gate 	args->a_nfs4_rdc = rdc;
19830Sstevel@tonic-gate 
19840Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
19850Sstevel@tonic-gate 
19860Sstevel@tonic-gate 	/*
19870Sstevel@tonic-gate 	 * If asyncio has been disabled, then skip this request
19880Sstevel@tonic-gate 	 */
19890Sstevel@tonic-gate 	if (mi->mi_max_threads == 0) {
19900Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
19910Sstevel@tonic-gate 
19920Sstevel@tonic-gate 		VN_RELE(vp);
19930Sstevel@tonic-gate 		crfree(cr);
19940Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
19950Sstevel@tonic-gate 		goto noasync;
19960Sstevel@tonic-gate 	}
19970Sstevel@tonic-gate 
19980Sstevel@tonic-gate 	/*
19990Sstevel@tonic-gate 	 * Link request structure into the async list and
20000Sstevel@tonic-gate 	 * wakeup async thread to do the i/o.
20010Sstevel@tonic-gate 	 */
20020Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
20030Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_READDIR] = args;
20040Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READDIR] = args;
20050Sstevel@tonic-gate 	} else {
20060Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
20070Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_READDIR] = args;
20080Sstevel@tonic-gate 	}
20090Sstevel@tonic-gate 
20100Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
20110Sstevel@tonic-gate 	rp->r_count++;
20120Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
20130Sstevel@tonic-gate 
20140Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
20150Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
20160Sstevel@tonic-gate 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
20170Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
20180Sstevel@tonic-gate 	}
20190Sstevel@tonic-gate 
20200Sstevel@tonic-gate 	mi->mi_async_req_count++;
20210Sstevel@tonic-gate 	ASSERT(mi->mi_async_req_count != 0);
20220Sstevel@tonic-gate 	cv_signal(&mi->mi_async_reqs_cv);
20230Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
20240Sstevel@tonic-gate 	return;
20250Sstevel@tonic-gate 
20260Sstevel@tonic-gate noasync:
20270Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
20280Sstevel@tonic-gate 	rdc->entries = NULL;
20290Sstevel@tonic-gate 	/*
20300Sstevel@tonic-gate 	 * Indicate that no one is trying to fill this entry and
20310Sstevel@tonic-gate 	 * it still needs to be filled.
20320Sstevel@tonic-gate 	 */
20330Sstevel@tonic-gate 	rdc->flags &= ~RDDIR;
20340Sstevel@tonic-gate 	rdc->flags |= RDDIRREQ;
20350Sstevel@tonic-gate 	rddir4_cache_rele(rp, rdc);
20360Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
20370Sstevel@tonic-gate }
20380Sstevel@tonic-gate 
20390Sstevel@tonic-gate void
nfs4_async_commit(vnode_t * vp,page_t * plist,offset3 offset,count3 count,cred_t * cr,void (* commit)(vnode_t *,page_t *,offset3,count3,cred_t *))20400Sstevel@tonic-gate nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
20415302Sth199096     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
20425302Sth199096     cred_t *))
20430Sstevel@tonic-gate {
20440Sstevel@tonic-gate 	rnode4_t *rp;
20450Sstevel@tonic-gate 	mntinfo4_t *mi;
20460Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
20470Sstevel@tonic-gate 	page_t *pp;
20480Sstevel@tonic-gate 
20490Sstevel@tonic-gate 	rp = VTOR4(vp);
20500Sstevel@tonic-gate 	mi = VTOMI4(vp);
20510Sstevel@tonic-gate 
20520Sstevel@tonic-gate 	/*
20530Sstevel@tonic-gate 	 * If we can't allocate a request structure, do the commit
20540Sstevel@tonic-gate 	 * operation synchronously in this thread's context.
20550Sstevel@tonic-gate 	 */
20560Sstevel@tonic-gate 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
20570Sstevel@tonic-gate 		goto noasync;
20580Sstevel@tonic-gate 
20590Sstevel@tonic-gate 	args->a_next = NULL;
20600Sstevel@tonic-gate #ifdef DEBUG
20610Sstevel@tonic-gate 	args->a_queuer = curthread;
20620Sstevel@tonic-gate #endif
20630Sstevel@tonic-gate 	VN_HOLD(vp);
20640Sstevel@tonic-gate 	args->a_vp = vp;
20650Sstevel@tonic-gate 	ASSERT(cr != NULL);
20660Sstevel@tonic-gate 	crhold(cr);
20670Sstevel@tonic-gate 	args->a_cred = cr;
20680Sstevel@tonic-gate 	args->a_io = NFS4_COMMIT;
20690Sstevel@tonic-gate 	args->a_nfs4_commit = commit;
20700Sstevel@tonic-gate 	args->a_nfs4_plist = plist;
20710Sstevel@tonic-gate 	args->a_nfs4_offset = offset;
20720Sstevel@tonic-gate 	args->a_nfs4_count = count;
20730Sstevel@tonic-gate 
20740Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
20750Sstevel@tonic-gate 
20760Sstevel@tonic-gate 	/*
20770Sstevel@tonic-gate 	 * If asyncio has been disabled, then make a synchronous request.
20780Sstevel@tonic-gate 	 * This check is done a second time in case async io was diabled
20790Sstevel@tonic-gate 	 * while this thread was blocked waiting for memory pressure to
20800Sstevel@tonic-gate 	 * reduce or for the queue to drain.
20810Sstevel@tonic-gate 	 */
20820Sstevel@tonic-gate 	if (mi->mi_max_threads == 0) {
20830Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
20840Sstevel@tonic-gate 
20850Sstevel@tonic-gate 		VN_RELE(vp);
20860Sstevel@tonic-gate 		crfree(cr);
20870Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
20880Sstevel@tonic-gate 		goto noasync;
20890Sstevel@tonic-gate 	}
20900Sstevel@tonic-gate 
20910Sstevel@tonic-gate 	/*
20920Sstevel@tonic-gate 	 * Link request structure into the async list and
20930Sstevel@tonic-gate 	 * wakeup async thread to do the i/o.
20940Sstevel@tonic-gate 	 */
20950Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
20960Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_COMMIT] = args;
20970Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_COMMIT] = args;
20980Sstevel@tonic-gate 	} else {
20990Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
21000Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_COMMIT] = args;
21010Sstevel@tonic-gate 	}
21020Sstevel@tonic-gate 
21030Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
21040Sstevel@tonic-gate 	rp->r_count++;
21050Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
21060Sstevel@tonic-gate 
21070Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
21080Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
21090Sstevel@tonic-gate 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
21100Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
21110Sstevel@tonic-gate 	}
21120Sstevel@tonic-gate 
21130Sstevel@tonic-gate 	mi->mi_async_req_count++;
21140Sstevel@tonic-gate 	ASSERT(mi->mi_async_req_count != 0);
21150Sstevel@tonic-gate 	cv_signal(&mi->mi_async_reqs_cv);
21160Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
21170Sstevel@tonic-gate 	return;
21180Sstevel@tonic-gate 
21190Sstevel@tonic-gate noasync:
21200Sstevel@tonic-gate 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2121766Scarlsonj 	    nfs_zone() != mi->mi_zone) {
21220Sstevel@tonic-gate 		while (plist != NULL) {
21230Sstevel@tonic-gate 			pp = plist;
21240Sstevel@tonic-gate 			page_sub(&plist, pp);
21250Sstevel@tonic-gate 			pp->p_fsdata = C_COMMIT;
21260Sstevel@tonic-gate 			page_unlock(pp);
21270Sstevel@tonic-gate 		}
21280Sstevel@tonic-gate 		return;
21290Sstevel@tonic-gate 	}
21300Sstevel@tonic-gate 	(*commit)(vp, plist, offset, count, cr);
21310Sstevel@tonic-gate }
21320Sstevel@tonic-gate 
21330Sstevel@tonic-gate /*
21340Sstevel@tonic-gate  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
21350Sstevel@tonic-gate  * reference to the vnode is handed over to the thread; the caller should
21360Sstevel@tonic-gate  * no longer refer to the vnode.
21370Sstevel@tonic-gate  *
21380Sstevel@tonic-gate  * Unlike most of the async routines, this handoff is needed for
21390Sstevel@tonic-gate  * correctness reasons, not just performance.  So doing operations in the
21400Sstevel@tonic-gate  * context of the current thread is not an option.
21410Sstevel@tonic-gate  */
21420Sstevel@tonic-gate void
nfs4_async_inactive(vnode_t * vp,cred_t * cr)21430Sstevel@tonic-gate nfs4_async_inactive(vnode_t *vp, cred_t *cr)
21440Sstevel@tonic-gate {
21450Sstevel@tonic-gate 	mntinfo4_t *mi;
21460Sstevel@tonic-gate 	struct nfs4_async_reqs *args;
21470Sstevel@tonic-gate 	boolean_t signal_inactive_thread = B_FALSE;
21480Sstevel@tonic-gate 
21490Sstevel@tonic-gate 	mi = VTOMI4(vp);
21500Sstevel@tonic-gate 
21510Sstevel@tonic-gate 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
21520Sstevel@tonic-gate 	args->a_next = NULL;
21530Sstevel@tonic-gate #ifdef DEBUG
21540Sstevel@tonic-gate 	args->a_queuer = curthread;
21550Sstevel@tonic-gate #endif
21560Sstevel@tonic-gate 	args->a_vp = vp;
21570Sstevel@tonic-gate 	ASSERT(cr != NULL);
21580Sstevel@tonic-gate 	crhold(cr);
21590Sstevel@tonic-gate 	args->a_cred = cr;
21600Sstevel@tonic-gate 	args->a_io = NFS4_INACTIVE;
21610Sstevel@tonic-gate 
21620Sstevel@tonic-gate 	/*
21630Sstevel@tonic-gate 	 * Note that we don't check mi->mi_max_threads here, since we
21640Sstevel@tonic-gate 	 * *need* to get rid of this vnode regardless of whether someone
21650Sstevel@tonic-gate 	 * set nfs4_max_threads to zero in /etc/system.
21660Sstevel@tonic-gate 	 *
21670Sstevel@tonic-gate 	 * The manager thread knows about this and is willing to create
21685331Samw 	 * at least one thread to accommodate us.
21690Sstevel@tonic-gate 	 */
21700Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
21710Sstevel@tonic-gate 	if (mi->mi_inactive_thread == NULL) {
21720Sstevel@tonic-gate 		rnode4_t *rp;
21730Sstevel@tonic-gate 		vnode_t *unldvp = NULL;
21740Sstevel@tonic-gate 		char *unlname;
21750Sstevel@tonic-gate 		cred_t *unlcred;
21760Sstevel@tonic-gate 
21770Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
21780Sstevel@tonic-gate 		/*
21790Sstevel@tonic-gate 		 * We just need to free up the memory associated with the
21800Sstevel@tonic-gate 		 * vnode, which can be safely done from within the current
21810Sstevel@tonic-gate 		 * context.
21820Sstevel@tonic-gate 		 */
21830Sstevel@tonic-gate 		crfree(cr);	/* drop our reference */
21840Sstevel@tonic-gate 		kmem_free(args, sizeof (*args));
21850Sstevel@tonic-gate 		rp = VTOR4(vp);
21860Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
21870Sstevel@tonic-gate 		if (rp->r_unldvp != NULL) {
21880Sstevel@tonic-gate 			unldvp = rp->r_unldvp;
21890Sstevel@tonic-gate 			rp->r_unldvp = NULL;
21900Sstevel@tonic-gate 			unlname = rp->r_unlname;
21910Sstevel@tonic-gate 			rp->r_unlname = NULL;
21920Sstevel@tonic-gate 			unlcred = rp->r_unlcred;
21930Sstevel@tonic-gate 			rp->r_unlcred = NULL;
21940Sstevel@tonic-gate 		}
21950Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
21960Sstevel@tonic-gate 		/*
21970Sstevel@tonic-gate 		 * No need to explicitly throw away any cached pages.  The
21980Sstevel@tonic-gate 		 * eventual r4inactive() will attempt a synchronous
21990Sstevel@tonic-gate 		 * VOP_PUTPAGE() which will immediately fail since the request
22000Sstevel@tonic-gate 		 * is coming from the wrong zone, and then will proceed to call
22010Sstevel@tonic-gate 		 * nfs4_invalidate_pages() which will clean things up for us.
22020Sstevel@tonic-gate 		 *
22030Sstevel@tonic-gate 		 * Throw away the delegation here so rp4_addfree()'s attempt to
22040Sstevel@tonic-gate 		 * return any existing delegations becomes a no-op.
22050Sstevel@tonic-gate 		 */
22061705Sjwahlig 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
22071705Sjwahlig 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
22085302Sth199096 			    FALSE);
22090Sstevel@tonic-gate 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
22101705Sjwahlig 			nfs_rw_exit(&mi->mi_recovlock);
22111705Sjwahlig 		}
22120Sstevel@tonic-gate 		nfs4_clear_open_streams(rp);
22130Sstevel@tonic-gate 
22140Sstevel@tonic-gate 		rp4_addfree(rp, cr);
22150Sstevel@tonic-gate 		if (unldvp != NULL) {
22160Sstevel@tonic-gate 			kmem_free(unlname, MAXNAMELEN);
22170Sstevel@tonic-gate 			VN_RELE(unldvp);
22180Sstevel@tonic-gate 			crfree(unlcred);
22190Sstevel@tonic-gate 		}
22200Sstevel@tonic-gate 		return;
22210Sstevel@tonic-gate 	}
22220Sstevel@tonic-gate 
22230Sstevel@tonic-gate 	if (mi->mi_manager_thread == NULL) {
22240Sstevel@tonic-gate 		/*
22250Sstevel@tonic-gate 		 * We want to talk to the inactive thread.
22260Sstevel@tonic-gate 		 */
22270Sstevel@tonic-gate 		signal_inactive_thread = B_TRUE;
22280Sstevel@tonic-gate 	}
22290Sstevel@tonic-gate 
22300Sstevel@tonic-gate 	/*
22310Sstevel@tonic-gate 	 * Enqueue the vnode and wake up either the special thread (empty
22320Sstevel@tonic-gate 	 * list) or an async thread.
22330Sstevel@tonic-gate 	 */
22340Sstevel@tonic-gate 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
22350Sstevel@tonic-gate 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
22360Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_INACTIVE] = args;
22370Sstevel@tonic-gate 		signal_inactive_thread = B_TRUE;
22380Sstevel@tonic-gate 	} else {
22390Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
22400Sstevel@tonic-gate 		mi->mi_async_tail[NFS4_INACTIVE] = args;
22410Sstevel@tonic-gate 	}
22420Sstevel@tonic-gate 	if (signal_inactive_thread) {
22430Sstevel@tonic-gate 		cv_signal(&mi->mi_inact_req_cv);
22440Sstevel@tonic-gate 	} else  {
22450Sstevel@tonic-gate 		mi->mi_async_req_count++;
22460Sstevel@tonic-gate 		ASSERT(mi->mi_async_req_count != 0);
22470Sstevel@tonic-gate 		cv_signal(&mi->mi_async_reqs_cv);
22480Sstevel@tonic-gate 	}
22490Sstevel@tonic-gate 
22500Sstevel@tonic-gate 	mutex_exit(&mi->mi_async_lock);
22510Sstevel@tonic-gate }
22520Sstevel@tonic-gate 
/*
 * writerp4 - copy up to tcount bytes of write data from uio into the
 * cached pages of the file represented by rp.
 *
 * The data is moved in chunks that never cross a page boundary.  When
 * vpm_enable is set each chunk is copied with vpm_data_copy(); otherwise
 * it is copied with uiomove() to the mapping at base, and base is
 * advanced by the number of bytes copied.  Around every chunk
 * R4MODINPROGRESS is set and then cleared in r_flags, r_size is raised
 * to uio_loffset if the write went past it, and R4DIRTY is set.
 *
 * A nonzero pgcreated means the caller has already created the page
 * mapped at base (see the comment in the loop); on the non-vpm path it
 * is cleared after the first chunk.
 *
 * The caller must hold rp->r_rwlock as a writer (asserted below).
 *
 * Returns the result of the last copy (the loop continues only while it
 * is 0) or, if that copy succeeded, the result of segmap_fault().
 */
22530Sstevel@tonic-gate int
writerp4(rnode4_t * rp,caddr_t base,int tcount,struct uio * uio,int pgcreated)22540Sstevel@tonic-gate writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
22550Sstevel@tonic-gate {
22560Sstevel@tonic-gate 	int pagecreate;
22570Sstevel@tonic-gate 	int n;
22580Sstevel@tonic-gate 	int saved_n;
22590Sstevel@tonic-gate 	caddr_t saved_base;
22600Sstevel@tonic-gate 	u_offset_t offset;
22610Sstevel@tonic-gate 	int error;
22620Sstevel@tonic-gate 	int sm_error;
22631841Spraks 	vnode_t *vp = RTOV(rp);
22640Sstevel@tonic-gate 
22650Sstevel@tonic-gate 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
22660Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
22671841Spraks 	if (!vpm_enable) {
22681841Spraks 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
22691841Spraks 	}
22700Sstevel@tonic-gate 
22710Sstevel@tonic-gate 	/*
22720Sstevel@tonic-gate 	 * Move bytes in at most PAGESIZE chunks. We must avoid
22730Sstevel@tonic-gate 	 * spanning pages in uiomove() because page faults may cause
22740Sstevel@tonic-gate 	 * the cache to be invalidated out from under us. The r_size is not
22750Sstevel@tonic-gate 	 * updated until after the uiomove. If we push the last page of a
22760Sstevel@tonic-gate 	 * file before r_size is correct, we will lose the data written past
22770Sstevel@tonic-gate 	 * the current (and invalid) r_size.
22780Sstevel@tonic-gate 	 */
22790Sstevel@tonic-gate 	do {
22800Sstevel@tonic-gate 		offset = uio->uio_loffset;
22810Sstevel@tonic-gate 		pagecreate = 0;
22820Sstevel@tonic-gate 
22830Sstevel@tonic-gate 		/*
22840Sstevel@tonic-gate 		 * n is the number of bytes required to satisfy the request
22850Sstevel@tonic-gate 		 *   or the number of bytes to fill out the page.
22860Sstevel@tonic-gate 		 */
22871841Spraks 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
22880Sstevel@tonic-gate 
22890Sstevel@tonic-gate 		/*
22900Sstevel@tonic-gate 		 * Check to see if we can skip reading in the page
22910Sstevel@tonic-gate 		 * and just allocate the memory.  We can do this
22920Sstevel@tonic-gate 		 * if we are going to rewrite the entire mapping
22930Sstevel@tonic-gate 		 * or if we are going to write to or beyond the current
22940Sstevel@tonic-gate 		 * end of file from the beginning of the mapping.
22950Sstevel@tonic-gate 		 *
22960Sstevel@tonic-gate 		 * The read of r_size is now protected by r_statelock.
22970Sstevel@tonic-gate 		 */
22980Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
22990Sstevel@tonic-gate 		/*
23000Sstevel@tonic-gate 		 * When pgcreated is nonzero the caller has already done
23010Sstevel@tonic-gate 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
23020Sstevel@tonic-gate 		 * segkpm this means we already have at least one page
23030Sstevel@tonic-gate 		 * created and mapped at base.
23040Sstevel@tonic-gate 		 */
23050Sstevel@tonic-gate 		pagecreate = pgcreated ||
23065302Sth199096 		    ((offset & PAGEOFFSET) == 0 &&
23075302Sth199096 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
23080Sstevel@tonic-gate 
23090Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
23100Sstevel@tonic-gate 
23111841Spraks 		if (!vpm_enable && pagecreate) {
23120Sstevel@tonic-gate 			/*
23130Sstevel@tonic-gate 			 * The last argument tells segmap_pagecreate() to
23140Sstevel@tonic-gate 			 * always lock the page, as opposed to sometimes
23150Sstevel@tonic-gate 			 * returning with the page locked. This way we avoid a
23160Sstevel@tonic-gate 			 * fault on the ensuing uiomove(), but also
23170Sstevel@tonic-gate 			 * more importantly (to fix bug 1094402) we can
23180Sstevel@tonic-gate 			 * call segmap_fault() to unlock the page in all
23190Sstevel@tonic-gate 			 * cases. An alternative would be to modify
23200Sstevel@tonic-gate 			 * segmap_pagecreate() to tell us when it is
23210Sstevel@tonic-gate 			 * locking a page, but that's a fairly major
23220Sstevel@tonic-gate 			 * interface change.
23230Sstevel@tonic-gate 			 */
23240Sstevel@tonic-gate 			if (pgcreated == 0)
23250Sstevel@tonic-gate 				(void) segmap_pagecreate(segkmap, base,
23265302Sth199096 				    (uint_t)n, 1);
			/*
			 * Remember where this chunk starts and how long it
			 * is: base and n are both changed below, before the
			 * segmap_fault() call that uses these values.
			 */
23270Sstevel@tonic-gate 			saved_base = base;
23280Sstevel@tonic-gate 			saved_n = n;
23290Sstevel@tonic-gate 		}
23300Sstevel@tonic-gate 
23310Sstevel@tonic-gate 		/*
23320Sstevel@tonic-gate 		 * The number of bytes of data in the last page cannot
23330Sstevel@tonic-gate 		 * be accurately determined while the page is being
23340Sstevel@tonic-gate 		 * uiomove'd to and the size of the file is being updated.
23350Sstevel@tonic-gate 		 * Thus, inform threads which need to know accurately
23360Sstevel@tonic-gate 		 * how much data is in the last page of the file.  They
23370Sstevel@tonic-gate 		 * will not do the i/o immediately, but will arrange for
23380Sstevel@tonic-gate 		 * the i/o to happen later when this modify operation
23390Sstevel@tonic-gate 		 * will have finished.
23400Sstevel@tonic-gate 		 */
23410Sstevel@tonic-gate 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
23420Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
23430Sstevel@tonic-gate 		rp->r_flags |= R4MODINPROGRESS;
23440Sstevel@tonic-gate 		rp->r_modaddr = (offset & MAXBMASK);
23450Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
23460Sstevel@tonic-gate 
23471841Spraks 		if (vpm_enable) {
23481841Spraks 			/*
23491841Spraks 			 * Copy data. If new pages are created, part of
23501841Spraks 			 * the page that is not written will be initialized
23511841Spraks 			 * with zeros.
23521841Spraks 			 */
23531841Spraks 			error = vpm_data_copy(vp, offset, n, uio,
23545302Sth199096 			    !pagecreate, NULL, 0, S_WRITE);
23551841Spraks 		} else {
23561841Spraks 			error = uiomove(base, n, UIO_WRITE, uio);
23571841Spraks 		}
23580Sstevel@tonic-gate 
23590Sstevel@tonic-gate 		/*
23600Sstevel@tonic-gate 		 * r_size is the maximum number of
23610Sstevel@tonic-gate 		 * bytes known to be in the file.
23620Sstevel@tonic-gate 		 * Make sure it is at least as high as the
23630Sstevel@tonic-gate 		 * first unwritten byte pointed to by uio_loffset.
23640Sstevel@tonic-gate 		 */
23650Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
23660Sstevel@tonic-gate 		if (rp->r_size < uio->uio_loffset)
23670Sstevel@tonic-gate 			rp->r_size = uio->uio_loffset;
23680Sstevel@tonic-gate 		rp->r_flags &= ~R4MODINPROGRESS;
23690Sstevel@tonic-gate 		rp->r_flags |= R4DIRTY;
23700Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
23710Sstevel@tonic-gate 
23720Sstevel@tonic-gate 		/* n = # of bytes written */
23730Sstevel@tonic-gate 		n = (int)(uio->uio_loffset - offset);
23741841Spraks 
		/* base is only used, and so only advanced, on the non-vpm path */
23751841Spraks 		if (!vpm_enable) {
23761841Spraks 			base += n;
23771841Spraks 		}
23781841Spraks 
23790Sstevel@tonic-gate 		tcount -= n;
23800Sstevel@tonic-gate 		/*
23810Sstevel@tonic-gate 		 * If we created pages w/o initializing them completely,
23820Sstevel@tonic-gate 		 * we need to zero the part that wasn't set up.
23830Sstevel@tonic-gate 		 * This happens in most EOF write cases and if
23840Sstevel@tonic-gate 		 * we had some sort of error during the uiomove.
23850Sstevel@tonic-gate 		 */
23861841Spraks 		if (!vpm_enable && pagecreate) {
23870Sstevel@tonic-gate 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
23880Sstevel@tonic-gate 				(void) kzero(base, PAGESIZE - n);
23890Sstevel@tonic-gate 
23900Sstevel@tonic-gate 			if (pgcreated) {
23910Sstevel@tonic-gate 				/*
23920Sstevel@tonic-gate 				 * Caller is responsible for this page,
23930Sstevel@tonic-gate 				 * it was not created in this loop.
23940Sstevel@tonic-gate 				 */
23950Sstevel@tonic-gate 				pgcreated = 0;
23960Sstevel@tonic-gate 			} else {
23970Sstevel@tonic-gate 				/*
23980Sstevel@tonic-gate 				 * For bug 1094402: segmap_pagecreate locks
23990Sstevel@tonic-gate 				 * page. Unlock it. This also unlocks the
24000Sstevel@tonic-gate 				 * pages allocated by page_create_va() in
24010Sstevel@tonic-gate 				 * segmap_pagecreate().
24020Sstevel@tonic-gate 				 */
24030Sstevel@tonic-gate 				sm_error = segmap_fault(kas.a_hat, segkmap,
24045302Sth199096 				    saved_base, saved_n,
24055302Sth199096 				    F_SOFTUNLOCK, S_WRITE);
24060Sstevel@tonic-gate 				if (error == 0)
24070Sstevel@tonic-gate 					error = sm_error;
24080Sstevel@tonic-gate 			}
24090Sstevel@tonic-gate 		}
24100Sstevel@tonic-gate 	} while (tcount > 0 && error == 0);
24110Sstevel@tonic-gate 
24120Sstevel@tonic-gate 	return (error);
24130Sstevel@tonic-gate }
24140Sstevel@tonic-gate 
24150Sstevel@tonic-gate int
nfs4_putpages(vnode_t * vp,u_offset_t off,size_t len,int flags,cred_t * cr)24160Sstevel@tonic-gate nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
24170Sstevel@tonic-gate {
24180Sstevel@tonic-gate 	rnode4_t *rp;
24190Sstevel@tonic-gate 	page_t *pp;
24200Sstevel@tonic-gate 	u_offset_t eoff;
24210Sstevel@tonic-gate 	u_offset_t io_off;
24220Sstevel@tonic-gate 	size_t io_len;
24230Sstevel@tonic-gate 	int error;
24240Sstevel@tonic-gate 	int rdirty;
24250Sstevel@tonic-gate 	int err;
24260Sstevel@tonic-gate 
24270Sstevel@tonic-gate 	rp = VTOR4(vp);
24280Sstevel@tonic-gate 	ASSERT(rp->r_count > 0);
24290Sstevel@tonic-gate 
24300Sstevel@tonic-gate 	if (!nfs4_has_pages(vp))
24310Sstevel@tonic-gate 		return (0);
24320Sstevel@tonic-gate 
24330Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
24340Sstevel@tonic-gate 
24350Sstevel@tonic-gate 	/*
24360Sstevel@tonic-gate 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
24370Sstevel@tonic-gate 	 * writes.  B_FORCE is set to force the VM system to actually
24380Sstevel@tonic-gate 	 * invalidate the pages, even if the i/o failed.  The pages
24390Sstevel@tonic-gate 	 * need to get invalidated because they can't be written out
24400Sstevel@tonic-gate 	 * because there isn't any space left on either the server's
24410Sstevel@tonic-gate 	 * file system or in the user's disk quota.  The B_FREE bit
24420Sstevel@tonic-gate 	 * is cleared to avoid confusion as to whether this is a
24430Sstevel@tonic-gate 	 * request to place the page on the freelist or to destroy
24440Sstevel@tonic-gate 	 * it.
24450Sstevel@tonic-gate 	 */
24460Sstevel@tonic-gate 	if ((rp->r_flags & R4OUTOFSPACE) ||
24470Sstevel@tonic-gate 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
24480Sstevel@tonic-gate 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
24490Sstevel@tonic-gate 
24500Sstevel@tonic-gate 	if (len == 0) {
24510Sstevel@tonic-gate 		/*
24520Sstevel@tonic-gate 		 * If doing a full file synchronous operation, then clear
24530Sstevel@tonic-gate 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
24540Sstevel@tonic-gate 		 * is happening, then R4DIRTY will get set again.  The
24550Sstevel@tonic-gate 		 * R4DIRTY bit must get cleared before the flush so that
24560Sstevel@tonic-gate 		 * we don't lose this information.
24573101Sthurlow 		 *
24583101Sthurlow 		 * If there are no full file async write operations
24593101Sthurlow 		 * pending and RDIRTY bit is set, clear it.
24600Sstevel@tonic-gate 		 */
24610Sstevel@tonic-gate 		if (off == (u_offset_t)0 &&
24620Sstevel@tonic-gate 		    !(flags & B_ASYNC) &&
24630Sstevel@tonic-gate 		    (rp->r_flags & R4DIRTY)) {
24640Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
24650Sstevel@tonic-gate 			rdirty = (rp->r_flags & R4DIRTY);
24660Sstevel@tonic-gate 			rp->r_flags &= ~R4DIRTY;
24670Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
24683101Sthurlow 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
24693101Sthurlow 			mutex_enter(&rp->r_statelock);
24703101Sthurlow 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
24713101Sthurlow 				rdirty = (rp->r_flags & R4DIRTY);
24723101Sthurlow 				rp->r_flags &= ~R4DIRTY;
24733101Sthurlow 			}
24743101Sthurlow 			mutex_exit(&rp->r_statelock);
24750Sstevel@tonic-gate 		} else
24760Sstevel@tonic-gate 			rdirty = 0;
24770Sstevel@tonic-gate 
24780Sstevel@tonic-gate 		/*
24790Sstevel@tonic-gate 		 * Search the entire vp list for pages >= off, and flush
24800Sstevel@tonic-gate 		 * the dirty pages.
24810Sstevel@tonic-gate 		 */
24820Sstevel@tonic-gate 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
24835302Sth199096 		    flags, cr);
24840Sstevel@tonic-gate 
24850Sstevel@tonic-gate 		/*
24865331Samw 		 * If an error occurred and the file was marked as dirty
24870Sstevel@tonic-gate 		 * before and we aren't forcibly invalidating pages, then
24880Sstevel@tonic-gate 		 * reset the R4DIRTY flag.
24890Sstevel@tonic-gate 		 */
24900Sstevel@tonic-gate 		if (error && rdirty &&
24910Sstevel@tonic-gate 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
24920Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
24930Sstevel@tonic-gate 			rp->r_flags |= R4DIRTY;
24940Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
24950Sstevel@tonic-gate 		}
24960Sstevel@tonic-gate 	} else {
24970Sstevel@tonic-gate 		/*
24980Sstevel@tonic-gate 		 * Do a range from [off...off + len) looking for pages
24990Sstevel@tonic-gate 		 * to deal with.
25000Sstevel@tonic-gate 		 */
25010Sstevel@tonic-gate 		error = 0;
25020Sstevel@tonic-gate 		io_len = 0;
25030Sstevel@tonic-gate 		eoff = off + len;
25040Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
25050Sstevel@tonic-gate 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
25060Sstevel@tonic-gate 		    io_off += io_len) {
25070Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
25080Sstevel@tonic-gate 			/*
25090Sstevel@tonic-gate 			 * If we are not invalidating, synchronously
25100Sstevel@tonic-gate 			 * freeing or writing pages use the routine
25110Sstevel@tonic-gate 			 * page_lookup_nowait() to prevent reclaiming
25120Sstevel@tonic-gate 			 * them from the free list.
25130Sstevel@tonic-gate 			 */
25140Sstevel@tonic-gate 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
25150Sstevel@tonic-gate 				pp = page_lookup(vp, io_off,
25160Sstevel@tonic-gate 				    (flags & (B_INVAL | B_FREE)) ?
25170Sstevel@tonic-gate 				    SE_EXCL : SE_SHARED);
25180Sstevel@tonic-gate 			} else {
25190Sstevel@tonic-gate 				pp = page_lookup_nowait(vp, io_off,
25200Sstevel@tonic-gate 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
25210Sstevel@tonic-gate 			}
25220Sstevel@tonic-gate 
25230Sstevel@tonic-gate 			if (pp == NULL || !pvn_getdirty(pp, flags))
25240Sstevel@tonic-gate 				io_len = PAGESIZE;
25250Sstevel@tonic-gate 			else {
25260Sstevel@tonic-gate 				err = (*rp->r_putapage)(vp, pp, &io_off,
25270Sstevel@tonic-gate 				    &io_len, flags, cr);
25280Sstevel@tonic-gate 				if (!error)
25290Sstevel@tonic-gate 					error = err;
25300Sstevel@tonic-gate 				/*
25310Sstevel@tonic-gate 				 * "io_off" and "io_len" are returned as
25320Sstevel@tonic-gate 				 * the range of pages we actually wrote.
25330Sstevel@tonic-gate 				 * This allows us to skip ahead more quickly
25340Sstevel@tonic-gate 				 * since several pages may've been dealt
25350Sstevel@tonic-gate 				 * with by this iteration of the loop.
25360Sstevel@tonic-gate 				 */
25370Sstevel@tonic-gate 			}
25380Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
25390Sstevel@tonic-gate 		}
25400Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
25410Sstevel@tonic-gate 	}
25420Sstevel@tonic-gate 
25430Sstevel@tonic-gate 	return (error);
25440Sstevel@tonic-gate }
25450Sstevel@tonic-gate 
25460Sstevel@tonic-gate void
nfs4_invalidate_pages(vnode_t * vp,u_offset_t off,cred_t * cr)25470Sstevel@tonic-gate nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
25480Sstevel@tonic-gate {
25490Sstevel@tonic-gate 	rnode4_t *rp;
25500Sstevel@tonic-gate 
25510Sstevel@tonic-gate 	rp = VTOR4(vp);
25520Sstevel@tonic-gate 	if (IS_SHADOW(vp, rp))
25530Sstevel@tonic-gate 		vp = RTOV4(rp);
25540Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
25550Sstevel@tonic-gate 	while (rp->r_flags & R4TRUNCATE)
25560Sstevel@tonic-gate 		cv_wait(&rp->r_cv, &rp->r_statelock);
25570Sstevel@tonic-gate 	rp->r_flags |= R4TRUNCATE;
25580Sstevel@tonic-gate 	if (off == (u_offset_t)0) {
25590Sstevel@tonic-gate 		rp->r_flags &= ~R4DIRTY;
25600Sstevel@tonic-gate 		if (!(rp->r_flags & R4STALE))
25610Sstevel@tonic-gate 			rp->r_error = 0;
25620Sstevel@tonic-gate 	}
25630Sstevel@tonic-gate 	rp->r_truncaddr = off;
25640Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
25650Sstevel@tonic-gate 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
25665302Sth199096 	    B_INVAL | B_TRUNC, cr);
25670Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
25680Sstevel@tonic-gate 	rp->r_flags &= ~R4TRUNCATE;
25690Sstevel@tonic-gate 	cv_broadcast(&rp->r_cv);
25700Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
25710Sstevel@tonic-gate }
25720Sstevel@tonic-gate 
25730Sstevel@tonic-gate static int
nfs4_mnt_kstat_update(kstat_t * ksp,int rw)25740Sstevel@tonic-gate nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
25750Sstevel@tonic-gate {
25760Sstevel@tonic-gate 	mntinfo4_t *mi;
25770Sstevel@tonic-gate 	struct mntinfo_kstat *mik;
25780Sstevel@tonic-gate 	vfs_t *vfsp;
25790Sstevel@tonic-gate 
25800Sstevel@tonic-gate 	/* this is a read-only kstat. Bail out on a write */
25810Sstevel@tonic-gate 	if (rw == KSTAT_WRITE)
25820Sstevel@tonic-gate 		return (EACCES);
25830Sstevel@tonic-gate 
25840Sstevel@tonic-gate 
25850Sstevel@tonic-gate 	/*
25860Sstevel@tonic-gate 	 * We don't want to wait here as kstat_chain_lock could be held by
25870Sstevel@tonic-gate 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
25880Sstevel@tonic-gate 	 * and thus could lead to a deadlock.
25890Sstevel@tonic-gate 	 */
25900Sstevel@tonic-gate 	vfsp = (struct vfs *)ksp->ks_private;
25910Sstevel@tonic-gate 
25920Sstevel@tonic-gate 	mi = VFTOMI4(vfsp);
25930Sstevel@tonic-gate 	mik = (struct mntinfo_kstat *)ksp->ks_data;
25940Sstevel@tonic-gate 
25950Sstevel@tonic-gate 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
25960Sstevel@tonic-gate 
25970Sstevel@tonic-gate 	mik->mik_vers = (uint32_t)mi->mi_vers;
25980Sstevel@tonic-gate 	mik->mik_flags = mi->mi_flags;
25990Sstevel@tonic-gate 	/*
26000Sstevel@tonic-gate 	 * The sv_secdata holds the flavor the client specifies.
26010Sstevel@tonic-gate 	 * If the client uses default and a security negotiation
26020Sstevel@tonic-gate 	 * occurs, sv_currsec will point to the current flavor
26030Sstevel@tonic-gate 	 * selected from the server flavor list.
26040Sstevel@tonic-gate 	 * sv_currsec is NULL if no security negotiation takes place.
26050Sstevel@tonic-gate 	 */
26060Sstevel@tonic-gate 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
26075302Sth199096 	    mi->mi_curr_serv->sv_currsec->secmod :
26085302Sth199096 	    mi->mi_curr_serv->sv_secdata->secmod;
26090Sstevel@tonic-gate 	mik->mik_curread = (uint32_t)mi->mi_curread;
26100Sstevel@tonic-gate 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
26110Sstevel@tonic-gate 	mik->mik_retrans = mi->mi_retrans;
26120Sstevel@tonic-gate 	mik->mik_timeo = mi->mi_timeo;
26130Sstevel@tonic-gate 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
26140Sstevel@tonic-gate 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
26150Sstevel@tonic-gate 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
26160Sstevel@tonic-gate 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
26170Sstevel@tonic-gate 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
26180Sstevel@tonic-gate 	mik->mik_failover = (uint32_t)mi->mi_failover;
26190Sstevel@tonic-gate 	mik->mik_remap = (uint32_t)mi->mi_remap;
26200Sstevel@tonic-gate 
26210Sstevel@tonic-gate 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
26220Sstevel@tonic-gate 
26230Sstevel@tonic-gate 	return (0);
26240Sstevel@tonic-gate }
26250Sstevel@tonic-gate 
26260Sstevel@tonic-gate void
nfs4_mnt_kstat_init(struct vfs * vfsp)26270Sstevel@tonic-gate nfs4_mnt_kstat_init(struct vfs *vfsp)
26280Sstevel@tonic-gate {
26290Sstevel@tonic-gate 	mntinfo4_t *mi = VFTOMI4(vfsp);
26300Sstevel@tonic-gate 
26310Sstevel@tonic-gate 	/*
26320Sstevel@tonic-gate 	 * PSARC 2001/697 Contract Private Interface
26330Sstevel@tonic-gate 	 * All nfs kstats are under SunMC contract
26340Sstevel@tonic-gate 	 * Please refer to the PSARC listed above and contact
26350Sstevel@tonic-gate 	 * SunMC before making any changes!
26360Sstevel@tonic-gate 	 *
26370Sstevel@tonic-gate 	 * Changes must be reviewed by Solaris File Sharing
26380Sstevel@tonic-gate 	 * Changes must be communicated to contract-2001-697@sun.com
26390Sstevel@tonic-gate 	 *
26400Sstevel@tonic-gate 	 */
26410Sstevel@tonic-gate 
26420Sstevel@tonic-gate 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
26430Sstevel@tonic-gate 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
26440Sstevel@tonic-gate 	if (mi->mi_io_kstats) {
26450Sstevel@tonic-gate 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
26460Sstevel@tonic-gate 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
26470Sstevel@tonic-gate 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
26480Sstevel@tonic-gate 		kstat_install(mi->mi_io_kstats);
26490Sstevel@tonic-gate 	}
26500Sstevel@tonic-gate 
26510Sstevel@tonic-gate 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
26520Sstevel@tonic-gate 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
26530Sstevel@tonic-gate 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
26540Sstevel@tonic-gate 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
26550Sstevel@tonic-gate 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
26560Sstevel@tonic-gate 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
26570Sstevel@tonic-gate 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
26580Sstevel@tonic-gate 		kstat_install(mi->mi_ro_kstats);
26590Sstevel@tonic-gate 	}
26600Sstevel@tonic-gate 
26610Sstevel@tonic-gate 	nfs4_mnt_recov_kstat_init(vfsp);
26620Sstevel@tonic-gate }
26630Sstevel@tonic-gate 
26640Sstevel@tonic-gate void
nfs4_write_error(vnode_t * vp,int error,cred_t * cr)26650Sstevel@tonic-gate nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
26660Sstevel@tonic-gate {
26670Sstevel@tonic-gate 	mntinfo4_t *mi;
266811066Srafael.vanoni@sun.com 	clock_t now = ddi_get_lbolt();
26690Sstevel@tonic-gate 
26700Sstevel@tonic-gate 	mi = VTOMI4(vp);
26710Sstevel@tonic-gate 	/*
26720Sstevel@tonic-gate 	 * In case of forced unmount, do not print any messages
26730Sstevel@tonic-gate 	 * since it can flood the console with error messages.
26740Sstevel@tonic-gate 	 */
26750Sstevel@tonic-gate 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
26760Sstevel@tonic-gate 		return;
26770Sstevel@tonic-gate 
26780Sstevel@tonic-gate 	/*
26790Sstevel@tonic-gate 	 * If the mount point is dead, not recoverable, do not
26800Sstevel@tonic-gate 	 * print error messages that can flood the console.
26810Sstevel@tonic-gate 	 */
26820Sstevel@tonic-gate 	if (mi->mi_flags & MI4_RECOV_FAIL)
26830Sstevel@tonic-gate 		return;
26840Sstevel@tonic-gate 
26850Sstevel@tonic-gate 	/*
26860Sstevel@tonic-gate 	 * No use in flooding the console with ENOSPC
26870Sstevel@tonic-gate 	 * messages from the same file system.
26880Sstevel@tonic-gate 	 */
26890Sstevel@tonic-gate 	if ((error != ENOSPC && error != EDQUOT) ||
269011066Srafael.vanoni@sun.com 	    now - mi->mi_printftime > 0) {
26910Sstevel@tonic-gate 		zoneid_t zoneid = mi->mi_zone->zone_id;
26920Sstevel@tonic-gate 
26930Sstevel@tonic-gate #ifdef DEBUG
26940Sstevel@tonic-gate 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
26950Sstevel@tonic-gate 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
26960Sstevel@tonic-gate #else
26970Sstevel@tonic-gate 		nfs_perror(error, "NFS write error on host %s: %m.\n",
26980Sstevel@tonic-gate 		    VTOR4(vp)->r_server->sv_hostname, NULL);
26990Sstevel@tonic-gate #endif
27000Sstevel@tonic-gate 		if (error == ENOSPC || error == EDQUOT) {
27010Sstevel@tonic-gate 			zcmn_err(zoneid, CE_CONT,
27020Sstevel@tonic-gate 			    "^File: userid=%d, groupid=%d\n",
27030Sstevel@tonic-gate 			    crgetuid(cr), crgetgid(cr));
27040Sstevel@tonic-gate 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
27050Sstevel@tonic-gate 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
27060Sstevel@tonic-gate 				zcmn_err(zoneid, CE_CONT,
27070Sstevel@tonic-gate 				    "^User: userid=%d, groupid=%d\n",
27080Sstevel@tonic-gate 				    crgetuid(curthread->t_cred),
27090Sstevel@tonic-gate 				    crgetgid(curthread->t_cred));
27100Sstevel@tonic-gate 			}
271111066Srafael.vanoni@sun.com 			mi->mi_printftime = now +
27120Sstevel@tonic-gate 			    nfs_write_error_interval * hz;
27130Sstevel@tonic-gate 		}
27140Sstevel@tonic-gate 		sfh4_printfhandle(VTOR4(vp)->r_fh);
27150Sstevel@tonic-gate #ifdef DEBUG
27160Sstevel@tonic-gate 		if (error == EACCES) {
27170Sstevel@tonic-gate 			zcmn_err(zoneid, CE_CONT,
27180Sstevel@tonic-gate 			    "nfs_bio: cred is%s kcred\n",
27190Sstevel@tonic-gate 			    cr == kcred ? "" : " not");
27200Sstevel@tonic-gate 		}
27210Sstevel@tonic-gate #endif
27220Sstevel@tonic-gate 	}
27230Sstevel@tonic-gate }
27240Sstevel@tonic-gate 
27250Sstevel@tonic-gate /*
27260Sstevel@tonic-gate  * Return non-zero if the given file can be safely memory mapped.  Locks
27270Sstevel@tonic-gate  * are safe if whole-file (length and offset are both zero).
27280Sstevel@tonic-gate  */
27290Sstevel@tonic-gate 
27300Sstevel@tonic-gate #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
27310Sstevel@tonic-gate 
27320Sstevel@tonic-gate static int
nfs4_safemap(const vnode_t * vp)27330Sstevel@tonic-gate nfs4_safemap(const vnode_t *vp)
27340Sstevel@tonic-gate {
27350Sstevel@tonic-gate 	locklist_t	*llp, *next_llp;
27360Sstevel@tonic-gate 	int		safe = 1;
27370Sstevel@tonic-gate 	rnode4_t	*rp = VTOR4(vp);
27380Sstevel@tonic-gate 
27390Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
27400Sstevel@tonic-gate 
27410Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
27425302Sth199096 	    "vp = %p", (void *)vp));
27430Sstevel@tonic-gate 
27440Sstevel@tonic-gate 	/*
27450Sstevel@tonic-gate 	 * Review all the locks for the vnode, both ones that have been
27460Sstevel@tonic-gate 	 * acquired and ones that are pending.  We assume that
27470Sstevel@tonic-gate 	 * flk_active_locks_for_vp() has merged any locks that can be
27480Sstevel@tonic-gate 	 * merged (so that if a process has the entire file locked, it is
27490Sstevel@tonic-gate 	 * represented as a single lock).
27500Sstevel@tonic-gate 	 *
27510Sstevel@tonic-gate 	 * Note that we can't bail out of the loop if we find a non-safe
27520Sstevel@tonic-gate 	 * lock, because we have to free all the elements in the llp list.
27530Sstevel@tonic-gate 	 * We might be able to speed up this code slightly by not looking
27540Sstevel@tonic-gate 	 * at each lock's l_start and l_len fields once we've found a
27550Sstevel@tonic-gate 	 * non-safe lock.
27560Sstevel@tonic-gate 	 */
27570Sstevel@tonic-gate 
27580Sstevel@tonic-gate 	llp = flk_active_locks_for_vp(vp);
27590Sstevel@tonic-gate 	while (llp) {
27600Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
27610Sstevel@tonic-gate 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
27620Sstevel@tonic-gate 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
27630Sstevel@tonic-gate 		if (!SAFE_LOCK(llp->ll_flock)) {
27640Sstevel@tonic-gate 			safe = 0;
27650Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
27660Sstevel@tonic-gate 			    "nfs4_safemap: unsafe active lock (%" PRId64
27670Sstevel@tonic-gate 			    ", %" PRId64 ")", llp->ll_flock.l_start,
27680Sstevel@tonic-gate 			    llp->ll_flock.l_len));
27690Sstevel@tonic-gate 		}
27700Sstevel@tonic-gate 		next_llp = llp->ll_next;
27710Sstevel@tonic-gate 		VN_RELE(llp->ll_vp);
27720Sstevel@tonic-gate 		kmem_free(llp, sizeof (*llp));
27730Sstevel@tonic-gate 		llp = next_llp;
27740Sstevel@tonic-gate 	}
27750Sstevel@tonic-gate 
27760Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
27775302Sth199096 	    safe ? "safe" : "unsafe"));
27780Sstevel@tonic-gate 	return (safe);
27790Sstevel@tonic-gate }
27800Sstevel@tonic-gate 
27810Sstevel@tonic-gate /*
27820Sstevel@tonic-gate  * Return whether there is a lost LOCK or LOCKU queued up for the given
27830Sstevel@tonic-gate  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
27840Sstevel@tonic-gate  */
27850Sstevel@tonic-gate 
27860Sstevel@tonic-gate bool_t
nfs4_map_lost_lock_conflict(vnode_t * vp)27870Sstevel@tonic-gate nfs4_map_lost_lock_conflict(vnode_t *vp)
27880Sstevel@tonic-gate {
27890Sstevel@tonic-gate 	bool_t conflict = FALSE;
27900Sstevel@tonic-gate 	nfs4_lost_rqst_t *lrp;
27910Sstevel@tonic-gate 	mntinfo4_t *mi = VTOMI4(vp);
27920Sstevel@tonic-gate 
27930Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
27940Sstevel@tonic-gate 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
27950Sstevel@tonic-gate 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
27960Sstevel@tonic-gate 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
27970Sstevel@tonic-gate 			continue;
27980Sstevel@tonic-gate 		ASSERT(lrp->lr_vp != NULL);
27995331Samw 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
28000Sstevel@tonic-gate 			continue;	/* different file */
28010Sstevel@tonic-gate 		if (!SAFE_LOCK(*lrp->lr_flk)) {
28020Sstevel@tonic-gate 			conflict = TRUE;
28030Sstevel@tonic-gate 			break;
28040Sstevel@tonic-gate 		}
28050Sstevel@tonic-gate 	}
28060Sstevel@tonic-gate 
28070Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
28080Sstevel@tonic-gate 	return (conflict);
28090Sstevel@tonic-gate }
28100Sstevel@tonic-gate 
28110Sstevel@tonic-gate /*
28120Sstevel@tonic-gate  * nfs_lockcompletion:
28130Sstevel@tonic-gate  *
28140Sstevel@tonic-gate  * If the vnode has a lock that makes it unsafe to cache the file, mark it
28150Sstevel@tonic-gate  * as non cachable (set VNOCACHE bit).
28160Sstevel@tonic-gate  */
28170Sstevel@tonic-gate 
28180Sstevel@tonic-gate void
nfs4_lockcompletion(vnode_t * vp,int cmd)28190Sstevel@tonic-gate nfs4_lockcompletion(vnode_t *vp, int cmd)
28200Sstevel@tonic-gate {
28210Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
28220Sstevel@tonic-gate 
28230Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
28240Sstevel@tonic-gate 	ASSERT(!IS_SHADOW(vp, rp));
28250Sstevel@tonic-gate 
28260Sstevel@tonic-gate 	if (cmd == F_SETLK || cmd == F_SETLKW) {
28270Sstevel@tonic-gate 
28280Sstevel@tonic-gate 		if (!nfs4_safemap(vp)) {
28290Sstevel@tonic-gate 			mutex_enter(&vp->v_lock);
28300Sstevel@tonic-gate 			vp->v_flag |= VNOCACHE;
28310Sstevel@tonic-gate 			mutex_exit(&vp->v_lock);
28320Sstevel@tonic-gate 		} else {
28330Sstevel@tonic-gate 			mutex_enter(&vp->v_lock);
28340Sstevel@tonic-gate 			vp->v_flag &= ~VNOCACHE;
28350Sstevel@tonic-gate 			mutex_exit(&vp->v_lock);
28360Sstevel@tonic-gate 		}
28370Sstevel@tonic-gate 	}
28380Sstevel@tonic-gate 	/*
28390Sstevel@tonic-gate 	 * The cached attributes of the file are stale after acquiring
28400Sstevel@tonic-gate 	 * the lock on the file. They were updated when the file was
28410Sstevel@tonic-gate 	 * opened, but not updated when the lock was acquired. Therefore the
28420Sstevel@tonic-gate 	 * cached attributes are invalidated after the lock is obtained.
28430Sstevel@tonic-gate 	 */
28440Sstevel@tonic-gate 	PURGE_ATTRCACHE4(vp);
28450Sstevel@tonic-gate }
28460Sstevel@tonic-gate 
28470Sstevel@tonic-gate /* ARGSUSED */
28480Sstevel@tonic-gate static void *
nfs4_mi_init(zoneid_t zoneid)28490Sstevel@tonic-gate nfs4_mi_init(zoneid_t zoneid)
28500Sstevel@tonic-gate {
28510Sstevel@tonic-gate 	struct mi4_globals *mig;
28520Sstevel@tonic-gate 
28530Sstevel@tonic-gate 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
28540Sstevel@tonic-gate 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
28550Sstevel@tonic-gate 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
28560Sstevel@tonic-gate 	    offsetof(mntinfo4_t, mi_zone_node));
28570Sstevel@tonic-gate 	mig->mig_destructor_called = B_FALSE;
28580Sstevel@tonic-gate 	return (mig);
28590Sstevel@tonic-gate }
28600Sstevel@tonic-gate 
28610Sstevel@tonic-gate /*
28620Sstevel@tonic-gate  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
28630Sstevel@tonic-gate  * state and killing off threads.
28640Sstevel@tonic-gate  */
28650Sstevel@tonic-gate /* ARGSUSED */
28660Sstevel@tonic-gate static void
nfs4_mi_shutdown(zoneid_t zoneid,void * data)28670Sstevel@tonic-gate nfs4_mi_shutdown(zoneid_t zoneid, void *data)
28680Sstevel@tonic-gate {
28690Sstevel@tonic-gate 	struct mi4_globals *mig = data;
28700Sstevel@tonic-gate 	mntinfo4_t *mi;
28710Sstevel@tonic-gate 	nfs4_server_t *np;
28720Sstevel@tonic-gate 
28730Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
28740Sstevel@tonic-gate 	    "nfs4_mi_shutdown zone %d\n", zoneid));
28750Sstevel@tonic-gate 	ASSERT(mig != NULL);
28761705Sjwahlig 	for (;;) {
28771705Sjwahlig 		mutex_enter(&mig->mig_lock);
28781705Sjwahlig 		mi = list_head(&mig->mig_list);
28791705Sjwahlig 		if (mi == NULL) {
28801705Sjwahlig 			mutex_exit(&mig->mig_lock);
28811705Sjwahlig 			break;
28821705Sjwahlig 		}
28831705Sjwahlig 
28840Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
28850Sstevel@tonic-gate 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
28860Sstevel@tonic-gate 		/*
28870Sstevel@tonic-gate 		 * purge the DNLC for this filesystem
28880Sstevel@tonic-gate 		 */
28890Sstevel@tonic-gate 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
28900Sstevel@tonic-gate 		/*
28910Sstevel@tonic-gate 		 * Tell existing async worker threads to exit.
28920Sstevel@tonic-gate 		 */
28931705Sjwahlig 		mutex_enter(&mi->mi_async_lock);
28940Sstevel@tonic-gate 		mi->mi_max_threads = 0;
289511507SVallish.Vaidyeshwara@Sun.COM 		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
28960Sstevel@tonic-gate 		/*
28971705Sjwahlig 		 * Set the appropriate flags, signal and wait for both the
28981705Sjwahlig 		 * async manager and the inactive thread to exit when they're
28991705Sjwahlig 		 * done with their current work.
29000Sstevel@tonic-gate 		 */
29010Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
29020Sstevel@tonic-gate 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
29030Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
29040Sstevel@tonic-gate 		mutex_exit(&mi->mi_async_lock);
29051705Sjwahlig 		if (mi->mi_manager_thread) {
29061705Sjwahlig 			nfs4_async_manager_stop(mi->mi_vfsp);
29071705Sjwahlig 		}
29081705Sjwahlig 		if (mi->mi_inactive_thread) {
29091705Sjwahlig 			mutex_enter(&mi->mi_async_lock);
29101705Sjwahlig 			cv_signal(&mi->mi_inact_req_cv);
29111705Sjwahlig 			/*
29121705Sjwahlig 			 * Wait for the inactive thread to exit.
29131705Sjwahlig 			 */
29141705Sjwahlig 			while (mi->mi_inactive_thread != NULL) {
29151705Sjwahlig 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
29161705Sjwahlig 			}
29171705Sjwahlig 			mutex_exit(&mi->mi_async_lock);
29181705Sjwahlig 		}
2919264Sthurlow 		/*
29201705Sjwahlig 		 * Wait for the recovery thread to complete, that is, it will
29211705Sjwahlig 		 * signal when it is done using the "mi" structure and about
29221705Sjwahlig 		 * to exit
29231705Sjwahlig 		 */
29241705Sjwahlig 		mutex_enter(&mi->mi_lock);
29251705Sjwahlig 		while (mi->mi_in_recovery > 0)
29261705Sjwahlig 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
29271705Sjwahlig 		mutex_exit(&mi->mi_lock);
29281705Sjwahlig 		/*
2929264Sthurlow 		 * We're done when every mi has been done or the list is empty.
29301705Sjwahlig 		 * This one is done, remove it from the list.
2931264Sthurlow 		 */
29321705Sjwahlig 		list_remove(&mig->mig_list, mi);
2933264Sthurlow 		mutex_exit(&mig->mig_lock);
2934*13096SJordan.Vaughan@Sun.com 		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2935*13096SJordan.Vaughan@Sun.com 
29361705Sjwahlig 		/*
29371705Sjwahlig 		 * Release hold on vfs and mi done to prevent race with zone
29381705Sjwahlig 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
29391705Sjwahlig 		 */
2940264Sthurlow 		VFS_RELE(mi->mi_vfsp);
29411705Sjwahlig 		MI4_RELE(mi);
29420Sstevel@tonic-gate 	}
29430Sstevel@tonic-gate 	/*
29440Sstevel@tonic-gate 	 * Tell each renew thread in the zone to exit
29450Sstevel@tonic-gate 	 */
29460Sstevel@tonic-gate 	mutex_enter(&nfs4_server_lst_lock);
29470Sstevel@tonic-gate 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
29480Sstevel@tonic-gate 		mutex_enter(&np->s_lock);
29490Sstevel@tonic-gate 		if (np->zoneid == zoneid) {
29500Sstevel@tonic-gate 			/*
29510Sstevel@tonic-gate 			 * We add another hold onto the nfs4_server_t
29520Sstevel@tonic-gate 			 * because this will make sure tha the nfs4_server_t
29530Sstevel@tonic-gate 			 * stays around until nfs4_callback_fini_zone destroys
29540Sstevel@tonic-gate 			 * the zone. This way, the renew thread can
29550Sstevel@tonic-gate 			 * unconditionally release its holds on the
29560Sstevel@tonic-gate 			 * nfs4_server_t.
29570Sstevel@tonic-gate 			 */
29580Sstevel@tonic-gate 			np->s_refcnt++;
29590Sstevel@tonic-gate 			nfs4_mark_srv_dead(np);
29600Sstevel@tonic-gate 		}
29610Sstevel@tonic-gate 		mutex_exit(&np->s_lock);
29620Sstevel@tonic-gate 	}
29630Sstevel@tonic-gate 	mutex_exit(&nfs4_server_lst_lock);
29640Sstevel@tonic-gate }
29650Sstevel@tonic-gate 
29660Sstevel@tonic-gate static void
nfs4_mi_free_globals(struct mi4_globals * mig)29670Sstevel@tonic-gate nfs4_mi_free_globals(struct mi4_globals *mig)
29680Sstevel@tonic-gate {
29690Sstevel@tonic-gate 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
29700Sstevel@tonic-gate 	mutex_destroy(&mig->mig_lock);
29710Sstevel@tonic-gate 	kmem_free(mig, sizeof (*mig));
29720Sstevel@tonic-gate }
29730Sstevel@tonic-gate 
29740Sstevel@tonic-gate /* ARGSUSED */
29750Sstevel@tonic-gate static void
nfs4_mi_destroy(zoneid_t zoneid,void * data)29760Sstevel@tonic-gate nfs4_mi_destroy(zoneid_t zoneid, void *data)
29770Sstevel@tonic-gate {
29780Sstevel@tonic-gate 	struct mi4_globals *mig = data;
29790Sstevel@tonic-gate 
29800Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
29810Sstevel@tonic-gate 	    "nfs4_mi_destroy zone %d\n", zoneid));
29820Sstevel@tonic-gate 	ASSERT(mig != NULL);
29830Sstevel@tonic-gate 	mutex_enter(&mig->mig_lock);
29840Sstevel@tonic-gate 	if (list_head(&mig->mig_list) != NULL) {
29850Sstevel@tonic-gate 		/* Still waiting for VFS_FREEVFS() */
29860Sstevel@tonic-gate 		mig->mig_destructor_called = B_TRUE;
29870Sstevel@tonic-gate 		mutex_exit(&mig->mig_lock);
29880Sstevel@tonic-gate 		return;
29890Sstevel@tonic-gate 	}
29900Sstevel@tonic-gate 	nfs4_mi_free_globals(mig);
29910Sstevel@tonic-gate }
29920Sstevel@tonic-gate 
29930Sstevel@tonic-gate /*
29940Sstevel@tonic-gate  * Add an NFS mount to the per-zone list of NFS mounts.
29950Sstevel@tonic-gate  */
29960Sstevel@tonic-gate void
nfs4_mi_zonelist_add(mntinfo4_t * mi)29970Sstevel@tonic-gate nfs4_mi_zonelist_add(mntinfo4_t *mi)
29980Sstevel@tonic-gate {
29990Sstevel@tonic-gate 	struct mi4_globals *mig;
30000Sstevel@tonic-gate 
30010Sstevel@tonic-gate 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
30020Sstevel@tonic-gate 	mutex_enter(&mig->mig_lock);
30030Sstevel@tonic-gate 	list_insert_head(&mig->mig_list, mi);
30041705Sjwahlig 	/*
30051705Sjwahlig 	 * hold added to eliminate race with zone shutdown -this will be
30061705Sjwahlig 	 * released in mi_shutdown
30071705Sjwahlig 	 */
30081705Sjwahlig 	MI4_HOLD(mi);
30091705Sjwahlig 	VFS_HOLD(mi->mi_vfsp);
30100Sstevel@tonic-gate 	mutex_exit(&mig->mig_lock);
30110Sstevel@tonic-gate }
30120Sstevel@tonic-gate 
30130Sstevel@tonic-gate /*
30140Sstevel@tonic-gate  * Remove an NFS mount from the per-zone list of NFS mounts.
30150Sstevel@tonic-gate  */
30161705Sjwahlig int
nfs4_mi_zonelist_remove(mntinfo4_t * mi)30170Sstevel@tonic-gate nfs4_mi_zonelist_remove(mntinfo4_t *mi)
30180Sstevel@tonic-gate {
30190Sstevel@tonic-gate 	struct mi4_globals *mig;
30201705Sjwahlig 	int ret = 0;
30210Sstevel@tonic-gate 
30220Sstevel@tonic-gate 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
30230Sstevel@tonic-gate 	mutex_enter(&mig->mig_lock);
30241705Sjwahlig 	mutex_enter(&mi->mi_lock);
30251705Sjwahlig 	/* if this mi is marked dead, then the zone already released it */
30261705Sjwahlig 	if (!(mi->mi_flags & MI4_DEAD)) {
30271705Sjwahlig 		list_remove(&mig->mig_list, mi);
30288976SJames.Wahlig@Sun.COM 		mutex_exit(&mi->mi_lock);
30291705Sjwahlig 
30301705Sjwahlig 		/* release the holds put on in zonelist_add(). */
30311705Sjwahlig 		VFS_RELE(mi->mi_vfsp);
30321705Sjwahlig 		MI4_RELE(mi);
30331705Sjwahlig 		ret = 1;
30348976SJames.Wahlig@Sun.COM 	} else {
30358976SJames.Wahlig@Sun.COM 		mutex_exit(&mi->mi_lock);
30361705Sjwahlig 	}
30371705Sjwahlig 
30380Sstevel@tonic-gate 	/*
30390Sstevel@tonic-gate 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
30400Sstevel@tonic-gate 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
30410Sstevel@tonic-gate 	 * mi globals.
30420Sstevel@tonic-gate 	 */
30430Sstevel@tonic-gate 	if (list_head(&mig->mig_list) == NULL &&
30440Sstevel@tonic-gate 	    mig->mig_destructor_called == B_TRUE) {
30450Sstevel@tonic-gate 		nfs4_mi_free_globals(mig);
30461705Sjwahlig 		return (ret);
30470Sstevel@tonic-gate 	}
30480Sstevel@tonic-gate 	mutex_exit(&mig->mig_lock);
30491705Sjwahlig 	return (ret);
30500Sstevel@tonic-gate }
30510Sstevel@tonic-gate 
30520Sstevel@tonic-gate void
nfs_free_mi4(mntinfo4_t * mi)30530Sstevel@tonic-gate nfs_free_mi4(mntinfo4_t *mi)
30540Sstevel@tonic-gate {
30550Sstevel@tonic-gate 	nfs4_open_owner_t	*foop;
30561705Sjwahlig 	nfs4_oo_hash_bucket_t   *bucketp;
30570Sstevel@tonic-gate 	nfs4_debug_msg_t	*msgp;
30580Sstevel@tonic-gate 	int i;
30591705Sjwahlig 	servinfo4_t 		*svp;
30601705Sjwahlig 
306110487SVallish.Vaidyeshwara@Sun.COM 	/*
306210487SVallish.Vaidyeshwara@Sun.COM 	 * Code introduced here should be carefully evaluated to make
306310487SVallish.Vaidyeshwara@Sun.COM 	 * sure none of the freed resources are accessed either directly
306410487SVallish.Vaidyeshwara@Sun.COM 	 * or indirectly after freeing them. For eg: Introducing calls to
306510487SVallish.Vaidyeshwara@Sun.COM 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
306610487SVallish.Vaidyeshwara@Sun.COM 	 * the structure members or other routines calling back into NFS
306710487SVallish.Vaidyeshwara@Sun.COM 	 * accessing freed mntinfo4_t structure member.
306810487SVallish.Vaidyeshwara@Sun.COM 	 */
30690Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
30700Sstevel@tonic-gate 	ASSERT(mi->mi_recovthread == NULL);
30710Sstevel@tonic-gate 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
30720Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
30730Sstevel@tonic-gate 	mutex_enter(&mi->mi_async_lock);
307411507SVallish.Vaidyeshwara@Sun.COM 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
307511507SVallish.Vaidyeshwara@Sun.COM 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
30760Sstevel@tonic-gate 	ASSERT(mi->mi_manager_thread == NULL);
30771705Sjwahlig 	mutex_exit(&mi->mi_async_lock);
30781705Sjwahlig 	if (mi->mi_io_kstats) {
30791705Sjwahlig 		kstat_delete(mi->mi_io_kstats);
30801705Sjwahlig 		mi->mi_io_kstats = NULL;
30810Sstevel@tonic-gate 	}
30821705Sjwahlig 	if (mi->mi_ro_kstats) {
30831705Sjwahlig 		kstat_delete(mi->mi_ro_kstats);
30841705Sjwahlig 		mi->mi_ro_kstats = NULL;
30850Sstevel@tonic-gate 	}
30861705Sjwahlig 	if (mi->mi_recov_ksp) {
30871705Sjwahlig 		kstat_delete(mi->mi_recov_ksp);
30881705Sjwahlig 		mi->mi_recov_ksp = NULL;
30891705Sjwahlig 	}
30900Sstevel@tonic-gate 	mutex_enter(&mi->mi_msg_list_lock);
30910Sstevel@tonic-gate 	while (msgp = list_head(&mi->mi_msg_list)) {
30920Sstevel@tonic-gate 		list_remove(&mi->mi_msg_list, msgp);
30930Sstevel@tonic-gate 		nfs4_free_msg(msgp);
30940Sstevel@tonic-gate 	}
30950Sstevel@tonic-gate 	mutex_exit(&mi->mi_msg_list_lock);
30960Sstevel@tonic-gate 	list_destroy(&mi->mi_msg_list);
30977902SNagakiran.Rajashekar@Sun.COM 	if (mi->mi_fname != NULL)
30987902SNagakiran.Rajashekar@Sun.COM 		fn_rele(&mi->mi_fname);
30990Sstevel@tonic-gate 	if (mi->mi_rootfh != NULL)
31000Sstevel@tonic-gate 		sfh4_rele(&mi->mi_rootfh);
31010Sstevel@tonic-gate 	if (mi->mi_srvparentfh != NULL)
31020Sstevel@tonic-gate 		sfh4_rele(&mi->mi_srvparentfh);
310310487SVallish.Vaidyeshwara@Sun.COM 	svp = mi->mi_servers;
310410487SVallish.Vaidyeshwara@Sun.COM 	sv4_free(svp);
31050Sstevel@tonic-gate 	mutex_destroy(&mi->mi_lock);
31060Sstevel@tonic-gate 	mutex_destroy(&mi->mi_async_lock);
31070Sstevel@tonic-gate 	mutex_destroy(&mi->mi_msg_list_lock);
31080Sstevel@tonic-gate 	nfs_rw_destroy(&mi->mi_recovlock);
31090Sstevel@tonic-gate 	nfs_rw_destroy(&mi->mi_rename_lock);
31100Sstevel@tonic-gate 	nfs_rw_destroy(&mi->mi_fh_lock);
31110Sstevel@tonic-gate 	cv_destroy(&mi->mi_failover_cv);
31120Sstevel@tonic-gate 	cv_destroy(&mi->mi_async_reqs_cv);
311311507SVallish.Vaidyeshwara@Sun.COM 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
311411507SVallish.Vaidyeshwara@Sun.COM 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
31150Sstevel@tonic-gate 	cv_destroy(&mi->mi_async_cv);
31160Sstevel@tonic-gate 	cv_destroy(&mi->mi_inact_req_cv);
31170Sstevel@tonic-gate 	/*
31180Sstevel@tonic-gate 	 * Destroy the oo hash lists and mutexes for the cred hash table.
31190Sstevel@tonic-gate 	 */
31200Sstevel@tonic-gate 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
31210Sstevel@tonic-gate 		bucketp = &(mi->mi_oo_list[i]);
31220Sstevel@tonic-gate 		/* Destroy any remaining open owners on the list */
31230Sstevel@tonic-gate 		foop = list_head(&bucketp->b_oo_hash_list);
31240Sstevel@tonic-gate 		while (foop != NULL) {
31250Sstevel@tonic-gate 			list_remove(&bucketp->b_oo_hash_list, foop);
31260Sstevel@tonic-gate 			nfs4_destroy_open_owner(foop);
31270Sstevel@tonic-gate 			foop = list_head(&bucketp->b_oo_hash_list);
31280Sstevel@tonic-gate 		}
31290Sstevel@tonic-gate 		list_destroy(&bucketp->b_oo_hash_list);
31300Sstevel@tonic-gate 		mutex_destroy(&bucketp->b_lock);
31310Sstevel@tonic-gate 	}
31320Sstevel@tonic-gate 	/*
31330Sstevel@tonic-gate 	 * Empty and destroy the freed open owner list.
31340Sstevel@tonic-gate 	 */
31350Sstevel@tonic-gate 	foop = list_head(&mi->mi_foo_list);
31360Sstevel@tonic-gate 	while (foop != NULL) {
31370Sstevel@tonic-gate 		list_remove(&mi->mi_foo_list, foop);
31380Sstevel@tonic-gate 		nfs4_destroy_open_owner(foop);
31390Sstevel@tonic-gate 		foop = list_head(&mi->mi_foo_list);
31400Sstevel@tonic-gate 	}
31410Sstevel@tonic-gate 	list_destroy(&mi->mi_foo_list);
31420Sstevel@tonic-gate 	list_destroy(&mi->mi_bseqid_list);
31430Sstevel@tonic-gate 	list_destroy(&mi->mi_lost_state);
31440Sstevel@tonic-gate 	avl_destroy(&mi->mi_filehandles);
31450Sstevel@tonic-gate 	kmem_free(mi, sizeof (*mi));
31460Sstevel@tonic-gate }
31471705Sjwahlig void
mi_hold(mntinfo4_t * mi)31481705Sjwahlig mi_hold(mntinfo4_t *mi)
31491705Sjwahlig {
31501705Sjwahlig 	atomic_add_32(&mi->mi_count, 1);
31511705Sjwahlig 	ASSERT(mi->mi_count != 0);
31521705Sjwahlig }
31531705Sjwahlig 
31541705Sjwahlig void
mi_rele(mntinfo4_t * mi)31551705Sjwahlig mi_rele(mntinfo4_t *mi)
31561705Sjwahlig {
31571705Sjwahlig 	ASSERT(mi->mi_count != 0);
31581705Sjwahlig 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
31591705Sjwahlig 		nfs_free_mi4(mi);
31601705Sjwahlig 	}
31611705Sjwahlig }
31620Sstevel@tonic-gate 
31630Sstevel@tonic-gate vnode_t    nfs4_xattr_notsupp_vnode;
31640Sstevel@tonic-gate 
31650Sstevel@tonic-gate void
nfs4_clnt_init(void)31660Sstevel@tonic-gate nfs4_clnt_init(void)
31670Sstevel@tonic-gate {
31680Sstevel@tonic-gate 	nfs4_vnops_init();
31690Sstevel@tonic-gate 	(void) nfs4_rnode_init();
31700Sstevel@tonic-gate 	(void) nfs4_shadow_init();
31710Sstevel@tonic-gate 	(void) nfs4_acache_init();
31720Sstevel@tonic-gate 	(void) nfs4_subr_init();
31730Sstevel@tonic-gate 	nfs4_acl_init();
31740Sstevel@tonic-gate 	nfs_idmap_init();
31750Sstevel@tonic-gate 	nfs4_callback_init();
31760Sstevel@tonic-gate 	nfs4_secinfo_init();
31770Sstevel@tonic-gate #ifdef	DEBUG
31780Sstevel@tonic-gate 	tsd_create(&nfs4_tsd_key, NULL);
31790Sstevel@tonic-gate #endif
31800Sstevel@tonic-gate 
31810Sstevel@tonic-gate 	/*
31820Sstevel@tonic-gate 	 * Add a CPR callback so that we can update client
31830Sstevel@tonic-gate 	 * lease after a suspend and resume.
31840Sstevel@tonic-gate 	 */
31850Sstevel@tonic-gate 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
31860Sstevel@tonic-gate 
31870Sstevel@tonic-gate 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
31880Sstevel@tonic-gate 	    nfs4_mi_destroy);
31890Sstevel@tonic-gate 
31900Sstevel@tonic-gate 	/*
31910Sstevel@tonic-gate 	 * Initialise the reference count of the notsupp xattr cache vnode to 1
31920Sstevel@tonic-gate 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
31930Sstevel@tonic-gate 	 */
31940Sstevel@tonic-gate 	nfs4_xattr_notsupp_vnode.v_count = 1;
31950Sstevel@tonic-gate }
31960Sstevel@tonic-gate 
31970Sstevel@tonic-gate void
nfs4_clnt_fini(void)31980Sstevel@tonic-gate nfs4_clnt_fini(void)
31990Sstevel@tonic-gate {
32000Sstevel@tonic-gate 	(void) zone_key_delete(mi4_list_key);
32010Sstevel@tonic-gate 	nfs4_vnops_fini();
32020Sstevel@tonic-gate 	(void) nfs4_rnode_fini();
32030Sstevel@tonic-gate 	(void) nfs4_shadow_fini();
32040Sstevel@tonic-gate 	(void) nfs4_acache_fini();
32050Sstevel@tonic-gate 	(void) nfs4_subr_fini();
32060Sstevel@tonic-gate 	nfs_idmap_fini();
32070Sstevel@tonic-gate 	nfs4_callback_fini();
32080Sstevel@tonic-gate 	nfs4_secinfo_fini();
32090Sstevel@tonic-gate #ifdef	DEBUG
32100Sstevel@tonic-gate 	tsd_destroy(&nfs4_tsd_key);
32110Sstevel@tonic-gate #endif
32120Sstevel@tonic-gate 	if (cid)
32130Sstevel@tonic-gate 		(void) callb_delete(cid);
32140Sstevel@tonic-gate }
32150Sstevel@tonic-gate 
32160Sstevel@tonic-gate /*ARGSUSED*/
32170Sstevel@tonic-gate static boolean_t
nfs4_client_cpr_callb(void * arg,int code)32180Sstevel@tonic-gate nfs4_client_cpr_callb(void *arg, int code)
32190Sstevel@tonic-gate {
32200Sstevel@tonic-gate 	/*
32210Sstevel@tonic-gate 	 * We get called for Suspend and Resume events.
32220Sstevel@tonic-gate 	 * For the suspend case we simply don't care!
32230Sstevel@tonic-gate 	 */
32240Sstevel@tonic-gate 	if (code == CB_CODE_CPR_CHKPT) {
32250Sstevel@tonic-gate 		return (B_TRUE);
32260Sstevel@tonic-gate 	}
32270Sstevel@tonic-gate 
32280Sstevel@tonic-gate 	/*
32290Sstevel@tonic-gate 	 * When we get to here we are in the process of
32300Sstevel@tonic-gate 	 * resuming the system from a previous suspend.
32310Sstevel@tonic-gate 	 */
32320Sstevel@tonic-gate 	nfs4_client_resumed = gethrestime_sec();
32330Sstevel@tonic-gate 	return (B_TRUE);
32340Sstevel@tonic-gate }
32350Sstevel@tonic-gate 
32360Sstevel@tonic-gate void
nfs4_renew_lease_thread(nfs4_server_t * sp)32370Sstevel@tonic-gate nfs4_renew_lease_thread(nfs4_server_t *sp)
32380Sstevel@tonic-gate {
32390Sstevel@tonic-gate 	int	error = 0;
32400Sstevel@tonic-gate 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
32410Sstevel@tonic-gate 	clock_t	tick_delay = 0;
32420Sstevel@tonic-gate 	clock_t time_left = 0;
32430Sstevel@tonic-gate 	callb_cpr_t cpr_info;
32440Sstevel@tonic-gate 	kmutex_t cpr_lock;
32450Sstevel@tonic-gate 
32460Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
32475302Sth199096 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
32480Sstevel@tonic-gate 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
32490Sstevel@tonic-gate 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
32500Sstevel@tonic-gate 
32510Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
32520Sstevel@tonic-gate 	/* sp->s_lease_time is set via a GETATTR */
32530Sstevel@tonic-gate 	sp->last_renewal_time = gethrestime_sec();
32540Sstevel@tonic-gate 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
32550Sstevel@tonic-gate 	ASSERT(sp->s_refcnt >= 1);
32560Sstevel@tonic-gate 
32570Sstevel@tonic-gate 	for (;;) {
32580Sstevel@tonic-gate 		if (!sp->state_ref_count ||
32595302Sth199096 		    sp->lease_valid != NFS4_LEASE_VALID) {
32600Sstevel@tonic-gate 
32610Sstevel@tonic-gate 			kip_secs = MAX((sp->s_lease_time >> 1) -
32625302Sth199096 			    (3 * sp->propagation_delay.tv_sec), 1);
32630Sstevel@tonic-gate 
32640Sstevel@tonic-gate 			tick_delay = SEC_TO_TICK(kip_secs);
32650Sstevel@tonic-gate 
32660Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
32675302Sth199096 			    "nfs4_renew_lease_thread: no renew : thread "
32685302Sth199096 			    "wait %ld secs", kip_secs));
32690Sstevel@tonic-gate 
32700Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
32715302Sth199096 			    "nfs4_renew_lease_thread: no renew : "
32725302Sth199096 			    "state_ref_count %d, lease_valid %d",
32735302Sth199096 			    sp->state_ref_count, sp->lease_valid));
32740Sstevel@tonic-gate 
32750Sstevel@tonic-gate 			mutex_enter(&cpr_lock);
32760Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
32770Sstevel@tonic-gate 			mutex_exit(&cpr_lock);
327811066Srafael.vanoni@sun.com 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
327911066Srafael.vanoni@sun.com 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
32800Sstevel@tonic-gate 			mutex_enter(&cpr_lock);
32810Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
32820Sstevel@tonic-gate 			mutex_exit(&cpr_lock);
32830Sstevel@tonic-gate 
32840Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
32855302Sth199096 			    "nfs4_renew_lease_thread: no renew: "
32865302Sth199096 			    "time left %ld", time_left));
32870Sstevel@tonic-gate 
32880Sstevel@tonic-gate 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
32890Sstevel@tonic-gate 				goto die;
32900Sstevel@tonic-gate 			continue;
32910Sstevel@tonic-gate 		}
32920Sstevel@tonic-gate 
32930Sstevel@tonic-gate 		tmp_last_renewal_time = sp->last_renewal_time;
32940Sstevel@tonic-gate 
32950Sstevel@tonic-gate 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
32965302Sth199096 		    (3 * sp->propagation_delay.tv_sec);
32970Sstevel@tonic-gate 
32980Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
32995302Sth199096 		    "nfs4_renew_lease_thread: tmp_time %ld, "
33005302Sth199096 		    "sp->last_renewal_time %ld", tmp_time,
33015302Sth199096 		    sp->last_renewal_time));
33020Sstevel@tonic-gate 
33030Sstevel@tonic-gate 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
33040Sstevel@tonic-gate 
33050Sstevel@tonic-gate 		tick_delay = SEC_TO_TICK(kip_secs);
33060Sstevel@tonic-gate 
33070Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
33085302Sth199096 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
33095302Sth199096 		    "secs", kip_secs));
33100Sstevel@tonic-gate 
33110Sstevel@tonic-gate 		mutex_enter(&cpr_lock);
33120Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
33130Sstevel@tonic-gate 		mutex_exit(&cpr_lock);
331411066Srafael.vanoni@sun.com 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
331511066Srafael.vanoni@sun.com 		    tick_delay, TR_CLOCK_TICK);
33160Sstevel@tonic-gate 		mutex_enter(&cpr_lock);
33170Sstevel@tonic-gate 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
33180Sstevel@tonic-gate 		mutex_exit(&cpr_lock);
33190Sstevel@tonic-gate 
33200Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
33215302Sth199096 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
33225302Sth199096 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
33235302Sth199096 		    "tmp_last_renewal_time %ld", time_left,
33245302Sth199096 		    sp->last_renewal_time, nfs4_client_resumed,
33255302Sth199096 		    tmp_last_renewal_time));
33260Sstevel@tonic-gate 
33270Sstevel@tonic-gate 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
33280Sstevel@tonic-gate 			goto die;
33290Sstevel@tonic-gate 
33300Sstevel@tonic-gate 		if (tmp_last_renewal_time == sp->last_renewal_time ||
33315302Sth199096 		    (nfs4_client_resumed != 0 &&
33325302Sth199096 		    nfs4_client_resumed > sp->last_renewal_time)) {
33330Sstevel@tonic-gate 			/*
33340Sstevel@tonic-gate 			 * Issue RENEW op since we haven't renewed the lease
33350Sstevel@tonic-gate 			 * since we slept.
33360Sstevel@tonic-gate 			 */
33370Sstevel@tonic-gate 			tmp_now_time = gethrestime_sec();
33380Sstevel@tonic-gate 			error = nfs4renew(sp);
33390Sstevel@tonic-gate 			/*
33400Sstevel@tonic-gate 			 * Need to re-acquire sp's lock, nfs4renew()
33410Sstevel@tonic-gate 			 * relinqueshes it.
33420Sstevel@tonic-gate 			 */
33430Sstevel@tonic-gate 			mutex_enter(&sp->s_lock);
33440Sstevel@tonic-gate 
33450Sstevel@tonic-gate 			/*
33460Sstevel@tonic-gate 			 * See if someone changed s_thread_exit while we gave
33470Sstevel@tonic-gate 			 * up s_lock.
33480Sstevel@tonic-gate 			 */
33490Sstevel@tonic-gate 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
33500Sstevel@tonic-gate 				goto die;
33510Sstevel@tonic-gate 
33520Sstevel@tonic-gate 			if (!error) {
33530Sstevel@tonic-gate 				/*
33540Sstevel@tonic-gate 				 * check to see if we implicitly renewed while
33550Sstevel@tonic-gate 				 * we waited for a reply for our RENEW call.
33560Sstevel@tonic-gate 				 */
33570Sstevel@tonic-gate 				if (tmp_last_renewal_time ==
33585302Sth199096 				    sp->last_renewal_time) {
33590Sstevel@tonic-gate 					/* no implicit renew came */
33600Sstevel@tonic-gate 					sp->last_renewal_time = tmp_now_time;
33610Sstevel@tonic-gate 				} else {
33620Sstevel@tonic-gate 					NFS4_DEBUG(nfs4_client_lease_debug,
33635302Sth199096 					    (CE_NOTE, "renew_thread: did "
33645302Sth199096 					    "implicit renewal before reply "
33655302Sth199096 					    "from server for RENEW"));
33660Sstevel@tonic-gate 				}
33670Sstevel@tonic-gate 			} else {
33680Sstevel@tonic-gate 				/* figure out error */
33690Sstevel@tonic-gate 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
33705302Sth199096 				    "renew_thread: nfs4renew returned error"
33715302Sth199096 				    " %d", error));
33720Sstevel@tonic-gate 			}
33730Sstevel@tonic-gate 
33740Sstevel@tonic-gate 		}
33750Sstevel@tonic-gate 	}
33760Sstevel@tonic-gate 
33770Sstevel@tonic-gate die:
33780Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
33795302Sth199096 	    "nfs4_renew_lease_thread: thread exiting"));
33800Sstevel@tonic-gate 
33810Sstevel@tonic-gate 	while (sp->s_otw_call_count != 0) {
33820Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
33835302Sth199096 		    "nfs4_renew_lease_thread: waiting for outstanding "
33845302Sth199096 		    "otw calls to finish for sp 0x%p, current "
33855302Sth199096 		    "s_otw_call_count %d", (void *)sp,
33865302Sth199096 		    sp->s_otw_call_count));
33870Sstevel@tonic-gate 		mutex_enter(&cpr_lock);
33880Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
33890Sstevel@tonic-gate 		mutex_exit(&cpr_lock);
33900Sstevel@tonic-gate 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
33910Sstevel@tonic-gate 		mutex_enter(&cpr_lock);
33920Sstevel@tonic-gate 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
33930Sstevel@tonic-gate 		mutex_exit(&cpr_lock);
33940Sstevel@tonic-gate 	}
33950Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
33960Sstevel@tonic-gate 
33970Sstevel@tonic-gate 	nfs4_server_rele(sp);		/* free the thread's reference */
33980Sstevel@tonic-gate 	nfs4_server_rele(sp);		/* free the list's reference */
33990Sstevel@tonic-gate 	sp = NULL;
34000Sstevel@tonic-gate 
34010Sstevel@tonic-gate done:
34020Sstevel@tonic-gate 	mutex_enter(&cpr_lock);
34030Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
34040Sstevel@tonic-gate 	mutex_destroy(&cpr_lock);
34050Sstevel@tonic-gate 
34060Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
34075302Sth199096 	    "nfs4_renew_lease_thread: renew thread exit officially"));
34080Sstevel@tonic-gate 
34090Sstevel@tonic-gate 	zthread_exit();
34100Sstevel@tonic-gate 	/* NOT REACHED */
34110Sstevel@tonic-gate }
34120Sstevel@tonic-gate 
34130Sstevel@tonic-gate /*
34140Sstevel@tonic-gate  * Send out a RENEW op to the server.
34150Sstevel@tonic-gate  * Assumes sp is locked down.
34160Sstevel@tonic-gate  */
34170Sstevel@tonic-gate static int
nfs4renew(nfs4_server_t * sp)34180Sstevel@tonic-gate nfs4renew(nfs4_server_t *sp)
34190Sstevel@tonic-gate {
34200Sstevel@tonic-gate 	COMPOUND4args_clnt args;
34210Sstevel@tonic-gate 	COMPOUND4res_clnt res;
34220Sstevel@tonic-gate 	nfs_argop4 argop[1];
34230Sstevel@tonic-gate 	int doqueue = 1;
34240Sstevel@tonic-gate 	int rpc_error;
34250Sstevel@tonic-gate 	cred_t *cr;
34260Sstevel@tonic-gate 	mntinfo4_t *mi;
34270Sstevel@tonic-gate 	timespec_t prop_time, after_time;
34280Sstevel@tonic-gate 	int needrecov = FALSE;
34290Sstevel@tonic-gate 	nfs4_recov_state_t recov_state;
34300Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
34310Sstevel@tonic-gate 
34320Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
34330Sstevel@tonic-gate 
34340Sstevel@tonic-gate 	recov_state.rs_flags = 0;
34350Sstevel@tonic-gate 	recov_state.rs_num_retry_despite_err = 0;
34360Sstevel@tonic-gate 
34370Sstevel@tonic-gate recov_retry:
34380Sstevel@tonic-gate 	mi = sp->mntinfo4_list;
34390Sstevel@tonic-gate 	VFS_HOLD(mi->mi_vfsp);
34400Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
34410Sstevel@tonic-gate 	ASSERT(mi != NULL);
34420Sstevel@tonic-gate 
34430Sstevel@tonic-gate 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
34440Sstevel@tonic-gate 	if (e.error) {
34450Sstevel@tonic-gate 		VFS_RELE(mi->mi_vfsp);
34460Sstevel@tonic-gate 		return (e.error);
34470Sstevel@tonic-gate 	}
34480Sstevel@tonic-gate 
34490Sstevel@tonic-gate 	/* Check to see if we're dealing with a marked-dead sp */
34500Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
34510Sstevel@tonic-gate 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
34520Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
34530Sstevel@tonic-gate 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
34540Sstevel@tonic-gate 		VFS_RELE(mi->mi_vfsp);
34550Sstevel@tonic-gate 		return (0);
34560Sstevel@tonic-gate 	}
34570Sstevel@tonic-gate 
34580Sstevel@tonic-gate 	/* Make sure mi hasn't changed on us */
34590Sstevel@tonic-gate 	if (mi != sp->mntinfo4_list) {
34600Sstevel@tonic-gate 		/* Must drop sp's lock to avoid a recursive mutex enter */
34610Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
34620Sstevel@tonic-gate 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
34630Sstevel@tonic-gate 		VFS_RELE(mi->mi_vfsp);
34640Sstevel@tonic-gate 		mutex_enter(&sp->s_lock);
34650Sstevel@tonic-gate 		goto recov_retry;
34660Sstevel@tonic-gate 	}
34670Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
34680Sstevel@tonic-gate 
34690Sstevel@tonic-gate 	args.ctag = TAG_RENEW;
34700Sstevel@tonic-gate 
34710Sstevel@tonic-gate 	args.array_len = 1;
34720Sstevel@tonic-gate 	args.array = argop;
34730Sstevel@tonic-gate 
34740Sstevel@tonic-gate 	argop[0].argop = OP_RENEW;
34750Sstevel@tonic-gate 
34760Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
34770Sstevel@tonic-gate 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
34780Sstevel@tonic-gate 	cr = sp->s_cred;
34790Sstevel@tonic-gate 	crhold(cr);
34800Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
34810Sstevel@tonic-gate 
34820Sstevel@tonic-gate 	ASSERT(cr != NULL);
34830Sstevel@tonic-gate 
34840Sstevel@tonic-gate 	/* used to figure out RTT for sp */
34850Sstevel@tonic-gate 	gethrestime(&prop_time);
34860Sstevel@tonic-gate 
34870Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
34880Sstevel@tonic-gate 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
34890Sstevel@tonic-gate 	    (void*)sp));
34900Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
34915302Sth199096 	    prop_time.tv_sec, prop_time.tv_nsec));
34920Sstevel@tonic-gate 
34930Sstevel@tonic-gate 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
34945302Sth199096 	    mntinfo4_t *, mi);
34950Sstevel@tonic-gate 
34960Sstevel@tonic-gate 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
34970Sstevel@tonic-gate 	crfree(cr);
34980Sstevel@tonic-gate 
34990Sstevel@tonic-gate 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
35005302Sth199096 	    mntinfo4_t *, mi);
35010Sstevel@tonic-gate 
35020Sstevel@tonic-gate 	gethrestime(&after_time);
35030Sstevel@tonic-gate 
35040Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
35050Sstevel@tonic-gate 	sp->propagation_delay.tv_sec =
35065302Sth199096 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
35070Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
35080Sstevel@tonic-gate 
35090Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
35105302Sth199096 	    after_time.tv_sec, after_time.tv_nsec));
35110Sstevel@tonic-gate 
35120Sstevel@tonic-gate 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3513104Swebaker 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
35140Sstevel@tonic-gate 		nfs4_delegreturn_all(sp);
35150Sstevel@tonic-gate 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
35160Sstevel@tonic-gate 		VFS_RELE(mi->mi_vfsp);
35170Sstevel@tonic-gate 		/*
35180Sstevel@tonic-gate 		 * If the server returns CB_PATH_DOWN, it has renewed
35190Sstevel@tonic-gate 		 * the lease and informed us that the callback path is
35200Sstevel@tonic-gate 		 * down.  Since the lease is renewed, just return 0 and
35210Sstevel@tonic-gate 		 * let the renew thread proceed as normal.
35220Sstevel@tonic-gate 		 */
35230Sstevel@tonic-gate 		return (0);
35240Sstevel@tonic-gate 	}
35250Sstevel@tonic-gate 
35260Sstevel@tonic-gate 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
35270Sstevel@tonic-gate 	if (!needrecov && e.error) {
35280Sstevel@tonic-gate 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
35290Sstevel@tonic-gate 		VFS_RELE(mi->mi_vfsp);
35300Sstevel@tonic-gate 		return (e.error);
35310Sstevel@tonic-gate 	}
35320Sstevel@tonic-gate 
35330Sstevel@tonic-gate 	rpc_error = e.error;
35340Sstevel@tonic-gate 
35350Sstevel@tonic-gate 	if (needrecov) {
35360Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
35370Sstevel@tonic-gate 		    "nfs4renew: initiating recovery\n"));
35380Sstevel@tonic-gate 
35390Sstevel@tonic-gate 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
354011291SRobert.Thurlow@Sun.COM 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
35410Sstevel@tonic-gate 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
35420Sstevel@tonic-gate 			VFS_RELE(mi->mi_vfsp);
35430Sstevel@tonic-gate 			if (!e.error)
35440Sstevel@tonic-gate 				(void) xdr_free(xdr_COMPOUND4res_clnt,
35455302Sth199096 				    (caddr_t)&res);
35460Sstevel@tonic-gate 			mutex_enter(&sp->s_lock);
35470Sstevel@tonic-gate 			goto recov_retry;
35480Sstevel@tonic-gate 		}
35490Sstevel@tonic-gate 		/* fall through for res.status case */
35500Sstevel@tonic-gate 	}
35510Sstevel@tonic-gate 
35520Sstevel@tonic-gate 	if (res.status) {
35530Sstevel@tonic-gate 		if (res.status == NFS4ERR_LEASE_MOVED) {
35540Sstevel@tonic-gate 			/*EMPTY*/
35550Sstevel@tonic-gate 			/*
35560Sstevel@tonic-gate 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
35570Sstevel@tonic-gate 			 * to renew the lease on that server
35580Sstevel@tonic-gate 			 */
35590Sstevel@tonic-gate 		}
35600Sstevel@tonic-gate 		e.error = geterrno4(res.status);
35610Sstevel@tonic-gate 	}
35620Sstevel@tonic-gate 
35630Sstevel@tonic-gate 	if (!rpc_error)
35640Sstevel@tonic-gate 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
35650Sstevel@tonic-gate 
35660Sstevel@tonic-gate 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
35670Sstevel@tonic-gate 
35680Sstevel@tonic-gate 	VFS_RELE(mi->mi_vfsp);
35690Sstevel@tonic-gate 
35700Sstevel@tonic-gate 	return (e.error);
35710Sstevel@tonic-gate }
35720Sstevel@tonic-gate 
35730Sstevel@tonic-gate void
nfs4_inc_state_ref_count(mntinfo4_t * mi)35740Sstevel@tonic-gate nfs4_inc_state_ref_count(mntinfo4_t *mi)
35750Sstevel@tonic-gate {
35760Sstevel@tonic-gate 	nfs4_server_t	*sp;
35770Sstevel@tonic-gate 
35780Sstevel@tonic-gate 	/* this locks down sp if it is found */
35790Sstevel@tonic-gate 	sp = find_nfs4_server(mi);
35800Sstevel@tonic-gate 
35810Sstevel@tonic-gate 	if (sp != NULL) {
35820Sstevel@tonic-gate 		nfs4_inc_state_ref_count_nolock(sp, mi);
35830Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
35840Sstevel@tonic-gate 		nfs4_server_rele(sp);
35850Sstevel@tonic-gate 	}
35860Sstevel@tonic-gate }
35870Sstevel@tonic-gate 
35880Sstevel@tonic-gate /*
35890Sstevel@tonic-gate  * Bump the number of OPEN files (ie: those with state) so we know if this
35900Sstevel@tonic-gate  * nfs4_server has any state to maintain a lease for or not.
35910Sstevel@tonic-gate  *
35920Sstevel@tonic-gate  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
35930Sstevel@tonic-gate  */
35940Sstevel@tonic-gate void
nfs4_inc_state_ref_count_nolock(nfs4_server_t * sp,mntinfo4_t * mi)35950Sstevel@tonic-gate nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
35960Sstevel@tonic-gate {
35970Sstevel@tonic-gate 	ASSERT(mutex_owned(&sp->s_lock));
35980Sstevel@tonic-gate 
35990Sstevel@tonic-gate 	sp->state_ref_count++;
36000Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
36015302Sth199096 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
36025302Sth199096 	    sp->state_ref_count));
36030Sstevel@tonic-gate 
36040Sstevel@tonic-gate 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
36050Sstevel@tonic-gate 		sp->lease_valid = NFS4_LEASE_VALID;
36060Sstevel@tonic-gate 
36070Sstevel@tonic-gate 	/*
36080Sstevel@tonic-gate 	 * If this call caused the lease to be marked valid and/or
36090Sstevel@tonic-gate 	 * took the state_ref_count from 0 to 1, then start the time
36100Sstevel@tonic-gate 	 * on lease renewal.
36110Sstevel@tonic-gate 	 */
36120Sstevel@tonic-gate 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
36130Sstevel@tonic-gate 		sp->last_renewal_time = gethrestime_sec();
36140Sstevel@tonic-gate 
36150Sstevel@tonic-gate 	/* update the number of open files for mi */
36160Sstevel@tonic-gate 	mi->mi_open_files++;
36170Sstevel@tonic-gate }
36180Sstevel@tonic-gate 
36190Sstevel@tonic-gate void
nfs4_dec_state_ref_count(mntinfo4_t * mi)36200Sstevel@tonic-gate nfs4_dec_state_ref_count(mntinfo4_t *mi)
36210Sstevel@tonic-gate {
36220Sstevel@tonic-gate 	nfs4_server_t	*sp;
36230Sstevel@tonic-gate 
36240Sstevel@tonic-gate 	/* this locks down sp if it is found */
36250Sstevel@tonic-gate 	sp = find_nfs4_server_all(mi, 1);
36260Sstevel@tonic-gate 
36270Sstevel@tonic-gate 	if (sp != NULL) {
36280Sstevel@tonic-gate 		nfs4_dec_state_ref_count_nolock(sp, mi);
36290Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
36300Sstevel@tonic-gate 		nfs4_server_rele(sp);
36310Sstevel@tonic-gate 	}
36320Sstevel@tonic-gate }
36330Sstevel@tonic-gate 
36340Sstevel@tonic-gate /*
36350Sstevel@tonic-gate  * Decrement the number of OPEN files (ie: those with state) so we know if
36360Sstevel@tonic-gate  * this nfs4_server has any state to maintain a lease for or not.
36370Sstevel@tonic-gate  */
36380Sstevel@tonic-gate void
nfs4_dec_state_ref_count_nolock(nfs4_server_t * sp,mntinfo4_t * mi)36390Sstevel@tonic-gate nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
36400Sstevel@tonic-gate {
36410Sstevel@tonic-gate 	ASSERT(mutex_owned(&sp->s_lock));
36420Sstevel@tonic-gate 	ASSERT(sp->state_ref_count != 0);
36430Sstevel@tonic-gate 	sp->state_ref_count--;
36440Sstevel@tonic-gate 
36450Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
36465302Sth199096 	    "nfs4_dec_state_ref_count: state ref count now %d",
36475302Sth199096 	    sp->state_ref_count));
36480Sstevel@tonic-gate 
36490Sstevel@tonic-gate 	mi->mi_open_files--;
36500Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
36515302Sth199096 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
36525302Sth199096 	    mi->mi_open_files, mi->mi_flags));
36530Sstevel@tonic-gate 
36540Sstevel@tonic-gate 	/* We don't have to hold the mi_lock to test mi_flags */
36550Sstevel@tonic-gate 	if (mi->mi_open_files == 0 &&
36560Sstevel@tonic-gate 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
36570Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
36585302Sth199096 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
36595302Sth199096 		    "we have closed the last open file", (void*)mi));
36600Sstevel@tonic-gate 		nfs4_remove_mi_from_server(mi, sp);
36610Sstevel@tonic-gate 	}
36620Sstevel@tonic-gate }
36630Sstevel@tonic-gate 
36640Sstevel@tonic-gate bool_t
inlease(nfs4_server_t * sp)36650Sstevel@tonic-gate inlease(nfs4_server_t *sp)
36660Sstevel@tonic-gate {
36670Sstevel@tonic-gate 	bool_t result;
36680Sstevel@tonic-gate 
36690Sstevel@tonic-gate 	ASSERT(mutex_owned(&sp->s_lock));
36700Sstevel@tonic-gate 
36710Sstevel@tonic-gate 	if (sp->lease_valid == NFS4_LEASE_VALID &&
36720Sstevel@tonic-gate 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
36730Sstevel@tonic-gate 		result = TRUE;
36740Sstevel@tonic-gate 	else
36750Sstevel@tonic-gate 		result = FALSE;
36760Sstevel@tonic-gate 
36770Sstevel@tonic-gate 	return (result);
36780Sstevel@tonic-gate }
36790Sstevel@tonic-gate 
36800Sstevel@tonic-gate 
36810Sstevel@tonic-gate /*
36820Sstevel@tonic-gate  * Return non-zero if the given nfs4_server_t is going through recovery.
36830Sstevel@tonic-gate  */
36840Sstevel@tonic-gate 
36850Sstevel@tonic-gate int
nfs4_server_in_recovery(nfs4_server_t * sp)36860Sstevel@tonic-gate nfs4_server_in_recovery(nfs4_server_t *sp)
36870Sstevel@tonic-gate {
36880Sstevel@tonic-gate 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
36890Sstevel@tonic-gate }
36900Sstevel@tonic-gate 
36910Sstevel@tonic-gate /*
36920Sstevel@tonic-gate  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
36930Sstevel@tonic-gate  * first is less than, equal to, or greater than the second.
36940Sstevel@tonic-gate  */
36950Sstevel@tonic-gate 
36960Sstevel@tonic-gate int
sfh4cmp(const void * p1,const void * p2)36970Sstevel@tonic-gate sfh4cmp(const void *p1, const void *p2)
36980Sstevel@tonic-gate {
36990Sstevel@tonic-gate 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
37000Sstevel@tonic-gate 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
37010Sstevel@tonic-gate 
37020Sstevel@tonic-gate 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
37030Sstevel@tonic-gate }
37040Sstevel@tonic-gate 
37050Sstevel@tonic-gate /*
37060Sstevel@tonic-gate  * Create a table for shared filehandle objects.
37070Sstevel@tonic-gate  */
37080Sstevel@tonic-gate 
37090Sstevel@tonic-gate void
sfh4_createtab(avl_tree_t * tab)37100Sstevel@tonic-gate sfh4_createtab(avl_tree_t *tab)
37110Sstevel@tonic-gate {
37120Sstevel@tonic-gate 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
37135302Sth199096 	    offsetof(nfs4_sharedfh_t, sfh_tree));
37140Sstevel@tonic-gate }
37150Sstevel@tonic-gate 
37160Sstevel@tonic-gate /*
37170Sstevel@tonic-gate  * Return a shared filehandle object for the given filehandle.  The caller
37180Sstevel@tonic-gate  * is responsible for eventually calling sfh4_rele().
37190Sstevel@tonic-gate  */
37200Sstevel@tonic-gate 
37210Sstevel@tonic-gate nfs4_sharedfh_t *
sfh4_put(const nfs_fh4 * fh,mntinfo4_t * mi,nfs4_sharedfh_t * key)37220Sstevel@tonic-gate sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
37230Sstevel@tonic-gate {
37240Sstevel@tonic-gate 	nfs4_sharedfh_t *sfh, *nsfh;
37250Sstevel@tonic-gate 	avl_index_t where;
37260Sstevel@tonic-gate 	nfs4_sharedfh_t skey;
37270Sstevel@tonic-gate 
37280Sstevel@tonic-gate 	if (!key) {
37290Sstevel@tonic-gate 		skey.sfh_fh = *fh;
37300Sstevel@tonic-gate 		key = &skey;
37310Sstevel@tonic-gate 	}
37320Sstevel@tonic-gate 
37330Sstevel@tonic-gate 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
37340Sstevel@tonic-gate 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
37350Sstevel@tonic-gate 	/*
37360Sstevel@tonic-gate 	 * We allocate the largest possible filehandle size because it's
37370Sstevel@tonic-gate 	 * not that big, and it saves us from possibly having to resize the
37380Sstevel@tonic-gate 	 * buffer later.
37390Sstevel@tonic-gate 	 */
37400Sstevel@tonic-gate 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
37410Sstevel@tonic-gate 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
37420Sstevel@tonic-gate 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
37430Sstevel@tonic-gate 	nsfh->sfh_refcnt = 1;
37440Sstevel@tonic-gate 	nsfh->sfh_flags = SFH4_IN_TREE;
37450Sstevel@tonic-gate 	nsfh->sfh_mi = mi;
37460Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
37475302Sth199096 	    (void *)nsfh));
37480Sstevel@tonic-gate 
37490Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
37500Sstevel@tonic-gate 	sfh = avl_find(&mi->mi_filehandles, key, &where);
37510Sstevel@tonic-gate 	if (sfh != NULL) {
37520Sstevel@tonic-gate 		mutex_enter(&sfh->sfh_lock);
37530Sstevel@tonic-gate 		sfh->sfh_refcnt++;
37540Sstevel@tonic-gate 		mutex_exit(&sfh->sfh_lock);
37550Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_fh_lock);
37560Sstevel@tonic-gate 		/* free our speculative allocs */
37570Sstevel@tonic-gate 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
37580Sstevel@tonic-gate 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
37590Sstevel@tonic-gate 		return (sfh);
37600Sstevel@tonic-gate 	}
37610Sstevel@tonic-gate 
37620Sstevel@tonic-gate 	avl_insert(&mi->mi_filehandles, nsfh, where);
37630Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_fh_lock);
37640Sstevel@tonic-gate 
37650Sstevel@tonic-gate 	return (nsfh);
37660Sstevel@tonic-gate }
37670Sstevel@tonic-gate 
37680Sstevel@tonic-gate /*
37690Sstevel@tonic-gate  * Return a shared filehandle object for the given filehandle.  The caller
37700Sstevel@tonic-gate  * is responsible for eventually calling sfh4_rele().
37710Sstevel@tonic-gate  */
37720Sstevel@tonic-gate 
37730Sstevel@tonic-gate nfs4_sharedfh_t *
sfh4_get(const nfs_fh4 * fh,mntinfo4_t * mi)37740Sstevel@tonic-gate sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
37750Sstevel@tonic-gate {
37760Sstevel@tonic-gate 	nfs4_sharedfh_t *sfh;
37770Sstevel@tonic-gate 	nfs4_sharedfh_t key;
37780Sstevel@tonic-gate 
37790Sstevel@tonic-gate 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
37800Sstevel@tonic-gate 
37810Sstevel@tonic-gate #ifdef DEBUG
37820Sstevel@tonic-gate 	if (nfs4_sharedfh_debug) {
37830Sstevel@tonic-gate 		nfs4_fhandle_t fhandle;
37840Sstevel@tonic-gate 
37850Sstevel@tonic-gate 		fhandle.fh_len = fh->nfs_fh4_len;
37860Sstevel@tonic-gate 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
37870Sstevel@tonic-gate 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
37880Sstevel@tonic-gate 		nfs4_printfhandle(&fhandle);
37890Sstevel@tonic-gate 	}
37900Sstevel@tonic-gate #endif
37910Sstevel@tonic-gate 
37920Sstevel@tonic-gate 	/*
37930Sstevel@tonic-gate 	 * If there's already an object for the given filehandle, bump the
37940Sstevel@tonic-gate 	 * reference count and return it.  Otherwise, create a new object
37950Sstevel@tonic-gate 	 * and add it to the AVL tree.
37960Sstevel@tonic-gate 	 */
37970Sstevel@tonic-gate 
37980Sstevel@tonic-gate 	key.sfh_fh = *fh;
37990Sstevel@tonic-gate 
38000Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
38010Sstevel@tonic-gate 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
38020Sstevel@tonic-gate 	if (sfh != NULL) {
38030Sstevel@tonic-gate 		mutex_enter(&sfh->sfh_lock);
38040Sstevel@tonic-gate 		sfh->sfh_refcnt++;
38050Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
38065302Sth199096 		    "sfh4_get: found existing %p, new refcnt=%d",
38075302Sth199096 		    (void *)sfh, sfh->sfh_refcnt));
38080Sstevel@tonic-gate 		mutex_exit(&sfh->sfh_lock);
38090Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_fh_lock);
38100Sstevel@tonic-gate 		return (sfh);
38110Sstevel@tonic-gate 	}
38120Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_fh_lock);
38130Sstevel@tonic-gate 
38140Sstevel@tonic-gate 	return (sfh4_put(fh, mi, &key));
38150Sstevel@tonic-gate }
38160Sstevel@tonic-gate 
38170Sstevel@tonic-gate /*
38180Sstevel@tonic-gate  * Get a reference to the given shared filehandle object.
38190Sstevel@tonic-gate  */
38200Sstevel@tonic-gate 
38210Sstevel@tonic-gate void
sfh4_hold(nfs4_sharedfh_t * sfh)38220Sstevel@tonic-gate sfh4_hold(nfs4_sharedfh_t *sfh)
38230Sstevel@tonic-gate {
38240Sstevel@tonic-gate 	ASSERT(sfh->sfh_refcnt > 0);
38250Sstevel@tonic-gate 
38260Sstevel@tonic-gate 	mutex_enter(&sfh->sfh_lock);
38270Sstevel@tonic-gate 	sfh->sfh_refcnt++;
38280Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_sharedfh_debug,
38295302Sth199096 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
38305302Sth199096 	    (void *)sfh, sfh->sfh_refcnt));
38310Sstevel@tonic-gate 	mutex_exit(&sfh->sfh_lock);
38320Sstevel@tonic-gate }
38330Sstevel@tonic-gate 
38340Sstevel@tonic-gate /*
38350Sstevel@tonic-gate  * Release a reference to the given shared filehandle object and null out
38360Sstevel@tonic-gate  * the given pointer.
38370Sstevel@tonic-gate  */
38380Sstevel@tonic-gate 
38390Sstevel@tonic-gate void
sfh4_rele(nfs4_sharedfh_t ** sfhpp)38400Sstevel@tonic-gate sfh4_rele(nfs4_sharedfh_t **sfhpp)
38410Sstevel@tonic-gate {
38420Sstevel@tonic-gate 	mntinfo4_t *mi;
38430Sstevel@tonic-gate 	nfs4_sharedfh_t *sfh = *sfhpp;
38440Sstevel@tonic-gate 
38450Sstevel@tonic-gate 	ASSERT(sfh->sfh_refcnt > 0);
38460Sstevel@tonic-gate 
38470Sstevel@tonic-gate 	mutex_enter(&sfh->sfh_lock);
38480Sstevel@tonic-gate 	if (sfh->sfh_refcnt > 1) {
38490Sstevel@tonic-gate 		sfh->sfh_refcnt--;
38500Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
38510Sstevel@tonic-gate 		    "sfh4_rele %p, new refcnt=%d",
38520Sstevel@tonic-gate 		    (void *)sfh, sfh->sfh_refcnt));
38530Sstevel@tonic-gate 		mutex_exit(&sfh->sfh_lock);
38540Sstevel@tonic-gate 		goto finish;
38550Sstevel@tonic-gate 	}
38560Sstevel@tonic-gate 	mutex_exit(&sfh->sfh_lock);
38570Sstevel@tonic-gate 
38580Sstevel@tonic-gate 	/*
38590Sstevel@tonic-gate 	 * Possibly the last reference, so get the lock for the table in
38600Sstevel@tonic-gate 	 * case it's time to remove the object from the table.
38610Sstevel@tonic-gate 	 */
38620Sstevel@tonic-gate 	mi = sfh->sfh_mi;
38630Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
38640Sstevel@tonic-gate 	mutex_enter(&sfh->sfh_lock);
38650Sstevel@tonic-gate 	sfh->sfh_refcnt--;
38660Sstevel@tonic-gate 	if (sfh->sfh_refcnt > 0) {
38670Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
38680Sstevel@tonic-gate 		    "sfh4_rele %p, new refcnt=%d",
38690Sstevel@tonic-gate 		    (void *)sfh, sfh->sfh_refcnt));
38700Sstevel@tonic-gate 		mutex_exit(&sfh->sfh_lock);
38710Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_fh_lock);
38720Sstevel@tonic-gate 		goto finish;
38730Sstevel@tonic-gate 	}
38740Sstevel@tonic-gate 
38750Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
38765302Sth199096 	    "sfh4_rele %p, last ref", (void *)sfh));
38770Sstevel@tonic-gate 	if (sfh->sfh_flags & SFH4_IN_TREE) {
38780Sstevel@tonic-gate 		avl_remove(&mi->mi_filehandles, sfh);
38790Sstevel@tonic-gate 		sfh->sfh_flags &= ~SFH4_IN_TREE;
38800Sstevel@tonic-gate 	}
38810Sstevel@tonic-gate 	mutex_exit(&sfh->sfh_lock);
38820Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_fh_lock);
38830Sstevel@tonic-gate 	mutex_destroy(&sfh->sfh_lock);
38840Sstevel@tonic-gate 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
38850Sstevel@tonic-gate 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
38860Sstevel@tonic-gate 
38870Sstevel@tonic-gate finish:
38880Sstevel@tonic-gate 	*sfhpp = NULL;
38890Sstevel@tonic-gate }
38900Sstevel@tonic-gate 
38910Sstevel@tonic-gate /*
38920Sstevel@tonic-gate  * Update the filehandle for the given shared filehandle object.
38930Sstevel@tonic-gate  */
38940Sstevel@tonic-gate 
38950Sstevel@tonic-gate int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
38960Sstevel@tonic-gate 
38970Sstevel@tonic-gate void
sfh4_update(nfs4_sharedfh_t * sfh,const nfs_fh4 * newfh)38980Sstevel@tonic-gate sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
38990Sstevel@tonic-gate {
39000Sstevel@tonic-gate 	mntinfo4_t *mi = sfh->sfh_mi;
39010Sstevel@tonic-gate 	nfs4_sharedfh_t *dupsfh;
39020Sstevel@tonic-gate 	avl_index_t where;
39030Sstevel@tonic-gate 	nfs4_sharedfh_t key;
39040Sstevel@tonic-gate 
39050Sstevel@tonic-gate #ifdef DEBUG
39060Sstevel@tonic-gate 	mutex_enter(&sfh->sfh_lock);
39070Sstevel@tonic-gate 	ASSERT(sfh->sfh_refcnt > 0);
39080Sstevel@tonic-gate 	mutex_exit(&sfh->sfh_lock);
39090Sstevel@tonic-gate #endif
39100Sstevel@tonic-gate 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
39110Sstevel@tonic-gate 
39120Sstevel@tonic-gate 	/*
39130Sstevel@tonic-gate 	 * The basic plan is to remove the shared filehandle object from
39140Sstevel@tonic-gate 	 * the table, update it to have the new filehandle, then reinsert
39150Sstevel@tonic-gate 	 * it.
39160Sstevel@tonic-gate 	 */
39170Sstevel@tonic-gate 
39180Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
39190Sstevel@tonic-gate 	mutex_enter(&sfh->sfh_lock);
39200Sstevel@tonic-gate 	if (sfh->sfh_flags & SFH4_IN_TREE) {
39210Sstevel@tonic-gate 		avl_remove(&mi->mi_filehandles, sfh);
39220Sstevel@tonic-gate 		sfh->sfh_flags &= ~SFH4_IN_TREE;
39230Sstevel@tonic-gate 	}
39240Sstevel@tonic-gate 	mutex_exit(&sfh->sfh_lock);
39250Sstevel@tonic-gate 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
39260Sstevel@tonic-gate 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
39270Sstevel@tonic-gate 	    sfh->sfh_fh.nfs_fh4_len);
39280Sstevel@tonic-gate 
39290Sstevel@tonic-gate 	/*
39300Sstevel@tonic-gate 	 * XXX If there is already a shared filehandle object with the new
39310Sstevel@tonic-gate 	 * filehandle, we're in trouble, because the rnode code assumes
39320Sstevel@tonic-gate 	 * that there is only one shared filehandle object for a given
39330Sstevel@tonic-gate 	 * filehandle.  So issue a warning (for read-write mounts only)
39340Sstevel@tonic-gate 	 * and don't try to re-insert the given object into the table.
39350Sstevel@tonic-gate 	 * Hopefully the given object will quickly go away and everyone
39360Sstevel@tonic-gate 	 * will use the new object.
39370Sstevel@tonic-gate 	 */
39380Sstevel@tonic-gate 	key.sfh_fh = *newfh;
39390Sstevel@tonic-gate 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
39400Sstevel@tonic-gate 	if (dupsfh != NULL) {
39410Sstevel@tonic-gate 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
39420Sstevel@tonic-gate 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
39430Sstevel@tonic-gate 			    "duplicate filehandle detected");
39440Sstevel@tonic-gate 			sfh4_printfhandle(dupsfh);
39450Sstevel@tonic-gate 		}
39460Sstevel@tonic-gate 	} else {
39470Sstevel@tonic-gate 		avl_insert(&mi->mi_filehandles, sfh, where);
39480Sstevel@tonic-gate 		mutex_enter(&sfh->sfh_lock);
39490Sstevel@tonic-gate 		sfh->sfh_flags |= SFH4_IN_TREE;
39500Sstevel@tonic-gate 		mutex_exit(&sfh->sfh_lock);
39510Sstevel@tonic-gate 	}
39520Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_fh_lock);
39530Sstevel@tonic-gate }
39540Sstevel@tonic-gate 
39550Sstevel@tonic-gate /*
39560Sstevel@tonic-gate  * Copy out the current filehandle for the given shared filehandle object.
39570Sstevel@tonic-gate  */
39580Sstevel@tonic-gate 
39590Sstevel@tonic-gate void
sfh4_copyval(const nfs4_sharedfh_t * sfh,nfs4_fhandle_t * fhp)39600Sstevel@tonic-gate sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
39610Sstevel@tonic-gate {
39620Sstevel@tonic-gate 	mntinfo4_t *mi = sfh->sfh_mi;
39630Sstevel@tonic-gate 
39640Sstevel@tonic-gate 	ASSERT(sfh->sfh_refcnt > 0);
39650Sstevel@tonic-gate 
39660Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
39670Sstevel@tonic-gate 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
39680Sstevel@tonic-gate 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
39690Sstevel@tonic-gate 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
39700Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_fh_lock);
39710Sstevel@tonic-gate }
39720Sstevel@tonic-gate 
39730Sstevel@tonic-gate /*
39740Sstevel@tonic-gate  * Print out the filehandle for the given shared filehandle object.
39750Sstevel@tonic-gate  */
39760Sstevel@tonic-gate 
39770Sstevel@tonic-gate void
sfh4_printfhandle(const nfs4_sharedfh_t * sfh)39780Sstevel@tonic-gate sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
39790Sstevel@tonic-gate {
39800Sstevel@tonic-gate 	nfs4_fhandle_t fhandle;
39810Sstevel@tonic-gate 
39820Sstevel@tonic-gate 	sfh4_copyval(sfh, &fhandle);
39830Sstevel@tonic-gate 	nfs4_printfhandle(&fhandle);
39840Sstevel@tonic-gate }
39850Sstevel@tonic-gate 
39860Sstevel@tonic-gate /*
39870Sstevel@tonic-gate  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
39880Sstevel@tonic-gate  * if they're the same, +1 if the first is "greater" than the second.  The
39890Sstevel@tonic-gate  * caller (or whoever's calling the AVL package) is responsible for
39900Sstevel@tonic-gate  * handling locking issues.
39910Sstevel@tonic-gate  */
39920Sstevel@tonic-gate 
39930Sstevel@tonic-gate static int
fncmp(const void * p1,const void * p2)39940Sstevel@tonic-gate fncmp(const void *p1, const void *p2)
39950Sstevel@tonic-gate {
39960Sstevel@tonic-gate 	const nfs4_fname_t *f1 = p1;
39970Sstevel@tonic-gate 	const nfs4_fname_t *f2 = p2;
39980Sstevel@tonic-gate 	int res;
39990Sstevel@tonic-gate 
40000Sstevel@tonic-gate 	res = strcmp(f1->fn_name, f2->fn_name);
40010Sstevel@tonic-gate 	/*
40020Sstevel@tonic-gate 	 * The AVL package wants +/-1, not arbitrary positive or negative
40030Sstevel@tonic-gate 	 * integers.
40040Sstevel@tonic-gate 	 */
40050Sstevel@tonic-gate 	if (res > 0)
40060Sstevel@tonic-gate 		res = 1;
40070Sstevel@tonic-gate 	else if (res < 0)
40080Sstevel@tonic-gate 		res = -1;
40090Sstevel@tonic-gate 	return (res);
40100Sstevel@tonic-gate }
40110Sstevel@tonic-gate 
40120Sstevel@tonic-gate /*
40130Sstevel@tonic-gate  * Get or create an fname with the given name, as a child of the given
40140Sstevel@tonic-gate  * fname.  The caller is responsible for eventually releasing the reference
40150Sstevel@tonic-gate  * (fn_rele()).  parent may be NULL.
40160Sstevel@tonic-gate  */
40170Sstevel@tonic-gate 
40180Sstevel@tonic-gate nfs4_fname_t *
fn_get(nfs4_fname_t * parent,char * name,nfs4_sharedfh_t * sfh)40197902SNagakiran.Rajashekar@Sun.COM fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
40200Sstevel@tonic-gate {
40210Sstevel@tonic-gate 	nfs4_fname_t key;
40220Sstevel@tonic-gate 	nfs4_fname_t *fnp;
40230Sstevel@tonic-gate 	avl_index_t where;
40240Sstevel@tonic-gate 
40250Sstevel@tonic-gate 	key.fn_name = name;
40260Sstevel@tonic-gate 
40270Sstevel@tonic-gate 	/*
40280Sstevel@tonic-gate 	 * If there's already an fname registered with the given name, bump
40290Sstevel@tonic-gate 	 * its reference count and return it.  Otherwise, create a new one
40300Sstevel@tonic-gate 	 * and add it to the parent's AVL tree.
40317902SNagakiran.Rajashekar@Sun.COM 	 *
40327902SNagakiran.Rajashekar@Sun.COM 	 * fname entries we are looking for should match both name
40337902SNagakiran.Rajashekar@Sun.COM 	 * and sfh stored in the fname.
40340Sstevel@tonic-gate 	 */
40357902SNagakiran.Rajashekar@Sun.COM again:
40360Sstevel@tonic-gate 	if (parent != NULL) {
40370Sstevel@tonic-gate 		mutex_enter(&parent->fn_lock);
40380Sstevel@tonic-gate 		fnp = avl_find(&parent->fn_children, &key, &where);
40390Sstevel@tonic-gate 		if (fnp != NULL) {
40407902SNagakiran.Rajashekar@Sun.COM 			/*
40417902SNagakiran.Rajashekar@Sun.COM 			 * This hold on fnp is released below later,
40427902SNagakiran.Rajashekar@Sun.COM 			 * in case this is not the fnp we want.
40437902SNagakiran.Rajashekar@Sun.COM 			 */
40440Sstevel@tonic-gate 			fn_hold(fnp);
40457902SNagakiran.Rajashekar@Sun.COM 
40467902SNagakiran.Rajashekar@Sun.COM 			if (fnp->fn_sfh == sfh) {
40477902SNagakiran.Rajashekar@Sun.COM 				/*
40487902SNagakiran.Rajashekar@Sun.COM 				 * We have found our entry.
40497902SNagakiran.Rajashekar@Sun.COM 				 * put an hold and return it.
40507902SNagakiran.Rajashekar@Sun.COM 				 */
40517902SNagakiran.Rajashekar@Sun.COM 				mutex_exit(&parent->fn_lock);
40527902SNagakiran.Rajashekar@Sun.COM 				return (fnp);
40537902SNagakiran.Rajashekar@Sun.COM 			}
40547902SNagakiran.Rajashekar@Sun.COM 
40557902SNagakiran.Rajashekar@Sun.COM 			/*
40567902SNagakiran.Rajashekar@Sun.COM 			 * We have found an entry that has a mismatching
40577902SNagakiran.Rajashekar@Sun.COM 			 * fn_sfh. This could be a stale entry due to
40587902SNagakiran.Rajashekar@Sun.COM 			 * server side rename. We will remove this entry
40597902SNagakiran.Rajashekar@Sun.COM 			 * and make sure no such entries exist.
40607902SNagakiran.Rajashekar@Sun.COM 			 */
40610Sstevel@tonic-gate 			mutex_exit(&parent->fn_lock);
40627902SNagakiran.Rajashekar@Sun.COM 			mutex_enter(&fnp->fn_lock);
40637902SNagakiran.Rajashekar@Sun.COM 			if (fnp->fn_parent == parent) {
40647902SNagakiran.Rajashekar@Sun.COM 				/*
40657902SNagakiran.Rajashekar@Sun.COM 				 * Remove ourselves from parent's
40667902SNagakiran.Rajashekar@Sun.COM 				 * fn_children tree.
40677902SNagakiran.Rajashekar@Sun.COM 				 */
40687902SNagakiran.Rajashekar@Sun.COM 				mutex_enter(&parent->fn_lock);
40697902SNagakiran.Rajashekar@Sun.COM 				avl_remove(&parent->fn_children, fnp);
40707902SNagakiran.Rajashekar@Sun.COM 				mutex_exit(&parent->fn_lock);
40717902SNagakiran.Rajashekar@Sun.COM 				fn_rele(&fnp->fn_parent);
40727902SNagakiran.Rajashekar@Sun.COM 			}
40737902SNagakiran.Rajashekar@Sun.COM 			mutex_exit(&fnp->fn_lock);
40747902SNagakiran.Rajashekar@Sun.COM 			fn_rele(&fnp);
40757902SNagakiran.Rajashekar@Sun.COM 			goto again;
40760Sstevel@tonic-gate 		}
40770Sstevel@tonic-gate 	}
40780Sstevel@tonic-gate 
40790Sstevel@tonic-gate 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
40800Sstevel@tonic-gate 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
40810Sstevel@tonic-gate 	fnp->fn_parent = parent;
40820Sstevel@tonic-gate 	if (parent != NULL)
40830Sstevel@tonic-gate 		fn_hold(parent);
40840Sstevel@tonic-gate 	fnp->fn_len = strlen(name);
40850Sstevel@tonic-gate 	ASSERT(fnp->fn_len < MAXNAMELEN);
40860Sstevel@tonic-gate 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
40870Sstevel@tonic-gate 	(void) strcpy(fnp->fn_name, name);
40880Sstevel@tonic-gate 	fnp->fn_refcnt = 1;
40897902SNagakiran.Rajashekar@Sun.COM 
40907902SNagakiran.Rajashekar@Sun.COM 	/*
40917902SNagakiran.Rajashekar@Sun.COM 	 * This hold on sfh is later released
40927902SNagakiran.Rajashekar@Sun.COM 	 * when we do the final fn_rele() on this fname.
40937902SNagakiran.Rajashekar@Sun.COM 	 */
40947902SNagakiran.Rajashekar@Sun.COM 	sfh4_hold(sfh);
40957902SNagakiran.Rajashekar@Sun.COM 	fnp->fn_sfh = sfh;
40967902SNagakiran.Rajashekar@Sun.COM 
40970Sstevel@tonic-gate 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
40980Sstevel@tonic-gate 	    offsetof(nfs4_fname_t, fn_tree));
40990Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
41005302Sth199096 	    "fn_get %p:%s, a new nfs4_fname_t!",
41015302Sth199096 	    (void *)fnp, fnp->fn_name));
41020Sstevel@tonic-gate 	if (parent != NULL) {
41030Sstevel@tonic-gate 		avl_insert(&parent->fn_children, fnp, where);
41040Sstevel@tonic-gate 		mutex_exit(&parent->fn_lock);
41050Sstevel@tonic-gate 	}
41060Sstevel@tonic-gate 
41070Sstevel@tonic-gate 	return (fnp);
41080Sstevel@tonic-gate }
41090Sstevel@tonic-gate 
41100Sstevel@tonic-gate void
fn_hold(nfs4_fname_t * fnp)41110Sstevel@tonic-gate fn_hold(nfs4_fname_t *fnp)
41120Sstevel@tonic-gate {
41130Sstevel@tonic-gate 	atomic_add_32(&fnp->fn_refcnt, 1);
41140Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
41155302Sth199096 	    "fn_hold %p:%s, new refcnt=%d",
41165302Sth199096 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
41170Sstevel@tonic-gate }
41180Sstevel@tonic-gate 
41190Sstevel@tonic-gate /*
41200Sstevel@tonic-gate  * Decrement the reference count of the given fname, and destroy it if its
41210Sstevel@tonic-gate  * reference count goes to zero.  Nulls out the given pointer.
41220Sstevel@tonic-gate  */
41230Sstevel@tonic-gate 
41240Sstevel@tonic-gate void
fn_rele(nfs4_fname_t ** fnpp)41250Sstevel@tonic-gate fn_rele(nfs4_fname_t **fnpp)
41260Sstevel@tonic-gate {
41270Sstevel@tonic-gate 	nfs4_fname_t *parent;
41280Sstevel@tonic-gate 	uint32_t newref;
41290Sstevel@tonic-gate 	nfs4_fname_t *fnp;
41300Sstevel@tonic-gate 
41310Sstevel@tonic-gate recur:
41320Sstevel@tonic-gate 	fnp = *fnpp;
41330Sstevel@tonic-gate 	*fnpp = NULL;
41340Sstevel@tonic-gate 
41350Sstevel@tonic-gate 	mutex_enter(&fnp->fn_lock);
41360Sstevel@tonic-gate 	parent = fnp->fn_parent;
41370Sstevel@tonic-gate 	if (parent != NULL)
41380Sstevel@tonic-gate 		mutex_enter(&parent->fn_lock);	/* prevent new references */
41390Sstevel@tonic-gate 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
41400Sstevel@tonic-gate 	if (newref > 0) {
41410Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
41425302Sth199096 		    "fn_rele %p:%s, new refcnt=%d",
41435302Sth199096 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
41440Sstevel@tonic-gate 		if (parent != NULL)
41450Sstevel@tonic-gate 			mutex_exit(&parent->fn_lock);
41460Sstevel@tonic-gate 		mutex_exit(&fnp->fn_lock);
41470Sstevel@tonic-gate 		return;
41480Sstevel@tonic-gate 	}
41490Sstevel@tonic-gate 
41500Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
41515302Sth199096 	    "fn_rele %p:%s, last reference, deleting...",
41525302Sth199096 	    (void *)fnp, fnp->fn_name));
41530Sstevel@tonic-gate 	if (parent != NULL) {
41540Sstevel@tonic-gate 		avl_remove(&parent->fn_children, fnp);
41550Sstevel@tonic-gate 		mutex_exit(&parent->fn_lock);
41560Sstevel@tonic-gate 	}
41570Sstevel@tonic-gate 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
41587902SNagakiran.Rajashekar@Sun.COM 	sfh4_rele(&fnp->fn_sfh);
41590Sstevel@tonic-gate 	mutex_destroy(&fnp->fn_lock);
41600Sstevel@tonic-gate 	avl_destroy(&fnp->fn_children);
41610Sstevel@tonic-gate 	kmem_free(fnp, sizeof (nfs4_fname_t));
41620Sstevel@tonic-gate 	/*
41630Sstevel@tonic-gate 	 * Recursivly fn_rele the parent.
41640Sstevel@tonic-gate 	 * Use goto instead of a recursive call to avoid stack overflow.
41650Sstevel@tonic-gate 	 */
41660Sstevel@tonic-gate 	if (parent != NULL) {
41670Sstevel@tonic-gate 		fnpp = &parent;
41680Sstevel@tonic-gate 		goto recur;
41690Sstevel@tonic-gate 	}
41700Sstevel@tonic-gate }
41710Sstevel@tonic-gate 
41720Sstevel@tonic-gate /*
41730Sstevel@tonic-gate  * Returns the single component name of the given fname, in a MAXNAMELEN
41740Sstevel@tonic-gate  * string buffer, which the caller is responsible for freeing.  Note that
41750Sstevel@tonic-gate  * the name may become invalid as a result of fn_move().
41760Sstevel@tonic-gate  */
41770Sstevel@tonic-gate 
41780Sstevel@tonic-gate char *
fn_name(nfs4_fname_t * fnp)41790Sstevel@tonic-gate fn_name(nfs4_fname_t *fnp)
41800Sstevel@tonic-gate {
41810Sstevel@tonic-gate 	char *name;
41820Sstevel@tonic-gate 
41830Sstevel@tonic-gate 	ASSERT(fnp->fn_len < MAXNAMELEN);
41840Sstevel@tonic-gate 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
41850Sstevel@tonic-gate 	mutex_enter(&fnp->fn_lock);
41860Sstevel@tonic-gate 	(void) strcpy(name, fnp->fn_name);
41870Sstevel@tonic-gate 	mutex_exit(&fnp->fn_lock);
41880Sstevel@tonic-gate 
41890Sstevel@tonic-gate 	return (name);
41900Sstevel@tonic-gate }
41910Sstevel@tonic-gate 
41920Sstevel@tonic-gate 
41930Sstevel@tonic-gate /*
41940Sstevel@tonic-gate  * fn_path_realloc
41950Sstevel@tonic-gate  *
41960Sstevel@tonic-gate  * This function, used only by fn_path, constructs
41970Sstevel@tonic-gate  * a new string which looks like "prepend" + "/" + "current".
41980Sstevel@tonic-gate  * by allocating a new string and freeing the old one.
41990Sstevel@tonic-gate  */
42000Sstevel@tonic-gate static void
fn_path_realloc(char ** curses,char * prepend)42010Sstevel@tonic-gate fn_path_realloc(char **curses, char *prepend)
42020Sstevel@tonic-gate {
42030Sstevel@tonic-gate 	int len, curlen = 0;
42040Sstevel@tonic-gate 	char *news;
42050Sstevel@tonic-gate 
42060Sstevel@tonic-gate 	if (*curses == NULL) {
42070Sstevel@tonic-gate 		/*
42080Sstevel@tonic-gate 		 * Prime the pump, allocate just the
42090Sstevel@tonic-gate 		 * space for prepend and return that.
42100Sstevel@tonic-gate 		 */
42110Sstevel@tonic-gate 		len = strlen(prepend) + 1;
42120Sstevel@tonic-gate 		news = kmem_alloc(len, KM_SLEEP);
42130Sstevel@tonic-gate 		(void) strncpy(news, prepend, len);
42140Sstevel@tonic-gate 	} else {
42150Sstevel@tonic-gate 		/*
42160Sstevel@tonic-gate 		 * Allocate the space  for a new string
42170Sstevel@tonic-gate 		 * +1 +1 is for the "/" and the NULL
42180Sstevel@tonic-gate 		 * byte at the end of it all.
42190Sstevel@tonic-gate 		 */
42200Sstevel@tonic-gate 		curlen = strlen(*curses);
42210Sstevel@tonic-gate 		len = curlen + strlen(prepend) + 1 + 1;
42220Sstevel@tonic-gate 		news = kmem_alloc(len, KM_SLEEP);
42230Sstevel@tonic-gate 		(void) strncpy(news, prepend, len);
42240Sstevel@tonic-gate 		(void) strcat(news, "/");
42250Sstevel@tonic-gate 		(void) strcat(news, *curses);
42260Sstevel@tonic-gate 		kmem_free(*curses, curlen + 1);
42270Sstevel@tonic-gate 	}
42280Sstevel@tonic-gate 	*curses = news;
42290Sstevel@tonic-gate }
42300Sstevel@tonic-gate 
42310Sstevel@tonic-gate /*
42320Sstevel@tonic-gate  * Returns the path name (starting from the fs root) for the given fname.
42330Sstevel@tonic-gate  * The caller is responsible for freeing.  Note that the path may be or
42340Sstevel@tonic-gate  * become invalid as a result of fn_move().
42350Sstevel@tonic-gate  */
42360Sstevel@tonic-gate 
42370Sstevel@tonic-gate char *
fn_path(nfs4_fname_t * fnp)42380Sstevel@tonic-gate fn_path(nfs4_fname_t *fnp)
42390Sstevel@tonic-gate {
42400Sstevel@tonic-gate 	char *path;
42410Sstevel@tonic-gate 	nfs4_fname_t *nextfnp;
42420Sstevel@tonic-gate 
42430Sstevel@tonic-gate 	if (fnp == NULL)
42440Sstevel@tonic-gate 		return (NULL);
42450Sstevel@tonic-gate 
42460Sstevel@tonic-gate 	path = NULL;
42470Sstevel@tonic-gate 
42480Sstevel@tonic-gate 	/* walk up the tree constructing the pathname.  */
42490Sstevel@tonic-gate 
42500Sstevel@tonic-gate 	fn_hold(fnp);			/* adjust for later rele */
42510Sstevel@tonic-gate 	do {
42520Sstevel@tonic-gate 		mutex_enter(&fnp->fn_lock);
42530Sstevel@tonic-gate 		/*
42540Sstevel@tonic-gate 		 * Add fn_name in front of the current path
42550Sstevel@tonic-gate 		 */
42560Sstevel@tonic-gate 		fn_path_realloc(&path, fnp->fn_name);
42570Sstevel@tonic-gate 		nextfnp = fnp->fn_parent;
42580Sstevel@tonic-gate 		if (nextfnp != NULL)
42590Sstevel@tonic-gate 			fn_hold(nextfnp);
42600Sstevel@tonic-gate 		mutex_exit(&fnp->fn_lock);
42610Sstevel@tonic-gate 		fn_rele(&fnp);
42620Sstevel@tonic-gate 		fnp = nextfnp;
42630Sstevel@tonic-gate 	} while (fnp != NULL);
42640Sstevel@tonic-gate 
42650Sstevel@tonic-gate 	return (path);
42660Sstevel@tonic-gate }
42670Sstevel@tonic-gate 
42680Sstevel@tonic-gate /*
42690Sstevel@tonic-gate  * Return a reference to the parent of the given fname, which the caller is
42700Sstevel@tonic-gate  * responsible for eventually releasing.
42710Sstevel@tonic-gate  */
42720Sstevel@tonic-gate 
42730Sstevel@tonic-gate nfs4_fname_t *
fn_parent(nfs4_fname_t * fnp)42740Sstevel@tonic-gate fn_parent(nfs4_fname_t *fnp)
42750Sstevel@tonic-gate {
42760Sstevel@tonic-gate 	nfs4_fname_t *parent;
42770Sstevel@tonic-gate 
42780Sstevel@tonic-gate 	mutex_enter(&fnp->fn_lock);
42790Sstevel@tonic-gate 	parent = fnp->fn_parent;
42800Sstevel@tonic-gate 	if (parent != NULL)
42810Sstevel@tonic-gate 		fn_hold(parent);
42820Sstevel@tonic-gate 	mutex_exit(&fnp->fn_lock);
42830Sstevel@tonic-gate 
42840Sstevel@tonic-gate 	return (parent);
42850Sstevel@tonic-gate }
42860Sstevel@tonic-gate 
42870Sstevel@tonic-gate /*
42880Sstevel@tonic-gate  * Update fnp so that its parent is newparent and its name is newname.
42890Sstevel@tonic-gate  */
42900Sstevel@tonic-gate 
42910Sstevel@tonic-gate void
fn_move(nfs4_fname_t * fnp,nfs4_fname_t * newparent,char * newname)42920Sstevel@tonic-gate fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
42930Sstevel@tonic-gate {
42940Sstevel@tonic-gate 	nfs4_fname_t *parent, *tmpfnp;
42950Sstevel@tonic-gate 	ssize_t newlen;
42960Sstevel@tonic-gate 	nfs4_fname_t key;
42970Sstevel@tonic-gate 	avl_index_t where;
42980Sstevel@tonic-gate 
42990Sstevel@tonic-gate 	/*
43000Sstevel@tonic-gate 	 * This assert exists to catch the client trying to rename
43010Sstevel@tonic-gate 	 * a dir to be a child of itself.  This happened at a recent
43020Sstevel@tonic-gate 	 * bakeoff against a 3rd party (broken) server which allowed
43030Sstevel@tonic-gate 	 * the rename to succeed.  If it trips it means that:
43040Sstevel@tonic-gate 	 *	a) the code in nfs4rename that detects this case is broken
43050Sstevel@tonic-gate 	 *	b) the server is broken (since it allowed the bogus rename)
43060Sstevel@tonic-gate 	 *
43070Sstevel@tonic-gate 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
43080Sstevel@tonic-gate 	 * panic below from:  mutex_enter(&newparent->fn_lock);
43090Sstevel@tonic-gate 	 */
43100Sstevel@tonic-gate 	ASSERT(fnp != newparent);
43110Sstevel@tonic-gate 
43120Sstevel@tonic-gate 	/*
43130Sstevel@tonic-gate 	 * Remove fnp from its current parent, change its name, then add it
431410238SPavel.Filipensky@Sun.COM 	 * to newparent. It might happen that fnp was replaced by another
431510238SPavel.Filipensky@Sun.COM 	 * nfs4_fname_t with the same fn_name in parent->fn_children.
431610238SPavel.Filipensky@Sun.COM 	 * In such case, fnp->fn_parent is NULL and we skip the removal
431710238SPavel.Filipensky@Sun.COM 	 * of fnp from its current parent.
43180Sstevel@tonic-gate 	 */
43190Sstevel@tonic-gate 	mutex_enter(&fnp->fn_lock);
43200Sstevel@tonic-gate 	parent = fnp->fn_parent;
432110238SPavel.Filipensky@Sun.COM 	if (parent != NULL) {
432210238SPavel.Filipensky@Sun.COM 		mutex_enter(&parent->fn_lock);
432310238SPavel.Filipensky@Sun.COM 		avl_remove(&parent->fn_children, fnp);
432410238SPavel.Filipensky@Sun.COM 		mutex_exit(&parent->fn_lock);
432510238SPavel.Filipensky@Sun.COM 		fn_rele(&fnp->fn_parent);
432610238SPavel.Filipensky@Sun.COM 	}
43270Sstevel@tonic-gate 
43280Sstevel@tonic-gate 	newlen = strlen(newname);
43290Sstevel@tonic-gate 	if (newlen != fnp->fn_len) {
43300Sstevel@tonic-gate 		ASSERT(newlen < MAXNAMELEN);
43310Sstevel@tonic-gate 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
43320Sstevel@tonic-gate 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
43330Sstevel@tonic-gate 		fnp->fn_len = newlen;
43340Sstevel@tonic-gate 	}
43350Sstevel@tonic-gate 	(void) strcpy(fnp->fn_name, newname);
43360Sstevel@tonic-gate 
43370Sstevel@tonic-gate again:
43380Sstevel@tonic-gate 	mutex_enter(&newparent->fn_lock);
43390Sstevel@tonic-gate 	key.fn_name = fnp->fn_name;
43400Sstevel@tonic-gate 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
43410Sstevel@tonic-gate 	if (tmpfnp != NULL) {
43420Sstevel@tonic-gate 		/*
43430Sstevel@tonic-gate 		 * This could be due to a file that was unlinked while
43440Sstevel@tonic-gate 		 * open, or perhaps the rnode is in the free list.  Remove
43450Sstevel@tonic-gate 		 * it from newparent and let it go away on its own.  The
43460Sstevel@tonic-gate 		 * contorted code is to deal with lock order issues and
43470Sstevel@tonic-gate 		 * race conditions.
43480Sstevel@tonic-gate 		 */
43490Sstevel@tonic-gate 		fn_hold(tmpfnp);
43500Sstevel@tonic-gate 		mutex_exit(&newparent->fn_lock);
43510Sstevel@tonic-gate 		mutex_enter(&tmpfnp->fn_lock);
43520Sstevel@tonic-gate 		if (tmpfnp->fn_parent == newparent) {
43530Sstevel@tonic-gate 			mutex_enter(&newparent->fn_lock);
43540Sstevel@tonic-gate 			avl_remove(&newparent->fn_children, tmpfnp);
43550Sstevel@tonic-gate 			mutex_exit(&newparent->fn_lock);
43560Sstevel@tonic-gate 			fn_rele(&tmpfnp->fn_parent);
43570Sstevel@tonic-gate 		}
43580Sstevel@tonic-gate 		mutex_exit(&tmpfnp->fn_lock);
43590Sstevel@tonic-gate 		fn_rele(&tmpfnp);
43600Sstevel@tonic-gate 		goto again;
43610Sstevel@tonic-gate 	}
43620Sstevel@tonic-gate 	fnp->fn_parent = newparent;
43630Sstevel@tonic-gate 	fn_hold(newparent);
43640Sstevel@tonic-gate 	avl_insert(&newparent->fn_children, fnp, where);
43650Sstevel@tonic-gate 	mutex_exit(&newparent->fn_lock);
43660Sstevel@tonic-gate 	mutex_exit(&fnp->fn_lock);
43670Sstevel@tonic-gate }
43680Sstevel@tonic-gate 
#ifdef DEBUG
/*
 * Sanity-check the vnode's type against the type recorded in its rnode's
 * cached attributes.  Always returns 1; a mismatch panics instead of
 * returning.  The check is made only when nfs4_vtype_debug is set, and it
 * is skipped when either type is still VNON.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!nfs4_vtype_debug)
		return (1);

	/* A type of VNON on either side cannot be compared. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4389