/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>

/*
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;

#ifdef	SEGKPM_SUPPORT

int	vpm_cache_enable = 1;
long	vpm_cache_percent = 12;
long	vpm_cache_size;
int	vpm_nfreelist = 0;
int	vpmd_freemsk = 0;

#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t	vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
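
/*
 * Note that the ids handed out by VPMID() are 1-based: &vpmd_vpmap[0]
 * maps to id 1, and VPMP() subtracts the 1 back out. This leaves a
 * p_vpmref value of zero free to mean "no vpmap is associated with this
 * page", which is what the lookup paths below test for.
 */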

#ifdef	DEBUG

struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif	/* DEBUG */

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also to provide LRU (the default) behaviour for file
 * pages. The page_lookup() operation tends to be expensive if a page
 * has to be reclaimed from the system page cache ("cachelist"). Once the
 * page_lookup()->page_reclaim() path is made fast enough, there should
 * be no need for this cache; the system page cache (cachelist) would
 * then effectively serve the purpose of caching file pages by itself.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t, but unlike segmap there is
 * no hash table; the page_t holds a reference to the vpmap_t while it is
 * cached. For a given vnode and offset, the page is found by means of a
 * page_lookup() operation. Any page which has a mapping (i.e. when it is
 * cached) will not be on the system 'cachelist', so page_lookup() will
 * not have to do a page_reclaim(). That is how the cache serves to speed
 * up page_lookup() operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
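
/*
 * For example, the cache can be turned off, or its target size adjusted
 * from the default 12% of physical memory, with lines like the following
 * in /etc/system (the percentage shown is just an illustration):
 *
 *	set vpm_cache_enable = 0
 *	set vpm_cache_percent = 20
 */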

void
vpm_init()
{
	long npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}
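
	/*
	 * As a rough worked example (a sketch, not exact figures): with
	 * 4 KB pages and 4 GB of physical memory, physmem is about 1M
	 * pages, so the default 12% works out to roughly 125,000 pages
	 * (~490 MB) of cache, i.e. that many vpmap structs below.
	 */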

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpm_init: invalid vpm_nfreelist %d, "
		    "using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2.
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {
		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelists.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
	    KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {
		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);

	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk through a large number
		 * of these data structures. We just use the smap's
		 * prefetch routine as it does the same. This should work
		 * fine for x64 (this needs to be modified when enabled
		 * on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start.
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}

/*
 * Unhooks the vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {
	case VPMCACHE_LRU:
	default:
		vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
		break;
	}

	return (ndx);
}

/*
 * Find one vpmap structure from the free lists and use it for the newpage.
 * The previous page it cached is disassociated and released. The page_t's
 * p_vpmref is cleared only while the vpm it points to is locked (or, on
 * AMD64, when the page is exclusively locked in page_unload(), since there
 * the p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set while the page is locked (at least SHARED
 * locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * Get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * Wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case where we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {
			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one.
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole
					 * it, so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);
						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
						    hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Set up vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;
		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);

			/*
			 * The page already has a vpm assigned, so just use
			 * that. Grab the vpm mutex and verify that it is
			 * still the correct one. The pp->p_vpmref should
			 * not change once we have the vpm mutex and the
			 * page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen; clear the p_vpmref
				 * and retry.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue.
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == NULL) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping the releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex, as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen, as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm; remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}
7081841Spraks 
7091841Spraks /* END --- vpm cache ---- */
7101841Spraks 
7111841Spraks /*
7121841Spraks  * The vnode page mapping(vpm) interface routines.
7131841Spraks  */
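
/*
 * A typical caller (a sketch only; the exact flags and error handling
 * are up to the client) maps a range, copies through the returned
 * kernel addresses, and then releases the mappings:
 *
 *	vmap_t	vml[MINVMAPS];
 *	int	err, i;
 *
 *	err = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS, NULL, S_READ);
 *	for (i = 0; !err && vml[i].vs_addr != NULL; i++)
 *		... copy from vml[i].vs_addr, up to vml[i].vs_len bytes ...
 *	if (!err)
 *		vpm_unmap_pages(vml, S_READ);
 *
 * vpm_data_copy() below packages this sequence up for the common
 * uiomove() case.
 */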

/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;

		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * The seg pointer passed in is just advisory. Just
			 * pass segkmap for now like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("vpm_pagecreate: page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}

/*
 * Returns vpm mappings of pages in the range [off, off + len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently the max len allowed is MAXBSIZE, so depending on the
 * PAGESIZE it will fetch/create either one or two pages.
 *
 * The segmap SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;

	/*
	 * For now, let's restrict it to MAXBSIZE. XXX - We can allow
	 * len longer than MAXBSIZE, but there should be a limit,
	 * which should be determined by how many pages the VOP_GETPAGE()
	 * can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	/*
	 * Round up len to a multiple of PAGESIZE.
	 */
	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);

	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do
		 * the necessary tracking for sequential access.
		 */
		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF) !=
		    (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot,
			    &pplist[i], len, segkmap, base, rw, CRED(), NULL);
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for the pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark the page as modified or referenced, because accesses
		 * through vpm mappings do not cause the faults that would
		 * normally set these bits.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			page_unlock(pp);
			vpm = (struct vpmap *)((char *)vml[i].vs_data
			    - offsetof(struct vpmap, vpm_pp));
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

/*
 * Given the vp, off and the uio structure, this routine will do
 * the copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeros the beginning of
 * the first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them; otherwise they will be created
 * if not already present in the page cache.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;

	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
	    fetchpage, vml, MINVMAPS, &npages, rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to the start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr, (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
		    vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
			    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out the part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}
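
/*
 * For instance, a file system write path might replace its
 * segmap_getmapflt()/uiomove()/segmap_release() sequence with a single
 * call along these lines (a sketch; "pagecreate" here is a hypothetical
 * flag the caller computes, true when whole pages will be overwritten):
 *
 *	if (vpm_enable) {
 *		error = vpm_data_copy(vp, off, n, uio,
 *		    !pagecreate, NULL, 0, S_WRITE);
 *	}
 */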

/*
 * Called to flush pages of the given vnode covering
 * the [off, off + len] range.
 */
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
	}

	return (error);
}
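
/*
 * For example, to push any modified pages in a range asynchronously
 * (a sketch; the flag combinations follow the segmap SM_* conventions
 * handled above):
 *
 *	(void) vpm_sync_pages(vp, off, len, SM_ASYNC);
 */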

#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}

/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */