xref: /onnv-gate/usr/src/uts/common/vm/vpm.c (revision 1841:9d7ebafcda38)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>

/*
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;

#ifdef	SEGKPM_SUPPORT


int	vpm_cache_enable = 1;
long	vpm_cache_percent = 12;
long	vpm_cache_size;
int	vpm_nfreelist = 0;
int	vpmd_freemsk = 0;

#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)


#ifdef	DEBUG

struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also to provide LRU (the default) behaviour for file
 * pages. The page_lookup() operation tends to be expensive if a page
 * has to be reclaimed from the system page cache ("cachelist"). Once we
 * speed up the page_lookup()->page_reclaim() path, there should be no
 * need for this cache; the system page cache (cachelist) should
 * effectively serve the purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is
 * no hash table. The page_t has a reference to the vpmap_t when cached.
 * For a given vnode and offset, the page is found by means of a
 * page_lookup() operation. Any page which has a mapping (i.e., when
 * cached) will not be on the system 'cachelist'. Hence page_lookup()
 * will not have to do a page_reclaim(). That is how the cache serves to
 * speed up page_lookup() operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in
 * /etc/system (see the example below).
 */
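
/*
 * For example (illustrative only), a minimal /etc/system entry that
 * turns the cache off would be:
 *
 *	set vpm_cache_enable = 0
 */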

void
vpm_init()
{
	long  npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : number of freelist "
		"vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
				KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk through a large number of
		 * these data structures. We just use the smap's prefetch
		 * routine as it does the same. This should work fine
		 * for x64 (this needs to be modified when enabled on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start.
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}


/*
 * Unhooks the vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case	VPMCACHE_LRU:
	default:
			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
			break;
	}
	return (ndx);
}


/*
 * Find one vpmap structure from the free lists and use it for the newpage.
 * The previous page it cached is disassociated and released. The page_t's
 * p_vpmref is cleared only when the vpm it points to is locked (or, for
 * AMD64, when the page is exclusively locked in page_unload, because the
 * p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set when the page is locked (at least SHARED
 * locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * Wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
				&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case where we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
				vpm->vpm_vp == pp->p_vnode &&
				vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole it,
					 * so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
						vpm->vpm_vp != pp->p_vnode ||
						vpm->vpm_off != pp->p_offset ||
						pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
							hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Set up vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * Page already has a vpm assigned; just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen; clear the p_vpmref
				 * and retry.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue. Note that vpm_releq
	 * and vpm_allocq could toggle before we get the lock. This
	 * does not affect correctness as the 2 queues are only
	 * maintained to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping the releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping (vpm) interface routines.
 */

/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * The seg pointer passed in is just advisory. Just
			 * pass segkmap for now, as segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off + len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently the max len allowed is MAXBSIZE; therefore it will either
 * fetch/create one or two pages depending on the PAGESIZE.
 *
 * The segmap SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int  *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, let's restrict it to MAXBSIZE. XXX - We could allow
	 * len longer than MAXBSIZE, but there should be a limit
	 * which should be determined by how many pages the VOP_GETPAGE()
	 * can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);


	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++,
						pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do some
		 * (un)necessary tracking for sequential access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
							!= (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
			roundup(len, PAGESIZE), segkmap, base, rw, CRED());
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (FC_MAKE_ERR(error));
	}

	/*
	 * Get the vpms for the pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}
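
/*
 * Illustrative sketch (not part of the interfaces above): a typical
 * caller maps a range, accesses the returned kernel addresses, and then
 * releases the mappings with vpm_unmap_pages(). The fragment below is a
 * minimal example assuming a held vnode 'vp', a page-aligned offset
 * 'off' and a PAGESIZE'd kernel buffer 'buf'; error handling is elided.
 *
 *	vmap_t vml[MINVMAPS];
 *	int newpage;
 *
 *	if (vpm_map_pages(vp, off, PAGESIZE, 1, vml, MINVMAPS,
 *	    &newpage, S_READ) == 0) {
 *		bcopy(vml[0].vs_addr, buf, PAGESIZE);
 *		vpm_unmap_pages(vml, S_READ);
 *	}
 */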

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark the page as modified or referenced, because vpm
		 * accesses do not cause the faults where these bits would
		 * normally be set.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			page_unlock(pp);
			vpm = (struct vpmap *)((char *)vml[i].vs_data
					- offsetof(struct vpmap, vpm_pp));
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

/*
 * Given the vp, off and the uio structure, this routine will do the
 * copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeros the beginning of
 * the first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them, otherwise they will be created if
 * not already present in the page cache.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
		fetchpage, vml, MINVMAPS, &npages,  rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to the start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
				vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
				    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE  - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}
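
/*
 * Illustrative sketch (hypothetical caller, not code in this file): a
 * filesystem write path with vpm_enable set would typically replace its
 * segmap_getmapflt()/uiomove()/segmap_release() sequence with a single
 * vpm_data_copy() call. Assuming a vnode 'vp', a uio 'uiop' describing
 * the user buffer, 'n' bytes within one MAXBSIZE block and a caller
 * computed 'pagecreate' flag:
 *
 *	if (vpm_enable) {
 *		error = vpm_data_copy(vp, uiop->uio_loffset, n, uiop,
 *		    !pagecreate, NULL, 0, S_WRITE);
 *	}
 *
 * Here fetchpage is the negation of pagecreate, newpage is not needed
 * and zerostart is 0.
 */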

/*
 * Called to flush pages of the given vnode covering the
 * [off, off + len] range.
 */
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
	}

	return (error);
}
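
/*
 * Illustrative sketch (hypothetical caller): after copying data with
 * vpm_data_copy(), a filesystem can push the pages out with the same
 * SM_* flags it would have passed to segmap_release(), for example an
 * asynchronous write-back:
 *
 *	error = vpm_sync_pages(vp, off, n, SM_WRITE | SM_ASYNC);
 */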


#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */